diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL')
96 files changed, 26220 insertions, 27817 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h index 0ed8aa8232..b296992525 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h @@ -12,33 +12,31 @@ class b3GpuBroadphaseInterface { public: - - typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q); + typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q); virtual ~b3GpuBroadphaseInterface() { } - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0; + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0; - virtual void calculateOverlappingPairs(int maxPairs)=0; - virtual void calculateOverlappingPairsHost(int maxPairs)=0; + virtual void calculateOverlappingPairs(int maxPairs) = 0; + virtual void calculateOverlappingPairsHost(int maxPairs) = 0; //call writeAabbsToGpu after done making all changes (createProxy etc) - virtual void writeAabbsToGpu()=0; + virtual void writeAabbsToGpu() = 0; + + virtual cl_mem getAabbBufferWS() = 0; + virtual int getNumOverlap() = 0; + virtual cl_mem getOverlappingPairBuffer() = 0; - virtual cl_mem getAabbBufferWS()=0; - virtual int getNumOverlap()=0; - virtual cl_mem getOverlappingPairBuffer()=0; + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0; + 
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0; - virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0; - virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0; - virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0; virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0; virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0; - }; -#endif //B3_GPU_BROADPHASE_INTERFACE_H +#endif //B3_GPU_BROADPHASE_INTERFACE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp index 74d0c8056c..e714fadac3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -5,12 +5,9 @@ #include "kernels/sapKernels.h" //#include "kernels/gridBroadphase.cl" - #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" - - #define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" #define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" @@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs; cl_kernel m_copyAabbsKernel; cl_kernel m_sap2Kernel; - - - - //int maxPairsPerBody = 64; -int maxBodiesPerCell = 256;//?? +int maxBodiesPerCell = 256; //?? 
-b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) -:m_context(ctx), -m_device(device), -m_queue(q), -m_allAabbsGPU1(ctx,q), -m_smallAabbsMappingGPU(ctx,q), -m_largeAabbsMappingGPU(ctx,q), -m_gpuPairs(ctx,q), +b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q) + : m_context(ctx), + m_device(device), + m_queue(q), + m_allAabbsGPU1(ctx, q), + m_smallAabbsMappingGPU(ctx, q), + m_largeAabbsMappingGPU(ctx, q), + m_gpuPairs(ctx, q), -m_hashGpu(ctx,q), + m_hashGpu(ctx, q), -m_cellStartGpu(ctx,q), -m_paramsGPU(ctx,q) + m_cellStartGpu(ctx, q), + m_paramsGPU(ctx, q) { - - - b3Vector3 gridSize = b3MakeVector3(3,3,3); - b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]); + b3Vector3 gridSize = b3MakeVector3(3, 3, 3); + b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]); m_paramsCPU.m_gridSize[0] = 128; m_paramsCPU.m_gridSize[1] = 128; @@ -58,92 +49,79 @@ m_paramsGPU(ctx,q) m_paramsCPU.m_invCellSize[3] = 0.f; m_paramsGPU.push_back(m_paramsCPU); - cl_int errNum=0; + cl_int errNum = 0; { const char* sapSrc = sapCL; - cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); - b3Assert(errNum==CL_SUCCESS); - m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); - m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); - b3Assert(errNum==CL_SUCCESS); + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); + b3Assert(errNum == CL_SUCCESS); + m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); + m_sap2Kernel = 
b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); } { - - cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH); - b3Assert(errNum==CL_SUCCESS); - - kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - - kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); - - - - - } + cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH); + b3Assert(errNum == CL_SUCCESS); - m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + + kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + + kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg); + b3Assert(errNum == CL_SUCCESS); + } + + m_sorter = new 
b3RadixSort32CL(m_context, m_device, m_queue); } b3GpuGridBroadphase::~b3GpuGridBroadphase() { - clReleaseKernel( kCalcHashAABB); - clReleaseKernel( kClearCellStart); - clReleaseKernel( kFindCellStart); - clReleaseKernel( kFindOverlappingPairs); - clReleaseKernel( m_sap2Kernel); - clReleaseKernel( m_copyAabbsKernel); - - - + clReleaseKernel(kCalcHashAABB); + clReleaseKernel(kClearCellStart); + clReleaseKernel(kFindCellStart); + clReleaseKernel(kFindOverlappingPairs); + clReleaseKernel(m_sap2Kernel); + clReleaseKernel(m_copyAabbsKernel); + delete m_sorter; } - - -void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; aabb.m_minIndices[3] = userPtr; - aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr; m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); m_allAabbsCPU1.push_back(aabb); - } -void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; aabb.m_minIndices[3] = userPtr; - aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; + aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr; m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size()); m_allAabbsCPU1.push_back(aabb); } -void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) +void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) { 
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs"); - if (0) { calculateOverlappingPairsHost(maxPairs); - /* + /* b3AlignedObjectArray<b3Int4> cpuPairs; m_gpuPairs.copyToHost(cpuPairs); printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size()); @@ -154,57 +132,50 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) */ return; } - - - - int numSmallAabbs = m_smallAabbsMappingGPU.size(); - b3OpenCLArray<int> pairCount(m_context,m_queue); + b3OpenCLArray<int> pairCount(m_context, m_queue); pairCount.push_back(0); - m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody); + m_gpuPairs.resize(maxPairs); //numSmallAabbs*maxPairsPerBody); { int numLargeAabbs = m_largeAabbsMappingGPU.size(); if (numLargeAabbs && numSmallAabbs) { B3_PROFILE("sap2Kernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ), - b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_gpuPairs.getBufferCL() ), + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()), + b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_gpuPairs.getBufferCL()), b3BufferInfoCL(pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numLargeAabbs ); - launcher.setConst( numSmallAabbs); - launcher.setConst( 0 );//axis is not used - launcher.setConst( maxPairs ); - //@todo: use actual maximum work item sizes of the device instead of hardcoded values - launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); - + b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numLargeAabbs); + launcher.setConst(numSmallAabbs); + launcher.setConst(0); //axis is not used + 
launcher.setConst(maxPairs); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); + int numPairs = pairCount.at(0); - - if (numPairs >maxPairs) + + if (numPairs > maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs =maxPairs; + numPairs = maxPairs; } } } - - - if (numSmallAabbs) { B3_PROFILE("gridKernel"); m_hashGpu.resize(numSmallAabbs); { B3_PROFILE("kCalcHashAABB"); - b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB"); + b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB"); launch.setConst(numSmallAabbs); launch.setBuffer(m_allAabbsGPU1.getBufferCL()); launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); @@ -214,117 +185,104 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) } m_sorter->execute(m_hashGpu); - - int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2]; + + int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2]; m_cellStartGpu.resize(numCells); //b3AlignedObjectArray<int > cellStartCpu; - - + { B3_PROFILE("kClearCellStart"); - b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart"); + b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart"); launch.setConst(numCells); launch.setBuffer(m_cellStartGpu.getBufferCL()); launch.launch1D(numCells); //m_cellStartGpu.copyToHost(cellStartCpu); //printf("??\n"); - } - { B3_PROFILE("kFindCellStart"); - b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart"); + b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart"); launch.setConst(numSmallAabbs); launch.setBuffer(m_hashGpu.getBufferCL()); launch.setBuffer(m_cellStartGpu.getBufferCL()); launch.launch1D(numSmallAabbs); //m_cellStartGpu.copyToHost(cellStartCpu); //printf("??\n"); - } - + { B3_PROFILE("kFindOverlappingPairs"); - - - 
b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs"); + + b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs"); launch.setConst(numSmallAabbs); launch.setBuffer(m_allAabbsGPU1.getBufferCL()); launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); launch.setBuffer(m_hashGpu.getBufferCL()); launch.setBuffer(m_cellStartGpu.getBufferCL()); - + launch.setBuffer(m_paramsGPU.getBufferCL()); //launch.setBuffer(0); launch.setBuffer(pairCount.getBufferCL()); launch.setBuffer(m_gpuPairs.getBufferCL()); - + launch.setConst(maxPairs); launch.launch1D(numSmallAabbs); - int numPairs = pairCount.at(0); - if (numPairs >maxPairs) + if (numPairs > maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs =maxPairs; + numPairs = maxPairs; } - + m_gpuPairs.resize(numPairs); - + if (0) { b3AlignedObjectArray<b3Int4> pairsCpu; m_gpuPairs.copyToHost(pairsCpu); int sz = m_gpuPairs.size(); - printf("m_gpuPairs.size()=%d\n",sz); - for (int i=0;i<m_gpuPairs.size();i++) + printf("m_gpuPairs.size()=%d\n", sz); + for (int i = 0; i < m_gpuPairs.size(); i++) { - printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y); + printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y); } printf("?!?\n"); } - } - - } - - - - //calculateOverlappingPairsHost(maxPairs); } -void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) +void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) { - m_hostPairs.resize(0); m_allAabbsGPU1.copyToHost(m_allAabbsCPU1); - for (int i=0;i<m_allAabbsCPU1.size();i++) + for (int i = 0; i < m_allAabbsCPU1.size(); i++) { - for (int j=i+1;j<m_allAabbsCPU1.size();j++) + for (int j = i + 1; j < m_allAabbsCPU1.size(); j++) { if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec, - m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec)) + m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec)) { b3Int4 pair; int a = 
m_allAabbsCPU1[j].m_minIndices[3]; int b = m_allAabbsCPU1[i].m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a; - pair.y = b;//store the original index in the unsorted aabb array - } else + pair.x = a; + pair.y = b; //store the original index in the unsorted aabb array + } + else { pair.x = b; - pair.y = a;//store the original index in the unsorted aabb array + pair.y = a; //store the original index in the unsorted aabb array } - - if (m_hostPairs.size()<maxPairs) + + if (m_hostPairs.size() < maxPairs) { m_hostPairs.push_back(pair); } @@ -332,40 +290,36 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) } } - m_gpuPairs.copyFromHost(m_hostPairs); - - } - //call writeAabbsToGpu after done making all changes (createProxy etc) +//call writeAabbsToGpu after done making all changes (createProxy etc) void b3GpuGridBroadphase::writeAabbsToGpu() { m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1); m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); - } -cl_mem b3GpuGridBroadphase::getAabbBufferWS() +cl_mem b3GpuGridBroadphase::getAabbBufferWS() { return this->m_allAabbsGPU1.getBufferCL(); } -int b3GpuGridBroadphase::getNumOverlap() +int b3GpuGridBroadphase::getNumOverlap() { return m_gpuPairs.size(); } -cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer() +cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer() { return m_gpuPairs.getBufferCL(); } -b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU() +b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU() { return m_allAabbsGPU1; } -b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU() +b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU() { return m_allAabbsCPU1; } @@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU() { return m_largeAabbsMappingGPU; } - diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h 
b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h index ec18c9f716..b76cb43b68 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -6,83 +6,75 @@ struct b3ParamsGridBroadphaseCL { - float m_invCellSize[4]; - int m_gridSize[4]; + int m_gridSize[4]; - int getMaxBodiesPerCell() const + int getMaxBodiesPerCell() const { return m_gridSize[3]; } - void setMaxBodiesPerCell(int maxOverlap) + void setMaxBodiesPerCell(int maxOverlap) { m_gridSize[3] = maxOverlap; } }; - class b3GpuGridBroadphase : public b3GpuBroadphaseInterface { protected: - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; - b3OpenCLArray<b3SapAabb> m_allAabbsGPU1; - b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1; + b3OpenCLArray<b3SapAabb> m_allAabbsGPU1; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1; - b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3OpenCLArray<int> m_smallAabbsMappingGPU; b3AlignedObjectArray<int> m_smallAabbsMappingCPU; - b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3OpenCLArray<int> m_largeAabbsMappingGPU; b3AlignedObjectArray<int> m_largeAabbsMappingCPU; b3AlignedObjectArray<b3Int4> m_hostPairs; - b3OpenCLArray<b3Int4> m_gpuPairs; + b3OpenCLArray<b3Int4> m_gpuPairs; - b3OpenCLArray<b3SortData> m_hashGpu; - b3OpenCLArray<int> m_cellStartGpu; - + b3OpenCLArray<b3SortData> m_hashGpu; + b3OpenCLArray<int> m_cellStartGpu; - b3ParamsGridBroadphaseCL m_paramsCPU; - b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU; + b3ParamsGridBroadphaseCL m_paramsCPU; + b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU; - class b3RadixSort32CL* m_sorter; + class b3RadixSort32CL* m_sorter; public: - - b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ); + b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q); 
virtual ~b3GpuGridBroadphase(); - static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuGridBroadphase(ctx,device,q); + return new b3GpuGridBroadphase(ctx, device, q); } - - + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); - - virtual void calculateOverlappingPairs(int maxPairs); - virtual void calculateOverlappingPairsHost(int maxPairs); + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); //call writeAabbsToGpu after done making all changes (createProxy etc) virtual void writeAabbsToGpu(); - virtual cl_mem getAabbBufferWS(); - virtual int getNumOverlap(); - virtual cl_mem getOverlappingPairBuffer(); + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU(); + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU(); - virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU(); - virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU(); - virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); - }; -#endif //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file +#endif //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp index 641df9eb12..0721928684 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -16,177 +16,174 @@ subject to the following restrictions: #include "b3GpuParallelLinearBvh.h" -b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : - m_queue(queue), - m_radixSorter(context, device, queue), - - m_rootNodeIndex(context, queue), - m_maxDistanceFromRoot(context, queue), - m_temp(context, queue), - - m_internalNodeAabbs(context, queue), - m_internalNodeLeafIndexRanges(context, queue), - m_internalNodeChildNodes(context, queue), - m_internalNodeParentNodes(context, queue), - - m_commonPrefixes(context, queue), - m_commonPrefixLengths(context, queue), - m_distanceFromRoot(context, queue), - - m_leafNodeParentNodes(context, queue), - m_mortonCodesAndAabbIndicies(context, queue), - m_mergedAabb(context, queue), - m_leafNodeAabbs(context, queue), - - m_largeAabbs(context, queue) +b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue), + m_radixSorter(context, device, queue), + + m_rootNodeIndex(context, queue), + m_maxDistanceFromRoot(context, queue), + m_temp(context, queue), + + m_internalNodeAabbs(context, queue), + m_internalNodeLeafIndexRanges(context, queue), + m_internalNodeChildNodes(context, queue), + m_internalNodeParentNodes(context, queue), + + m_commonPrefixes(context, queue), + m_commonPrefixLengths(context, queue), + m_distanceFromRoot(context, queue), + + m_leafNodeParentNodes(context, queue), + m_mortonCodesAndAabbIndicies(context, queue), + m_mergedAabb(context, queue), + m_leafNodeAabbs(context, queue), 
+ + m_largeAabbs(context, queue) { m_rootNodeIndex.resize(1); m_maxDistanceFromRoot.resize(1); m_temp.resize(1); - + // const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; - - const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h + + const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h cl_int error; char* additionalMacros = 0; m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); b3Assert(m_parallelLinearBvhProgram); - - m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_separateAabbsKernel); - m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_findAllNodesMergedAabbKernel); - m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); - - m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, 
"computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_computeAdjacentPairCommonPrefixKernel); - m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_buildBinaryRadixTreeLeafNodesKernel); - m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_buildBinaryRadixTreeInternalNodesKernel); - m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_findDistanceFromRootKernel); - m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_buildBinaryRadixTreeAabbsRecursiveKernel = 
b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel); - - m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_findLeafIndexRangesKernel); - - m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + + m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhCalculateOverlappingPairsKernel); - m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhRayTraverseKernel); - m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhLargeAabbAabbTestKernel); - m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, 
kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros); b3Assert(m_plbvhLargeAabbRayTestKernel); } -b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() +b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() { clReleaseKernel(m_separateAabbsKernel); clReleaseKernel(m_findAllNodesMergedAabbKernel); clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); - + clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel); clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel); clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel); clReleaseKernel(m_findDistanceFromRootKernel); clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel); - + clReleaseKernel(m_findLeafIndexRangesKernel); - + clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); clReleaseKernel(m_plbvhRayTraverseKernel); clReleaseKernel(m_plbvhLargeAabbAabbTestKernel); clReleaseKernel(m_plbvhLargeAabbRayTestKernel); - + clReleaseProgram(m_parallelLinearBvhProgram); } -void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, - const b3OpenCLArray<int>& largeAabbIndices) +void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices) { B3_PROFILE("b3ParallelLinearBvh::build()"); - + int numLargeAabbs = largeAabbIndices.size(); int numSmallAabbs = smallAabbIndices.size(); - - //Since all AABBs(both large and small) are input as a contiguous array, + + //Since all AABBs(both large and small) are input as a contiguous array, //with 2 additional arrays used to indicate the indices of large and small AABBs, //it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH. 
{ B3_PROFILE("Separate large and small AABBs"); - + m_largeAabbs.resize(numLargeAabbs); m_leafNodeAabbs.resize(numSmallAabbs); - + //Write large AABBs into m_largeAabbs { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( largeAabbIndices.getBufferCL() ), - - b3BufferInfoCL( m_largeAabbs.getBufferCL() ) - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(worldSpaceAabbs.getBufferCL()), + b3BufferInfoCL(largeAabbIndices.getBufferCL()), + + b3BufferInfoCL(m_largeAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLargeAabbs); - + launcher.launch1D(numLargeAabbs); } - + //Write small AABBs into m_leafNodeAabbs { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( smallAabbIndices.getBufferCL() ), - - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ) - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(worldSpaceAabbs.getBufferCL()), + b3BufferInfoCL(smallAabbIndices.getBufferCL()), + + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numSmallAabbs); - + launcher.launch1D(numSmallAabbs); } - + clFinish(m_queue); } - + // - int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs + int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs int numInternalNodes = numLeaves - 1; - - if(numLeaves < 2) + + if (numLeaves < 2) { //Number of leaf nodes is checked in 
calculateOverlappingPairs() and testRaysAgainstBvhAabbs(), //so it does not matter if numLeaves == 0 and rootNodeIndex == -1 int rootNodeIndex = numLeaves - 1; m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); - + //Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm, //m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index //instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work. //( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs ) - if(numLeaves == 1) + if (numLeaves == 1) { b3SortData leaf; - leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set - + leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set + m_mortonCodesAndAabbIndicies.resize(1); m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1); } - + return; } - + // { m_internalNodeAabbs.resize(numInternalNodes); @@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab m_commonPrefixes.resize(numInternalNodes); m_commonPrefixLengths.resize(numInternalNodes); m_distanceFromRoot.resize(numInternalNodes); - + m_leafNodeParentNodes.resize(numLeaves); m_mortonCodesAndAabbIndicies.resize(numLeaves); m_mergedAabb.resize(numLeaves); } - - //Find the merged AABB of all small AABBs; this is used to define the size of + + //Find the merged AABB of all small AABBs; this is used to define the size of //each cell in the virtual grid for the next kernel(2^10 cells in each dimension). 
{ B3_PROFILE("Find AABB of merged nodes"); - - m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array - - for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; - numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) + + m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array + + for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; + numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0] + }; + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numAabbsNeedingMerge); - + launcher.launch1D(numAabbsNeedingMerge); } - + clFinish(m_queue); } - + //Insert the center of the AABBs into a virtual grid, //then convert the discrete grid coordinates into a morton code //For each element in m_mortonCodesAndAabbIndicies, set @@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab // m_value == small AABB index { B3_PROFILE("Assign morton codes"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_mergedAabb.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_mergedAabb.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())}; + b3LauncherCL launcher(m_queue, 
m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLeaves); - + launcher.launch1D(numLeaves); clFinish(m_queue); } - + // { B3_PROFILE("Sort leaves by morton codes"); - + m_radixSorter.execute(m_mortonCodesAndAabbIndicies); clFinish(m_queue); } - + // constructBinaryRadixTree(); - - + //Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices. //The root node contains leaf node indices in the range [0, numLeafNodes - 1]. //The child nodes of each node split their parent's index range into 2 contiguous halves. @@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab //This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice { B3_PROFILE("m_findLeafIndexRangesKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } @@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou { int maxPairs = out_overlappingPairs.size(); b3OpenCLArray<int>& numPairsGpu = m_temp; - + int reset = 0; numPairsGpu.copyFromHostPointer(&reset, 1); - + // - if( 
m_leafNodeAabbs.size() > 1 ) + if (m_leafNodeAabbs.size() > 1) { B3_PROFILE("PLBVH small-small AABB test"); - + int numQueryAabbs = m_leafNodeAabbs.size(); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - - b3BufferInfoCL( numPairsGpu.getBufferCL() ), - b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + + b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + + b3BufferInfoCL(numPairsGpu.getBufferCL()), + b3BufferInfoCL(out_overlappingPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxPairs); launcher.setConst(numQueryAabbs); - + launcher.launch1D(numQueryAabbs); clFinish(m_queue); } - + int numLargeAabbRigids = m_largeAabbs.size(); - if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 ) + if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0) { B3_PROFILE("PLBVH large-small AABB test"); - + int numQueryAabbs = m_leafNodeAabbs.size(); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_largeAabbs.getBufferCL() ), - - b3BufferInfoCL( numPairsGpu.getBufferCL() ), - b3BufferInfoCL( 
out_overlappingPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_largeAabbs.getBufferCL()), + + b3BufferInfoCL(numPairsGpu.getBufferCL()), + b3BufferInfoCL(out_overlappingPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxPairs); launcher.setConst(numLargeAabbRigids); launcher.setConst(numQueryAabbs); - + launcher.launch1D(numQueryAabbs); clFinish(m_queue); } - - + // int numPairs = -1; numPairsGpu.copyToHostPointer(&numPairs, 1); - if(numPairs > maxPairs) + if (numPairs > maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); numPairs = maxPairs; numPairsGpu.copyFromHostPointer(&maxPairs, 1); } - + out_overlappingPairs.resize(numPairs); } - -void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, - b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs) +void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs) { B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); - + int numRays = rays.size(); int maxRayRigidPairs = out_rayRigidPairs.size(); - + int reset = 0; out_numRayRigidPairs.copyFromHostPointer(&reset, 1); - + // - if( m_leafNodeAabbs.size() > 0 ) + if (m_leafNodeAabbs.size() > 0) { B3_PROFILE("PLBVH ray test small AABB"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), - b3BufferInfoCL( 
m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - - b3BufferInfoCL( rays.getBufferCL() ), - - b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), - b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + + b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + + b3BufferInfoCL(rays.getBufferCL()), + + b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()), + b3BufferInfoCL(out_rayRigidPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxRayRigidPairs); launcher.setConst(numRays); - + launcher.launch1D(numRays); clFinish(m_queue); } - + int numLargeAabbRigids = m_largeAabbs.size(); - if(numLargeAabbRigids > 0) + if (numLargeAabbRigids > 0) { B3_PROFILE("PLBVH ray test large AABB"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_largeAabbs.getBufferCL() ), - b3BufferInfoCL( rays.getBufferCL() ), - - b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), - b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_largeAabbs.getBufferCL()), + b3BufferInfoCL(rays.getBufferCL()), + + b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()), + b3BufferInfoCL(out_rayRigidPairs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, 
sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLargeAabbRigids); launcher.setConst(maxRayRigidPairs); launcher.setConst(numRays); - + launcher.launch1D(numRays); clFinish(m_queue); } - + // int numRayRigidPairs = -1; out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); - - if(numRayRigidPairs > maxRayRigidPairs) + + if (numRayRigidPairs > maxRayRigidPairs) b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); - } void b3GpuParallelLinearBvh::constructBinaryRadixTree() { B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()"); - + int numLeaves = m_leafNodeAabbs.size(); int numInternalNodes = numLeaves - 1; - + //Each internal node is placed in between 2 leaf nodes. //By using this arrangement and computing the common prefix between //these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree. { B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + b3BufferInfoCL(m_commonPrefixes.getBufferCL()), + b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } - - //For each leaf node, select its parent node by + + //For each leaf node, select its parent node by //comparing the 2 nearest internal nodes and assign child node indices { 
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), - b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()), + b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLeaves); - + launcher.launch1D(numLeaves); clFinish(m_queue); } - + //For each internal node, perform 2 binary searches among the other internal nodes //to its left and right to find its potential parent nodes and assign child node indices { B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_commonPrefixes.getBufferCL()), + b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()), + b3BufferInfoCL(m_rootNodeIndex.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); 
launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } - + //Find the number of nodes seperating each internal node and the root node //so that the AABBs can be set using the next kernel. //Also determine the maximum number of nodes separating an internal node and the root node. { B3_PROFILE("m_findDistanceFromRootKernel"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ), - b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), + b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()), + b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()), + b3BufferInfoCL(m_distanceFromRoot.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numInternalNodes); - + launcher.launch1D(numInternalNodes); clFinish(m_queue); } - + //Starting from the internal nodes nearest to the leaf nodes, recursively move up //the tree towards the root to set the AABBs of each internal node; each internal node //checks its children and merges their AABBs { B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel"); - + int maxDistanceFromRoot = -1; { B3_PROFILE("copy maxDistanceFromRoot to CPU"); m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1); clFinish(m_queue); } - - for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) + + for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ), - b3BufferInfoCL( 
m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) - }; - + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_distanceFromRoot.getBufferCL()), + b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), + b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxDistanceFromRoot); launcher.setConst(distanceFromRoot); launcher.setConst(numInternalNodes); - + //It may seem inefficent to launch a thread for each internal node when a //much smaller number of nodes is actually processed, but this is actually - //faster than determining the exact nodes that are ready to merge their child AABBs. + //faster than determining the exact nodes that are ready to merge their child AABBs. launcher.launch1D(numInternalNodes); } - + clFinish(m_queue); } } - -
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h index effe617b7b..b390775129 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -37,10 +37,10 @@ subject to the following restrictions: ///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n ///@par ///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: -/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) +/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) /// - [fully parallel] Sort morton codes -/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) -/// - [somewhat parallel] Set internal node AABBs +/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Set internal node AABBs ///@par ///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages. ///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree. 
@@ -49,75 +49,75 @@ subject to the following restrictions: class b3GpuParallelLinearBvh { cl_command_queue m_queue; - + cl_program m_parallelLinearBvhProgram; - + cl_kernel m_separateAabbsKernel; cl_kernel m_findAllNodesMergedAabbKernel; cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; - + //Binary radix tree construction kernels cl_kernel m_computeAdjacentPairCommonPrefixKernel; cl_kernel m_buildBinaryRadixTreeLeafNodesKernel; cl_kernel m_buildBinaryRadixTreeInternalNodesKernel; cl_kernel m_findDistanceFromRootKernel; cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel; - + cl_kernel m_findLeafIndexRangesKernel; - + //Traversal kernels cl_kernel m_plbvhCalculateOverlappingPairsKernel; cl_kernel m_plbvhRayTraverseKernel; cl_kernel m_plbvhLargeAabbAabbTestKernel; cl_kernel m_plbvhLargeAabbRayTestKernel; - + b3RadixSort32CL m_radixSorter; - + //1 element - b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node - b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node - b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs() - + b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node + b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs() + //1 element per internal node (number_of_internal_nodes == number_of_leaves - 1) b3OpenCLArray<b3SapAabb> m_internalNodeAabbs; - b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index - b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node - b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal 
- + b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index + b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node + b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + //1 element per internal node; for binary radix tree construction b3OpenCLArray<b3Int64> m_commonPrefixes; b3OpenCLArray<int> m_commonPrefixLengths; - b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root - + b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root + //1 element per leaf node (leaf nodes only include small AABBs) - b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal - b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs - b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes - b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs - + b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs + b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes + b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs + //1 element per large AABB, which is not stored in the BVH b3OpenCLArray<b3SapAabb> m_largeAabbs; - + public: b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue); virtual ~b3GpuParallelLinearBvh(); - + ///Must be called before any other function - void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& 
smallAabbIndices, - const b3OpenCLArray<int>& largeAabbIndices); - + void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, + const b3OpenCLArray<int>& largeAabbIndices); + ///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs. ///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs. ///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized. void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs); - + ///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections; ///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough. ///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index. ///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded. 
- void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, - b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); - + void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, + b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); + private: void constructBinaryRadixTree(); }; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp index d2618024ac..62ea7a32df 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp @@ -13,45 +13,44 @@ subject to the following restrictions: #include "b3GpuParallelLinearBvhBroadphase.h" -b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : - m_plbvh(context, device, queue), - - m_overlappingPairsGpu(context, queue), - - m_aabbsGpu(context, queue), - m_smallAabbsMappingGpu(context, queue), - m_largeAabbsMappingGpu(context, queue) +b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue), + + m_overlappingPairsGpu(context, queue), + + m_aabbsGpu(context, queue), + m_smallAabbsMappingGpu(context, queue), + m_largeAabbsMappingGpu(context, queue) { } -void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int newAabbIndex = m_aabbsCpu.size(); b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; - + 
aabb.m_minIndices[3] = userPtr; aabb.m_signedMaxIndices[3] = newAabbIndex; - + m_smallAabbsMappingCpu.push_back(newAabbIndex); - + m_aabbsCpu.push_back(aabb); } -void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) +void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int newAabbIndex = m_aabbsCpu.size(); b3SapAabb aabb; aabb.m_minVec = aabbMin; aabb.m_maxVec = aabbMax; - + aabb.m_minIndices[3] = userPtr; aabb.m_signedMaxIndices[3] = newAabbIndex; - + m_largeAabbsMappingCpu.push_back(newAabbIndex); - + m_aabbsCpu.push_back(aabb); } @@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs) { //Reconstruct BVH m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu); - + // m_overlappingPairsGpu.resize(maxPairs); m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu); } void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs) { - b3Assert(0); //CPU version not implemented + b3Assert(0); //CPU version not implemented } -void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() -{ - m_aabbsGpu.copyFromHost(m_aabbsCpu); +void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() +{ + m_aabbsGpu.copyFromHost(m_aabbsCpu); m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu); m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu); } - - - diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h index e518500637..dda0eea7be 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ 
-21,42 +21,42 @@ subject to the following restrictions: class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface { b3GpuParallelLinearBvh m_plbvh; - + b3OpenCLArray<b3Int4> m_overlappingPairsGpu; - + b3OpenCLArray<b3SapAabb> m_aabbsGpu; b3OpenCLArray<int> m_smallAabbsMappingGpu; b3OpenCLArray<int> m_largeAabbsMappingGpu; - + b3AlignedObjectArray<b3SapAabb> m_aabbsCpu; b3AlignedObjectArray<int> m_smallAabbsMappingCpu; b3AlignedObjectArray<int> m_largeAabbsMappingCpu; - + public: b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue); virtual ~b3GpuParallelLinearBvhBroadphase() {} - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); - + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void calculateOverlappingPairs(int maxPairs); virtual void calculateOverlappingPairsHost(int maxPairs); //call writeAabbsToGpu after done making all changes (createProxy etc) virtual void writeAabbsToGpu(); - - virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } + + virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); } virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; } - + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; } virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return 
m_smallAabbsMappingGpu; } virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; } - + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; } - + static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) { return new b3GpuParallelLinearBvhBroadphase(context, device, queue); diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp index c45fbbdcaa..4126d03ed0 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -6,7 +6,6 @@ bool searchIncremental3dSapOnGpu = true; #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" - #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "kernels/sapKernels.h" @@ -56,110 +55,105 @@ bool searchIncremental3dSapOnGpu = true; class b3PrefixScanFloat4CL* m_prefixScanFloat4; */ -b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType) -:m_context(ctx), -m_device(device), -m_queue(q), - -m_objectMinMaxIndexGPUaxis0(ctx,q), -m_objectMinMaxIndexGPUaxis1(ctx,q), -m_objectMinMaxIndexGPUaxis2(ctx,q), -m_objectMinMaxIndexGPUaxis0prev(ctx,q), -m_objectMinMaxIndexGPUaxis1prev(ctx,q), -m_objectMinMaxIndexGPUaxis2prev(ctx,q), -m_sortedAxisGPU0(ctx,q), -m_sortedAxisGPU1(ctx,q), -m_sortedAxisGPU2(ctx,q), -m_sortedAxisGPU0prev(ctx,q), -m_sortedAxisGPU1prev(ctx,q), -m_sortedAxisGPU2prev(ctx,q), -m_addedHostPairsGPU(ctx,q), -m_removedHostPairsGPU(ctx,q), -m_addedCountGPU(ctx,q), -m_removedCountGPU(ctx,q), -m_currentBuffer(-1), -m_pairCount(ctx,q), -m_allAabbsGPU(ctx,q), -m_sum(ctx,q), -m_sum2(ctx,q), -m_dst(ctx,q), -m_smallAabbsMappingGPU(ctx,q), -m_largeAabbsMappingGPU(ctx,q), 
-m_overlappingPairs(ctx,q), -m_gpuSmallSortData(ctx,q), -m_gpuSmallSortedAabbs(ctx,q) +b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType) + : m_context(ctx), + m_device(device), + m_queue(q), + + m_objectMinMaxIndexGPUaxis0(ctx, q), + m_objectMinMaxIndexGPUaxis1(ctx, q), + m_objectMinMaxIndexGPUaxis2(ctx, q), + m_objectMinMaxIndexGPUaxis0prev(ctx, q), + m_objectMinMaxIndexGPUaxis1prev(ctx, q), + m_objectMinMaxIndexGPUaxis2prev(ctx, q), + m_sortedAxisGPU0(ctx, q), + m_sortedAxisGPU1(ctx, q), + m_sortedAxisGPU2(ctx, q), + m_sortedAxisGPU0prev(ctx, q), + m_sortedAxisGPU1prev(ctx, q), + m_sortedAxisGPU2prev(ctx, q), + m_addedHostPairsGPU(ctx, q), + m_removedHostPairsGPU(ctx, q), + m_addedCountGPU(ctx, q), + m_removedCountGPU(ctx, q), + m_currentBuffer(-1), + m_pairCount(ctx, q), + m_allAabbsGPU(ctx, q), + m_sum(ctx, q), + m_sum2(ctx, q), + m_dst(ctx, q), + m_smallAabbsMappingGPU(ctx, q), + m_largeAabbsMappingGPU(ctx, q), + m_overlappingPairs(ctx, q), + m_gpuSmallSortData(ctx, q), + m_gpuSmallSortedAabbs(ctx, q) { const char* sapSrc = sapCL; - - - cl_int errNum=0; + + cl_int errNum = 0; b3Assert(m_context); b3Assert(m_device); - cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); - b3Assert(errNum==CL_SUCCESS); - + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); + b3Assert(errNum == CL_SUCCESS); - b3Assert(errNum==CL_SUCCESS); + b3Assert(errNum == CL_SUCCESS); #ifndef __APPLE__ - m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue); + m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue); #else m_prefixScanFloat4 = 0; #endif m_sapKernel = 0; - + switch (kernelType) { case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU: { - m_sapKernel=0; + m_sapKernel = 0; break; } - case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: + case 
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_ORIGINAL: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_BARRIER: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); break; } default: { - m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); + m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory"); } }; - - - - m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); - b3Assert(errNum==CL_SUCCESS); - m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg ); - 
b3Assert(errNum==CL_SUCCESS); + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); - - m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); + m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg); + b3Assert(errNum == CL_SUCCESS); - m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "flipFloatKernel", &errNum, sapProg); - m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); + m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); - m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg); + + m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue); } b3GpuSapBroadphase::~b3GpuSapBroadphase() @@ -173,13 +167,11 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase() clReleaseKernel(m_sapKernel); clReleaseKernel(m_sap2Kernel); clReleaseKernel(m_prepareSumVarianceKernel); - - } /// conservative test for overlap between two aabbs -static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1, - const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2) +static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1, + const b3Vector3& aabbMin2, const b3Vector3& aabbMax2) { bool overlap = true; overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? 
false : overlap; @@ -188,8 +180,6 @@ static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aab return overlap; } - - //http://stereopsis.com/radix.html static unsigned int FloatFlip(float fl) { @@ -198,79 +188,77 @@ static unsigned int FloatFlip(float fl) return f ^ mask; }; -void b3GpuSapBroadphase::init3dSap() +void b3GpuSapBroadphase::init3dSap() { - if (m_currentBuffer<0) + if (m_currentBuffer < 0) { m_allAabbsGPU.copyToHost(m_allAabbsCPU); m_currentBuffer = 0; - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - for (int buf=0;buf<2;buf++) + for (int buf = 0; buf < 2; buf++) { int totalNumAabbs = m_allAabbsCPU.size(); - int numEndPoints = 2*totalNumAabbs; + int numEndPoints = 2 * totalNumAabbs; m_sortedAxisCPU[axis][buf].resize(numEndPoints); - if (buf==m_currentBuffer) + if (buf == m_currentBuffer) { - for (int i=0;i<totalNumAabbs;i++) + for (int i = 0; i < totalNumAabbs; i++) { - m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1; - m_sortedAxisCPU[axis][buf][i*2].m_value = i*2; - m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1; - m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1; + m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1; + m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2; + m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1; + m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1; } } } } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { //int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints); - for (int i=0;i<numEndPoints;i++) + for (int i = 0; i < numEndPoints; i++) { int 
destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; - int newDest = destIndex/2; - if (destIndex&1) + int newDest = destIndex / 2; + if (destIndex & 1) { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; - } else + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; + } + else { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i; } } } - } } - static bool b3PairCmp(const b3Int4& p, const b3Int4& q) { - return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y))); + return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y))); } - -static bool operator==(const b3Int4& a,const b3Int4& b) +static bool operator==(const b3Int4& a, const b3Int4& b) { return a.x == b.x && a.y == b.y; }; -static bool operator<(const b3Int4& a,const b3Int4& b) +static bool operator<(const b3Int4& a, const b3Int4& b) { return a.x < b.x || (a.x == b.x && a.y < b.y); }; -static bool operator>(const b3Int4& a,const b3Int4& b) +static bool operator>(const b3Int4& a, const b3Int4& b) { return a.x > b.x || (a.x == b.x && a.y > b.y); }; @@ -278,31 +266,29 @@ static bool operator>(const b3Int4& a,const b3Int4& b) b3AlignedObjectArray<b3Int4> addedHostPairs; b3AlignedObjectArray<b3Int4> removedHostPairs; -b3AlignedObjectArray<b3SapAabb> preAabbs; +b3AlignedObjectArray<b3SapAabb> preAabbs; -void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() +void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() { //static int framepje = 0; //printf("framepje=%d\n",framepje++); - B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap"); addedHostPairs.resize(0); removedHostPairs.resize(0); - b3Assert(m_currentBuffer>=0); - + b3Assert(m_currentBuffer >= 0); + { preAabbs.resize(m_allAabbsCPU.size()); - for (int i=0;i<preAabbs.size();i++) + for (int i = 0; i < preAabbs.size(); i++) { - preAabbs[i]=m_allAabbsCPU[i]; + preAabbs[i] = m_allAabbsCPU[i]; } } - - if (m_currentBuffer<0) + if 
(m_currentBuffer < 0) return; { B3_PROFILE("m_allAabbsGPU.copyToHost"); @@ -316,100 +302,87 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } if (0) { - { - printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", - m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2], - m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]); - } - - { - printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", - m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2], - m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]); - } - - - { - b3Int4 newPair; - newPair.x = 40; - newPair.y = 53; - int index = allPairs.findBinarySearch(newPair); - printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size()); - { - int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max); - printf("overlap=%d\n",overlap); + printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", + m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2], + m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]); } - if (preAabbs.size()) - { - int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max); - printf("prevoverlap=%d\n",prevOverlap); - } else { - printf("unknown prevoverlap\n"); + printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", + m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2], + m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]); } - } - } + { + b3Int4 newPair; + newPair.x = 40; + newPair.y = 53; + int index = allPairs.findBinarySearch(newPair); + printf("hasPair(40,53)=%d out of %d\n", index, 
allPairs.size()); + { + int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max); + printf("overlap=%d\n", overlap); + } + + if (preAabbs.size()) + { + int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max); + printf("prevoverlap=%d\n", prevOverlap); + } + else + { + printf("unknown prevoverlap\n"); + } + } + } if (0) { - for (int i=0;i<m_allAabbsCPU.size();i++) + for (int i = 0; i < m_allAabbsCPU.size(); i++) { //printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]); - - } - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - for (int buf=0;buf<2;buf++) + for (int buf = 0; buf < 2; buf++) { - b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2); + b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2); } } } - - - m_currentBuffer = 1-m_currentBuffer; - - + m_currentBuffer = 1 - m_currentBuffer; int totalNumAabbs = m_allAabbsCPU.size(); { B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)"); - for (int i=0;i<totalNumAabbs;i++) + for (int i = 0; i < totalNumAabbs; i++) { - - unsigned int keyMin[3]; unsigned int keyMax[3]; - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { - float vmin=m_allAabbsCPU[i].m_min[axis]; + float vmin = m_allAabbsCPU[i].m_min[axis]; float vmax = m_allAabbsCPU[i].m_max[axis]; keyMin[axis] = FloatFlip(vmin); keyMax[axis] = FloatFlip(vmax); - - m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1; - m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2; - m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1; 
- m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1; + + m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1; + m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1; } //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]); - } } - - { B3_PROFILE("sort m_sortedAxisCPU"); - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } @@ -432,21 +405,22 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() { B3_PROFILE("assign m_objectMinMaxIndexCPU"); - for (int axis=0;axis<3;axis++) + for (int axis = 0; axis < 3; axis++) { int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs); - for (int i=0;i<numEndPoints;i++) + for (int i = 0; i < numEndPoints; i++) { int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; - int newDest = destIndex/2; - if (destIndex&1) + int newDest = destIndex / 2; + if (destIndex & 1) { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; - } else + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; + } + else { - m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; + m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i; } } } @@ -485,12 +459,11 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } #endif - int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size(); int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size(); - b3Assert(a==b); - b3Assert(b==c); + b3Assert(a == b); + b3Assert(b == c); /* if (searchIncremental3dSapOnGpu) { @@ -574,175 +547,170 @@ 
void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size(); B3_PROFILE("actual search"); - for (int i=0;i<numObjects;i++) + for (int i = 0; i < numObjects; i++) { //int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size(); //int checkObjects[]={40,53}; //int numCheckObjects = sizeof(checkObjects)/sizeof(int); - + //for (int a=0;a<numCheckObjects ;a++) - - for (int axis=0;axis<3;axis++) + + for (int axis = 0; axis < 3; axis++) { //int i = checkObjects[a]; unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x; unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y; - unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x; + unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].x; int dmin = curMinIndex - prevMinIndex; - - unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y; - + unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].y; int dmax = curMaxIndex - prevMaxIndex; - if (dmin!=0) + if (dmin != 0) { //printf("for object %d, dmin=%d\n",i,dmin); } - if (dmax!=0) + if (dmax != 0) { //printf("for object %d, dmax=%d\n",i,dmax); } - for (int otherbuffer = 0;otherbuffer<2;otherbuffer++) + for (int otherbuffer = 0; otherbuffer < 2; otherbuffer++) { - if (dmin!=0) + if (dmin != 0) { - int stepMin = dmin<0 ? -1 : 1; - for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin) + int stepMin = dmin < 0 ? 
-1 : 1; + for (int j = prevMinIndex; j != curMinIndex; j += stepMin) { int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; - int otherIndex = otherIndex2/2; - if (otherIndex!=i) + int otherIndex = otherIndex2 / 2; + if (otherIndex != i) { - bool otherIsMax = ((otherIndex2&1)!=0); + bool otherIsMax = ((otherIndex2 & 1) != 0); if (otherIsMax) { //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); - + bool overlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) - overlap=false; + overlap = false; } - // b3Assert(overlap2==overlap); + // b3Assert(overlap2==overlap); bool prevOverlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { - if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || - (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) - prevOverlap=false; + if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) + prevOverlap = false; } - //b3Assert(overlap==overlap2); - - - if (dmin<0) + if (dmin < 0) { if (overlap && !prevOverlap) { //add a pair b3Int4 newPair; - if (i<=otherIndex) + if (i <= 
otherIndex) { newPair.x = i; newPair.y = otherIndex; - } else + } + else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); } - } + } else { if (!overlap && prevOverlap) { - //remove a pair b3Int4 removedPair; - if (i<=otherIndex) + if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; - } else + } + else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); } - }//otherisMax - }//if (dmin<0) - }//if (otherIndex!=i) - }//for (int j= + } //otherisMax + } //if (dmin<0) + } //if (otherIndex!=i) + } //for (int j= } - - if (dmax!=0) + + if (dmax != 0) { - int stepMax = dmax<0 ? -1 : 1; - for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax) + int stepMax = dmax < 0 ? -1 : 1; + for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax) { int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; - int otherIndex = otherIndex2/2; - if (otherIndex!=i) + int otherIndex = otherIndex2 / 2; + if (otherIndex != i) { //bool otherIsMin = ((otherIndex2&1)==0); //if (otherIsMin) { //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); - + bool overlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) - overlap=false; + overlap = false; } //b3Assert(overlap2==overlap); bool prevOverlap = true; - for (int ax=0;ax<3;ax++) + for (int ax = 0; ax < 3; ax++) { - if 
((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || - (m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) - prevOverlap=false; + if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || + (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) + prevOverlap = false; } - - if (dmax>0) + if (dmax > 0) { if (overlap && !prevOverlap) { //add a pair b3Int4 newPair; - if (i<=otherIndex) + if (i <= otherIndex) { newPair.x = i; newPair.y = otherIndex; - } else + } + else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); - } - } + } else { if (!overlap && prevOverlap) @@ -750,33 +718,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() //if (otherIndex2&1==0) -> min? //remove a pair b3Int4 removedPair; - if (i<=otherIndex) + if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; - } else + } + else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); - } } - - }//if (dmin<0) - }//if (otherIndex!=i) - }//for (int j= + + } //if (dmin<0) + } //if (otherIndex!=i) + } //for (int j= } - }//for (int otherbuffer - }//for (int axis=0; - }//for (int i=0;i<numObjects + } //for (int otherbuffer + } //for (int axis=0; + } //for (int i=0;i<numObjects } //remove duplicates and add/remove then to existing m_overlappingPairs - - - + { { B3_PROFILE("sort allPairs"); @@ -795,31 +761,28 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() b3Int4 prevPair; prevPair.x = -1; prevPair.y = -1; - + int uniqueRemovedPairs = 0; b3AlignedObjectArray<int> removedPositions; { B3_PROFILE("actual removing"); - for (int i=0;i<removedHostPairs.size();i++) + for (int i = 0; i < removedHostPairs.size(); i++) { b3Int4 
removedPair = removedHostPairs[i]; if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y)) { + int index1 = allPairs.findBinarySearch(removedPair); - int index1 = allPairs.findBinarySearch(removedPair); + //#ifdef _DEBUG - //#ifdef _DEBUG - - - int index2 = allPairs.findLinearSearch(removedPair); - b3Assert(index1==index2); - + b3Assert(index1 == index2); + //b3Assert(index1!=allPairs.size()); - if (index1<allPairs.size()) - //#endif//_DEBUG + if (index1 < allPairs.size()) + //#endif//_DEBUG { uniqueRemovedPairs++; removedPositions.push_back(index1); @@ -833,13 +796,13 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() if (uniqueRemovedPairs) { - for (int i=0;i<removedPositions.size();i++) + for (int i = 0; i < removedPositions.size(); i++) { - allPairs[removedPositions[i]].x = INT_MAX ; - allPairs[removedPositions[i]].y = INT_MAX ; + allPairs[removedPositions[i]].x = INT_MAX; + allPairs[removedPositions[i]].y = INT_MAX; } allPairs.quickSort(b3PairCmp); - allPairs.resize(allPairs.size()-uniqueRemovedPairs); + allPairs.resize(allPairs.size() - uniqueRemovedPairs); } } //if (uniqueRemovedPairs) @@ -848,33 +811,31 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() prevPair.x = -1; prevPair.y = -1; - - int uniqueAddedPairs=0; + + int uniqueAddedPairs = 0; b3AlignedObjectArray<b3Int4> actualAddedPairs; { B3_PROFILE("actual adding"); - for (int i=0;i<addedHostPairs.size();i++) + for (int i = 0; i < addedHostPairs.size(); i++) { b3Int4 newPair = addedHostPairs[i]; if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y)) { -//#ifdef _DEBUG + //#ifdef _DEBUG int index1 = allPairs.findBinarySearch(newPair); - - + int index2 = allPairs.findLinearSearch(newPair); - b3Assert(index1==index2); - + b3Assert(index1 == index2); - b3Assert(index1==allPairs.size()); - if (index1!=allPairs.size()) + b3Assert(index1 == allPairs.size()); + if (index1 != allPairs.size()) { printf("??\n"); } - if (index1==allPairs.size()) 
-//#endif //_DEBUG + if (index1 == allPairs.size()) + //#endif //_DEBUG { uniqueAddedPairs++; actualAddedPairs.push_back(newPair); @@ -882,94 +843,83 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() } prevPair = newPair; } - for (int i=0;i<actualAddedPairs.size();i++) + for (int i = 0; i < actualAddedPairs.size(); i++) { //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y); allPairs.push_back(actualAddedPairs[i]); } } - + //if (uniqueAddedPairs) // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs); - { B3_PROFILE("m_overlappingPairs.copyFromHost"); m_overlappingPairs.copyFromHost(allPairs); } - - } - - - -void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) { //test -// if (m_currentBuffer>=0) + // if (m_currentBuffer>=0) // return calculateOverlappingPairsHostIncremental3Sap(); b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); m_allAabbsGPU.copyToHost(m_allAabbsCPU); - - - int axis=0; + int axis = 0; { B3_PROFILE("CPU compute best variance axis"); - b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0); + b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0); int numRigidBodies = m_smallAabbsMappingCPU.size(); - for(int i=0;i<numRigidBodies;i++) + for (int i = 0; i < numRigidBodies; i++) { b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; - b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]); - b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]); - b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f; - + b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]); + b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]); + b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f; + s += centerAabb; - s2 += centerAabb*centerAabb; + s2 += centerAabb * centerAabb; } - b3Vector3 v = s2 - (s*s) / 
(float)numRigidBodies; - - if(v[1] > v[0]) + b3Vector3 v = s2 - (s * s) / (float)numRigidBodies; + + if (v[1] > v[0]) axis = 1; - if(v[2] > v[axis]) + if (v[2] > v[axis]) axis = 2; } - - - b3AlignedObjectArray<b3Int4> hostPairs; { int numSmallAabbs = m_smallAabbsMappingCPU.size(); - for (int i=0;i<numSmallAabbs;i++) + for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; - for (int j=i+1;j<numSmallAabbs;j++) + for (int j = i + 1; j < numSmallAabbs; j++) { - b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, - (b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max)) + (b3Vector3&)smallAabbj.m_min, (b3Vector3&)smallAabbj.m_max)) { b3Int4 pair; int a = smallAabbi.m_minIndices[3]; int b = smallAabbj.m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a;//store the original index in the unsorted aabb array + pair.x = a; //store the original index in the unsorted aabb array pair.y = b; - } else + } + else { - pair.x = b;//store the original index in the unsorted aabb array + pair.x = b; //store the original index in the unsorted aabb array pair.y = a; } hostPairs.push_back(pair); @@ -978,35 +928,35 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) } } - { int numSmallAabbs = m_smallAabbsMappingCPU.size(); - for (int i=0;i<numSmallAabbs;i++) + for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; int numLargeAabbs = m_largeAabbsMappingCPU.size(); - for (int j=0;j<numLargeAabbs;j++) + for (int j = 0; j < numLargeAabbs; j++) { b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, - (b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max)) + 
(b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max)) { b3Int4 pair; int a = largeAabbj.m_minIndices[3]; int b = smallAabbi.m_minIndices[3]; - if (a<=b) + if (a <= b) { - pair.x = a; - pair.y = b;//store the original index in the unsorted aabb array - } else + pair.x = a; + pair.y = b; //store the original index in the unsorted aabb array + } + else { pair.x = b; - pair.y = a;//store the original index in the unsorted aabb array + pair.y = a; //store the original index in the unsorted aabb array } - + hostPairs.push_back(pair); } } @@ -1021,21 +971,20 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) if (hostPairs.size()) { m_overlappingPairs.copyFromHost(hostPairs); - } else + } + else { m_overlappingPairs.resize(0); } //init3dSap(); - } -void b3GpuSapBroadphase::reset() +void b3GpuSapBroadphase::reset() { m_allAabbsGPU.resize(0); m_allAabbsCPU.resize(0); - m_smallAabbsMappingGPU.resize(0); m_smallAabbsMappingCPU.resize(0); @@ -1043,13 +992,11 @@ void b3GpuSapBroadphase::reset() m_largeAabbsMappingGPU.resize(0); m_largeAabbsMappingCPU.resize(0); - } - -void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) { - if (m_sapKernel==0) + if (m_sapKernel == 0) { calculateOverlappingPairsHost(maxPairs); return; @@ -1065,68 +1012,62 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) int axis = 0; { + //bool syncOnHost = false; - //bool syncOnHost = false; - - int numSmallAabbs = m_smallAabbsMappingCPU.size(); - if (m_prefixScanFloat4 && numSmallAabbs) - { - B3_PROFILE("GPU compute best variance axis"); - - if (m_dst.size()!=(numSmallAabbs+1)) + int numSmallAabbs = m_smallAabbsMappingCPU.size(); + if (m_prefixScanFloat4 && numSmallAabbs) { - m_dst.resize(numSmallAabbs+128); - m_sum.resize(numSmallAabbs+128); - m_sum2.resize(numSmallAabbs+128); - m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? - m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? 
- } + B3_PROFILE("GPU compute best variance axis"); - b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel"); - launcher.setBuffer(m_allAabbsGPU.getBufferCL()); - - launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); - launcher.setBuffer(m_sum.getBufferCL()); - launcher.setBuffer(m_sum2.getBufferCL()); - launcher.setConst( numSmallAabbs ); - int num = numSmallAabbs; - launcher.launch1D( num); - + if (m_dst.size() != (numSmallAabbs + 1)) + { + m_dst.resize(numSmallAabbs + 128); + m_sum.resize(numSmallAabbs + 128); + m_sum2.resize(numSmallAabbs + 128); + m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? + m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? + } - b3Vector3 s; - b3Vector3 s2; - m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s); - m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2); + b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel"); + launcher.setBuffer(m_allAabbsGPU.getBufferCL()); - b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; - - if(v[1] > v[0]) - axis = 1; - if(v[2] > v[axis]) - axis = 2; - } + launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); + launcher.setBuffer(m_sum.getBufferCL()); + launcher.setBuffer(m_sum2.getBufferCL()); + launcher.setConst(numSmallAabbs); + int num = numSmallAabbs; + launcher.launch1D(num); + b3Vector3 s; + b3Vector3 s2; + m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s); + m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2); + + b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs; + + if (v[1] > v[0]) + axis = 1; + if (v[2] > v[axis]) + axis = 2; + } - m_gpuSmallSortData.resize(numSmallAabbs); - #if 1 if (m_smallAabbsMappingGPU.size()) { - B3_PROFILE("flipFloatKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), - b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())}; - 
b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs ); - launcher.setConst( axis ); - + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + int num = numSmallAabbs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); } @@ -1141,69 +1082,66 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) if (numSmallAabbs) { B3_PROFILE("scatterKernel"); - - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), - b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true), + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), + b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true), b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel "); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs); + b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel "); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); int num = numSmallAabbs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); - } - - m_overlappingPairs.resize(maxPairs); + m_overlappingPairs.resize(maxPairs); - m_pairCount.resize(0); - m_pairCount.push_back(0); - int numPairs=0; + m_pairCount.resize(0); + m_pairCount.push_back(0); + int numPairs = 0; 
+ { + int numLargeAabbs = m_largeAabbsMappingGPU.size(); + if (numLargeAabbs && numSmallAabbs) { - int numLargeAabbs = m_largeAabbsMappingGPU.size(); - if (numLargeAabbs && numSmallAabbs) + //@todo + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_allAabbsGPU.getBufferCL()), + b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), + b3BufferInfoCL(m_overlappingPairs.getBufferCL()), + b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numLargeAabbs); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + launcher.setConst(maxPairs); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); + + numPairs = m_pairCount.at(0); + if (numPairs > maxPairs) { - //@todo - B3_PROFILE("sap2Kernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ), - b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ), - b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), - b3BufferInfoCL(m_pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numLargeAabbs ); - launcher.setConst( numSmallAabbs); - launcher.setConst( axis ); - launcher.setConst( maxPairs ); -//@todo: use actual maximum work item sizes of the device instead of hardcoded values - launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); - - numPairs = m_pairCount.at(0); - if (numPairs >maxPairs) - { - b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs =maxPairs; - } + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, 
maxPairs); + numPairs = maxPairs; } } - if (m_gpuSmallSortedAabbs.size()) - { - B3_PROFILE("sapKernel"); - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numSmallAabbs ); - launcher.setConst( axis ); - launcher.setConst( maxPairs ); - - - int num = numSmallAabbs; + } + if (m_gpuSmallSortedAabbs.size()) + { + B3_PROFILE("sapKernel"); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numSmallAabbs); + launcher.setConst(axis); + launcher.setConst(maxPairs); + + int num = numSmallAabbs; #if 0 int buffSize = launcher.getSerializationBufferSize(); unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; @@ -1225,73 +1163,71 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) FILE* f = fopen("m_sapKernelArgs.bin","wb"); fwrite(buf,buffSize+sizeof(int),1,f); fclose(f); -#endif// +#endif // - launcher.launch1D( num); - clFinish(m_queue); - - numPairs = m_pairCount.at(0); - if (numPairs>maxPairs) - { - b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs = maxPairs; - m_pairCount.resize(0); - m_pairCount.push_back(maxPairs); - } + launcher.launch1D(num); + clFinish(m_queue); + + numPairs = m_pairCount.at(0); + if (numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + m_pairCount.resize(0); + m_pairCount.push_back(maxPairs); } - + } + #else - int numPairs = 0; - - - 
b3LauncherCL launcher(m_queue, m_sapKernel); - - const char* fileName = "m_sapKernelArgs.bin"; - FILE* f = fopen(fileName,"rb"); - if (f) - { - int sizeInBytes=0; - if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) - { - printf("error, cannot get file size\n"); - exit(0); - } - - unsigned char* buf = (unsigned char*) malloc(sizeInBytes); - fread(buf,sizeInBytes,1,f); - int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); - int num = *(int*)&buf[serializedBytes]; - launcher.launch1D( num); - - b3OpenCLArray<int> pairCount(m_context, m_queue); - int numElements = launcher.m_arrays[2]->size()/sizeof(int); - pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); - numPairs = pairCount.at(0); - //printf("overlapping pairs = %d\n",numPairs); - b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; - b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue); - tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); - - tmpGpuPairs.copyToHost(hostOoverlappingPairs); - m_overlappingPairs.copyFromHost(hostOoverlappingPairs); - //printf("hello %d\n", m_overlappingPairs.size()); - free(buf); - fclose(f); - - } else { - printf("error: cannot find file %s\n",fileName); - } - - clFinish(m_queue); - - + int numPairs = 0; + + b3LauncherCL launcher(m_queue, m_sapKernel); + + const char* fileName = "m_sapKernelArgs.bin"; + FILE* f = fopen(fileName, "rb"); + if (f) + { + int sizeInBytes = 0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*)malloc(sizeInBytes); + fread(buf, sizeInBytes, 1, f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context); + int num = *(int*)&buf[serializedBytes]; + launcher.launch1D(num); + + b3OpenCLArray<int> pairCount(m_context, m_queue); + int numElements = launcher.m_arrays[2]->size() / 
sizeof(int); + pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements); + numPairs = pairCount.at(0); + //printf("overlapping pairs = %d\n",numPairs); + b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; + b3OpenCLArray<b3Int4> tmpGpuPairs(m_context, m_queue); + tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs); + + tmpGpuPairs.copyToHost(hostOoverlappingPairs); + m_overlappingPairs.copyFromHost(hostOoverlappingPairs); + //printf("hello %d\n", m_overlappingPairs.size()); + free(buf); + fclose(f); + } + else + { + printf("error: cannot find file %s\n", fileName); + } + + clFinish(m_queue); + #endif - - m_overlappingPairs.resize(numPairs); - - }//B3_PROFILE("GPU_RADIX SORT"); - //init3dSap(); + m_overlappingPairs.resize(numPairs); + + } //B3_PROFILE("GPU_RADIX SORT"); + //init3dSap(); } void b3GpuSapBroadphase::writeAabbsToGpu() @@ -1299,17 +1235,14 @@ void b3GpuSapBroadphase::writeAabbsToGpu() m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); - m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this - - - + m_allAabbsGPU.copyFromHost(m_allAabbsCPU); //might not be necessary, the 'setupGpuAabbsFull' already takes care of this } -void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int index = userPtr; b3SapAabb aabb; - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; @@ -1317,15 +1250,15 @@ void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vec aabb.m_minIndices[3] = index; aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); 
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); - + m_allAabbsCPU.push_back(aabb); } -void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int index = userPtr; b3SapAabb aabb; - for (int i=0;i<4;i++) + for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; @@ -1334,20 +1267,19 @@ void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); - m_allAabbsCPU.push_back(aabb); } -cl_mem b3GpuSapBroadphase::getAabbBufferWS() +cl_mem b3GpuSapBroadphase::getAabbBufferWS() { return m_allAabbsGPU.getBufferCL(); } -int b3GpuSapBroadphase::getNumOverlap() +int b3GpuSapBroadphase::getNumOverlap() { return m_overlappingPairs.size(); } -cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() +cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() { return m_overlappingPairs.getBufferCL(); } diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h index 8d36ac78f2..d17590b14a 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h @@ -2,7 +2,7 @@ #define B3_GPU_SAP_BROADPHASE_H #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" -#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 class b3Vector3; #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" @@ -11,141 +11,133 @@ class b3Vector3; #include "b3GpuBroadphaseInterface.h" - class b3GpuSapBroadphase : public 
b3GpuBroadphaseInterface { - - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; - cl_kernel m_flipFloatKernel; - cl_kernel m_scatterKernel ; - cl_kernel m_copyAabbsKernel; - cl_kernel m_sapKernel; - cl_kernel m_sap2Kernel; - cl_kernel m_prepareSumVarianceKernel; - + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + cl_kernel m_flipFloatKernel; + cl_kernel m_scatterKernel; + cl_kernel m_copyAabbsKernel; + cl_kernel m_sapKernel; + cl_kernel m_sap2Kernel; + cl_kernel m_prepareSumVarianceKernel; class b3RadixSort32CL* m_sorter; ///test for 3d SAP - b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2]; - b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2]; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev; - b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev; - - b3OpenCLArray<b3SortData> m_sortedAxisGPU0; - b3OpenCLArray<b3SortData> m_sortedAxisGPU1; - b3OpenCLArray<b3SortData> m_sortedAxisGPU2; - b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev; - b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev; - b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev; - - - b3OpenCLArray<b3Int4> m_addedHostPairsGPU; - b3OpenCLArray<b3Int4> m_removedHostPairsGPU; - b3OpenCLArray<int> m_addedCountGPU; - b3OpenCLArray<int> m_removedCountGPU; - - int m_currentBuffer; + b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2]; + b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2]; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev; + 
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev; + b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev; + + b3OpenCLArray<b3SortData> m_sortedAxisGPU0; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2; + b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev; + b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev; + + b3OpenCLArray<b3Int4> m_addedHostPairsGPU; + b3OpenCLArray<b3Int4> m_removedHostPairsGPU; + b3OpenCLArray<int> m_addedCountGPU; + b3OpenCLArray<int> m_removedCountGPU; + + int m_currentBuffer; public: - b3OpenCLArray<int> m_pairCount; + b3OpenCLArray<b3SapAabb> m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; - b3OpenCLArray<b3SapAabb> m_allAabbsGPU; - b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; - - virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() + virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_allAabbsGPU; } - virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() + virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_allAabbsCPU; } - b3OpenCLArray<b3Vector3> m_sum; - b3OpenCLArray<b3Vector3> m_sum2; - b3OpenCLArray<b3Vector3> m_dst; + b3OpenCLArray<b3Vector3> m_sum; + b3OpenCLArray<b3Vector3> m_sum2; + b3OpenCLArray<b3Vector3> m_dst; - b3OpenCLArray<int> m_smallAabbsMappingGPU; + b3OpenCLArray<int> m_smallAabbsMappingGPU; b3AlignedObjectArray<int> m_smallAabbsMappingCPU; - b3OpenCLArray<int> m_largeAabbsMappingGPU; + b3OpenCLArray<int> m_largeAabbsMappingGPU; b3AlignedObjectArray<int> m_largeAabbsMappingCPU; - - b3OpenCLArray<b3Int4> m_overlappingPairs; + b3OpenCLArray<b3Int4> m_overlappingPairs; //temporary gpu work memory - b3OpenCLArray<b3SortData> m_gpuSmallSortData; - b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; + b3OpenCLArray<b3SortData> m_gpuSmallSortData; + b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; - class b3PrefixScanFloat4CL* m_prefixScanFloat4; + class b3PrefixScanFloat4CL* 
m_prefixScanFloat4; enum b3GpuSapKernelType { - B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1, + B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU, B3_GPU_SAP_KERNEL_ORIGINAL, B3_GPU_SAP_KERNEL_BARRIER, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY }; - b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); virtual ~b3GpuSapBroadphase(); - - static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q) + + static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU); } - static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU); } - static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL); } - static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new 
b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER); } - static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q) + static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q) { - return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); + return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); } - - virtual void calculateOverlappingPairs(int maxPairs); - virtual void calculateOverlappingPairsHost(int maxPairs); - - void reset(); + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + void reset(); void init3dSap(); virtual void calculateOverlappingPairsHostIncremental3Sap(); - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); //call writeAabbsToGpu after done making all changes (createProxy etc) virtual void writeAabbsToGpu(); - virtual cl_mem getAabbBufferWS(); - virtual int getNumOverlap(); - virtual cl_mem getOverlappingPairBuffer(); - + virtual cl_mem getAabbBufferWS(); + virtual int getNumOverlap(); + virtual cl_mem getOverlappingPairBuffer(); + virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU(); virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU(); virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); }; -#endif 
//B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file +#endif //B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h index ea6550fede..60570f2605 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h @@ -5,10 +5,9 @@ #include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" ///just make sure that the b3Aabb is 16-byte aligned -B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb -{ +B3_ATTRIBUTE_ALIGNED16(struct) +b3SapAabb : public b3Aabb{ -}; + }; - -#endif //B3_SAP_AABB_H +#endif //B3_SAP_AABB_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h index dad42477c3..0185417786 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h @@ -1,199 +1,198 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* gridBroadphaseCL= \ -"int getPosHash(int4 gridPos, __global float4* pParams)\n" -"{\n" -" int4 gridDim = *((__global int4*)(pParams + 1));\n" -" gridPos.x &= gridDim.x - 1;\n" -" gridPos.y &= gridDim.y - 1;\n" -" gridPos.z &= gridDim.z - 1;\n" -" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" -" return hash;\n" -"} \n" -"int4 getGridPos(float4 worldPos, __global float4* pParams)\n" -"{\n" -" int4 gridPos;\n" -" int4 gridDim = *((__global int4*)(pParams + 1));\n" -" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" -" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" -" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" -" return gridPos;\n" -"}\n" -"// calculate grid hash value 
for each body using its AABB\n" -"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" -"{\n" -" int index = get_global_id(0);\n" -" if(index >= numObjects)\n" -" {\n" -" return;\n" -" }\n" -" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" -" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" -" float4 pos;\n" -" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" -" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" -" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" -" pos.w = 0.f;\n" -" // get address in grid\n" -" int4 gridPos = getGridPos(pos, pParams);\n" -" int gridHash = getPosHash(gridPos, pParams);\n" -" // store grid hash and body index\n" -" int2 hashVal;\n" -" hashVal.x = gridHash;\n" -" hashVal.y = index;\n" -" pHash[index] = hashVal;\n" -"}\n" -"__kernel void kClearCellStart( int numCells, \n" -" __global int* pCellStart )\n" -"{\n" -" int index = get_global_id(0);\n" -" if(index >= numCells)\n" -" {\n" -" return;\n" -" }\n" -" pCellStart[index] = -1;\n" -"}\n" -"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" -"{\n" -" __local int sharedHash[513];\n" -" int index = get_global_id(0);\n" -" int2 sortedData;\n" -" if(index < numObjects)\n" -" {\n" -" sortedData = pHash[index];\n" -" // Load hash data into shared memory so that we can look \n" -" // at neighboring body's hash value without loading\n" -" // two hash values per thread\n" -" sharedHash[get_local_id(0) + 1] = sortedData.x;\n" -" if((index > 0) && (get_local_id(0) == 0))\n" -" {\n" -" // first thread in block must load neighbor body hash\n" -" sharedHash[0] = pHash[index-1].x;\n" -" }\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" if(index < numObjects)\n" -" {\n" -" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" -" {\n" -" cellStart[sortedData.x] = index;\n" -" }\n" -" }\n" -"}\n" -"int testAABBOverlap(float4 min0, float4 max0, 
float4 min1, float4 max1)\n" -"{\n" -" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n" -" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" -" (min0.z <= max1.z)&& (min1.z <= max0.z); \n" -"}\n" -"//search for AABB 'index' against other AABBs' in this cell\n" -"void findPairsInCell( int numObjects,\n" -" int4 gridPos,\n" -" int index,\n" -" __global int2* pHash,\n" -" __global int* pCellStart,\n" -" __global float4* allpAABB, \n" -" __global const int* smallAabbMapping,\n" -" __global float4* pParams,\n" -" volatile __global int* pairCount,\n" -" __global int4* pPairBuff2,\n" -" int maxPairs\n" -" )\n" -"{\n" -" int4 pGridDim = *((__global int4*)(pParams + 1));\n" -" int maxBodiesPerCell = pGridDim.w;\n" -" int gridHash = getPosHash(gridPos, pParams);\n" -" // get start of bucket for this cell\n" -" int bucketStart = pCellStart[gridHash];\n" -" if (bucketStart == -1)\n" -" {\n" -" return; // cell empty\n" -" }\n" -" // iterate over bodies in this cell\n" -" int2 sortedData = pHash[index];\n" -" int unsorted_indx = sortedData.y;\n" -" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" -" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" -" int handleIndex = as_int(min0.w);\n" -" \n" -" int bucketEnd = bucketStart + maxBodiesPerCell;\n" -" bucketEnd = (bucketEnd > numObjects) ? 
numObjects : bucketEnd;\n" -" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" -" {\n" -" int2 cellData = pHash[index2];\n" -" if (cellData.x != gridHash)\n" -" {\n" -" break; // no longer in same bucket\n" -" }\n" -" int unsorted_indx2 = cellData.y;\n" -" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" -" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" -" { \n" -" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" -" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" -" if(testAABBOverlap(min0, max0, min1, max1))\n" -" {\n" -" if (pairCount)\n" -" {\n" -" int handleIndex2 = as_int(min1.w);\n" -" if (handleIndex<handleIndex2)\n" -" {\n" -" int curPair = atomic_add(pairCount,1);\n" -" if (curPair<maxPairs)\n" -" {\n" -" int4 newpair;\n" -" newpair.x = handleIndex;\n" -" newpair.y = handleIndex2;\n" -" newpair.z = -1;\n" -" newpair.w = -1;\n" -" pPairBuff2[curPair] = newpair;\n" -" }\n" -" }\n" -" \n" -" }\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void kFindOverlappingPairs( int numObjects,\n" -" __global float4* allpAABB, \n" -" __global const int* smallAabbMapping,\n" -" __global int2* pHash, \n" -" __global int* pCellStart, \n" -" __global float4* pParams ,\n" -" volatile __global int* pairCount,\n" -" __global int4* pPairBuff2,\n" -" int maxPairs\n" -" )\n" -"{\n" -" int index = get_global_id(0);\n" -" if(index >= numObjects)\n" -" {\n" -" return;\n" -" }\n" -" int2 sortedData = pHash[index];\n" -" int unsorted_indx = sortedData.y;\n" -" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" -" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" -" float4 pos;\n" -" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" -" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" -" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" -" // get address in grid\n" -" int4 gridPosA = getGridPos(pos, pParams);\n" -" int4 gridPosB; \n" -" // examine only neighbouring cells\n" -" for(int 
z=-1; z<=1; z++) \n" -" {\n" -" gridPosB.z = gridPosA.z + z;\n" -" for(int y=-1; y<=1; y++) \n" -" {\n" -" gridPosB.y = gridPosA.y + y;\n" -" for(int x=-1; x<=1; x++) \n" -" {\n" -" gridPosB.x = gridPosA.x + x;\n" -" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" -" }\n" -" }\n" -" }\n" -"}\n" -; +static const char* gridBroadphaseCL = + "int getPosHash(int4 gridPos, __global float4* pParams)\n" + "{\n" + " int4 gridDim = *((__global int4*)(pParams + 1));\n" + " gridPos.x &= gridDim.x - 1;\n" + " gridPos.y &= gridDim.y - 1;\n" + " gridPos.z &= gridDim.z - 1;\n" + " int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" + " return hash;\n" + "} \n" + "int4 getGridPos(float4 worldPos, __global float4* pParams)\n" + "{\n" + " int4 gridPos;\n" + " int4 gridDim = *((__global int4*)(pParams + 1));\n" + " gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" + " gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" + " gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" + " return gridPos;\n" + "}\n" + "// calculate grid hash value for each body using its AABB\n" + "__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" + "{\n" + " int index = get_global_id(0);\n" + " if(index >= numObjects)\n" + " {\n" + " return;\n" + " }\n" + " float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" + " float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" + " float4 pos;\n" + " pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" + " pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" + " pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" + " pos.w = 0.f;\n" + " // get address in grid\n" + " int4 gridPos = getGridPos(pos, pParams);\n" + " int gridHash = getPosHash(gridPos, pParams);\n" + " // store grid hash and body index\n" + " int2 
hashVal;\n" + " hashVal.x = gridHash;\n" + " hashVal.y = index;\n" + " pHash[index] = hashVal;\n" + "}\n" + "__kernel void kClearCellStart( int numCells, \n" + " __global int* pCellStart )\n" + "{\n" + " int index = get_global_id(0);\n" + " if(index >= numCells)\n" + " {\n" + " return;\n" + " }\n" + " pCellStart[index] = -1;\n" + "}\n" + "__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" + "{\n" + " __local int sharedHash[513];\n" + " int index = get_global_id(0);\n" + " int2 sortedData;\n" + " if(index < numObjects)\n" + " {\n" + " sortedData = pHash[index];\n" + " // Load hash data into shared memory so that we can look \n" + " // at neighboring body's hash value without loading\n" + " // two hash values per thread\n" + " sharedHash[get_local_id(0) + 1] = sortedData.x;\n" + " if((index > 0) && (get_local_id(0) == 0))\n" + " {\n" + " // first thread in block must load neighbor body hash\n" + " sharedHash[0] = pHash[index-1].x;\n" + " }\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " if(index < numObjects)\n" + " {\n" + " if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" + " {\n" + " cellStart[sortedData.x] = index;\n" + " }\n" + " }\n" + "}\n" + "int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" + "{\n" + " return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n" + " (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" + " (min0.z <= max1.z)&& (min1.z <= max0.z); \n" + "}\n" + "//search for AABB 'index' against other AABBs' in this cell\n" + "void findPairsInCell( int numObjects,\n" + " int4 gridPos,\n" + " int index,\n" + " __global int2* pHash,\n" + " __global int* pCellStart,\n" + " __global float4* allpAABB, \n" + " __global const int* smallAabbMapping,\n" + " __global float4* pParams,\n" + " volatile __global int* pairCount,\n" + " __global int4* pPairBuff2,\n" + " int maxPairs\n" + " )\n" + "{\n" + " int4 pGridDim = *((__global int4*)(pParams + 1));\n" + " int 
maxBodiesPerCell = pGridDim.w;\n" + " int gridHash = getPosHash(gridPos, pParams);\n" + " // get start of bucket for this cell\n" + " int bucketStart = pCellStart[gridHash];\n" + " if (bucketStart == -1)\n" + " {\n" + " return; // cell empty\n" + " }\n" + " // iterate over bodies in this cell\n" + " int2 sortedData = pHash[index];\n" + " int unsorted_indx = sortedData.y;\n" + " float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" + " float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" + " int handleIndex = as_int(min0.w);\n" + " \n" + " int bucketEnd = bucketStart + maxBodiesPerCell;\n" + " bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" + " for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" + " {\n" + " int2 cellData = pHash[index2];\n" + " if (cellData.x != gridHash)\n" + " {\n" + " break; // no longer in same bucket\n" + " }\n" + " int unsorted_indx2 = cellData.y;\n" + " //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" + " if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" + " { \n" + " float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" + " float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" + " if(testAABBOverlap(min0, max0, min1, max1))\n" + " {\n" + " if (pairCount)\n" + " {\n" + " int handleIndex2 = as_int(min1.w);\n" + " if (handleIndex<handleIndex2)\n" + " {\n" + " int curPair = atomic_add(pairCount,1);\n" + " if (curPair<maxPairs)\n" + " {\n" + " int4 newpair;\n" + " newpair.x = handleIndex;\n" + " newpair.y = handleIndex2;\n" + " newpair.z = -1;\n" + " newpair.w = -1;\n" + " pPairBuff2[curPair] = newpair;\n" + " }\n" + " }\n" + " \n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void kFindOverlappingPairs( int numObjects,\n" + " __global float4* allpAABB, \n" + " __global const int* smallAabbMapping,\n" + " __global int2* pHash, \n" + " __global int* pCellStart, \n" + " __global float4* pParams ,\n" + " 
volatile __global int* pairCount,\n" + " __global int4* pPairBuff2,\n" + " int maxPairs\n" + " )\n" + "{\n" + " int index = get_global_id(0);\n" + " if(index >= numObjects)\n" + " {\n" + " return;\n" + " }\n" + " int2 sortedData = pHash[index];\n" + " int unsorted_indx = sortedData.y;\n" + " float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" + " float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" + " float4 pos;\n" + " pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" + " pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" + " pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" + " // get address in grid\n" + " int4 gridPosA = getGridPos(pos, pParams);\n" + " int4 gridPosB; \n" + " // examine only neighbouring cells\n" + " for(int z=-1; z<=1; z++) \n" + " {\n" + " gridPosB.z = gridPosA.z + z;\n" + " for(int y=-1; y<=1; y++) \n" + " {\n" + " gridPosB.y = gridPosA.y + y;\n" + " for(int x=-1; x<=1; x++) \n" + " {\n" + " gridPosB.x = gridPosA.x + x;\n" + " findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" + " }\n" + " }\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h index 5eb8f45b16..c02877dde9 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -1,729 +1,728 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* parallelLinearBvhCL= \ -"/*\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose,\n" -"including 
commercial applications, and to alter it and redistribute it freely,\n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Initial Author Jackson Lee, 2014\n" -"typedef float b3Scalar;\n" -"typedef float4 b3Vector3;\n" -"#define b3Max max\n" -"#define b3Min min\n" -"#define b3Sqrt sqrt\n" -"typedef struct\n" -"{\n" -" unsigned int m_key;\n" -" unsigned int m_value;\n" -"} SortDataCL;\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} b3AabbCL;\n" -"unsigned int interleaveBits(unsigned int x)\n" -"{\n" -" //........ ........ ......12 3456789A //x\n" -" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" -" \n" -" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" -" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" -" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" -" \n" -" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" -" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" -" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" -" \n" -" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" -" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" -" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" -" \n" -" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" -" //....1..1 ..1..1.. 1..1..1. 
.1..1..1 //0x 09 24 92 49\n" -" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" -" \n" -" //........ ........ ......11 11111111 //0x000003FF\n" -" x &= 0x000003FF; //Clear all bits above bit 10\n" -" \n" -" x = (x ^ (x << 16)) & 0xFF0000FF;\n" -" x = (x ^ (x << 8)) & 0x0300F00F;\n" -" x = (x ^ (x << 4)) & 0x030C30C3;\n" -" x = (x ^ (x << 2)) & 0x09249249;\n" -" \n" -" return x;\n" -"}\n" -"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" -"{\n" -" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" -"}\n" -"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" -"{\n" -" int separatedAabbIndex = get_global_id(0);\n" -" if(separatedAabbIndex >= numAabbsToSeparate) return;\n" -" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" -" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" -"}\n" -"//Should replace with an optimized parallel reduction\n" -"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" -"{\n" -" //Each time this kernel is added to the command queue, \n" -" //the number of AABBs needing to be merged is halved\n" -" //\n" -" //Example with 159 AABBs:\n" -" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" -" // numMergedAabbs == 159 - 80 == 79\n" -" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" -" \n" -" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" -" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" -" \n" -" int aabbIndex = get_global_id(0);\n" -" if(aabbIndex >= numMergedAabbs) return;\n" -" \n" -" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" -" \n" -" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" -" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" -" \n" -" b3AabbCL mergedAabb;\n" -" mergedAabb.m_min = 
b3Min(aabb.m_min, otherAabb.m_min);\n" -" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" -" out_mergedAabb[aabbIndex] = mergedAabb;\n" -"}\n" -"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" -" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" -"{\n" -" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" -" if(leafNodeIndex >= numAabbs) return;\n" -" \n" -" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" -" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" -" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" -" \n" -" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" -" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" -" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" -" \n" -" //Quantize into integer coordinates\n" -" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" -" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" -" \n" -" int4 discretePosition;\n" -" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" -" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" -" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? 
gridPosition.z : floor(gridPosition.z) );\n" -" \n" -" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" -" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" -" discretePosition += 512;\n" -" \n" -" //Interleave bits(assign a morton code, also known as a z-curve)\n" -" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" -" \n" -" //\n" -" SortDataCL mortonCodeIndexPair;\n" -" mortonCodeIndexPair.m_key = mortonCode;\n" -" mortonCodeIndexPair.m_value = leafNodeIndex;\n" -" \n" -" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" -"}\n" -"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" -"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" -"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" -"//In both cases, the bit should be cleared to access the actual node index.\n" -"int isLeafNode(int index) { return (index >> 31 == 0); }\n" -"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" -"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" -"//From sap.cl\n" -"#define NEW_PAIR_MARKER -1\n" -"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? 
false : overlap;\n" -" return overlap;\n" -"}\n" -"//From sap.cl\n" -"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" -" __global int* rootNodeIndex, \n" -" __global int2* internalNodeChildIndices, \n" -" __global b3AabbCL* internalNodeAabbs,\n" -" __global int2* internalNodeLeafIndexRanges,\n" -" \n" -" __global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" -" int maxPairs, int numQueryAabbs)\n" -"{\n" -" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" -" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" -" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" -" if(queryBvhNodeIndex >= numQueryAabbs) return;\n" -" \n" -" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" -" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" -" \n" -" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" -" \n" -" int stackSize = 1;\n" -" stack[0] = *rootNodeIndex;\n" -" \n" -" while(stackSize)\n" -" {\n" -" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" -" --stackSize;\n" -" \n" -" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" -" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" -" \n" -" //Optimization - if the BVH is structured as a binary radix tree, then\n" -" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" -" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" -" {\n" -" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" -" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" -" }\n" -" \n" -" //bvhRigidIndex is not used if internal node\n" -" int bvhRigidIndex = (isLeaf) ? 
mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" -" \n" -" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" -" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" -" {\n" -" if(isLeaf)\n" -" {\n" -" int4 pair;\n" -" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" -" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" -" pair.z = NEW_PAIR_MARKER;\n" -" pair.w = NEW_PAIR_MARKER;\n" -" \n" -" int pairIndex = atomic_inc(out_numPairs);\n" -" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" -" }\n" -" \n" -" if(!isLeaf) //Internal node\n" -" {\n" -" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" -" {\n" -" //Error\n" -" }\n" -" else\n" -" {\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" -" }\n" -" }\n" -" }\n" -" \n" -" }\n" -"}\n" -"//From rayCastKernels.cl\n" -"typedef struct\n" -"{\n" -" float4 m_from;\n" -" float4 m_to;\n" -"} b3RayInfo;\n" -"//From rayCastKernels.cl\n" -"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" -"{\n" -" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" -" return normalize(normal); //OpenCL normalize == vector4 normalize\n" -"}\n" -"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" -"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" -"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" -"{\n" -" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" -" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" -" //\n" -" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" -" //and min.x will be the far plane; otherwise, it is reversed.\n" -" //\n" -" //In order for there to be a 
collision, the t_min and t_max of each pair must overlap.\n" -" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" -" \n" -" int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n" -" \n" -" //When using vector types, the select() function checks the most signficant bit, \n" -" //but isless() sets the least significant bit.\n" -" isNegative <<= 31;\n" -" //select(b, a, condition) == condition ? a : b\n" -" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" -" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" -" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" -" \n" -" b3Scalar t_min_final = 0.0f;\n" -" b3Scalar t_max_final = rayLength;\n" -" \n" -" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" -" //Behavior of min()/max() with NaNs is undefined. 
(See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" -" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" -" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" -" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" -" \n" -" return (t_min_final <= t_max_final);\n" -"}\n" -"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" -" __global int* rootNodeIndex, \n" -" __global int2* internalNodeChildIndices, \n" -" __global b3AabbCL* internalNodeAabbs,\n" -" __global int2* internalNodeLeafIndexRanges,\n" -" __global SortDataCL* mortonCodesAndAabbIndices,\n" -" \n" -" __global b3RayInfo* rays,\n" -" \n" -" __global int* out_numRayRigidPairs, \n" -" __global int2* out_rayRigidPairs,\n" -" int maxRayRigidPairs, int numRays)\n" -"{\n" -" int rayIndex = get_global_id(0);\n" -" if(rayIndex >= numRays) return;\n" -" \n" -" //\n" -" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" -" b3Vector3 rayTo = rays[rayIndex].m_to;\n" -" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" -" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" -" \n" -" //\n" -" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" -" \n" -" int stackSize = 1;\n" -" stack[0] = *rootNodeIndex;\n" -" \n" -" while(stackSize)\n" -" {\n" -" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" -" --stackSize;\n" -" \n" -" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" -" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" -" \n" -" //bvhRigidIndex is not used if internal node\n" -" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" -" \n" -" b3AabbCL bvhNodeAabb = (isLeaf) ? 
rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" -" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n" -" {\n" -" if(isLeaf)\n" -" {\n" -" int2 rayRigidPair;\n" -" rayRigidPair.x = rayIndex;\n" -" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" -" \n" -" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" -" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" -" }\n" -" \n" -" if(!isLeaf) //Internal node\n" -" {\n" -" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" -" {\n" -" //Error\n" -" }\n" -" else\n" -" {\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" -" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" -" }\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" -" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" -" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" -"{\n" -" int smallAabbIndex = get_global_id(0);\n" -" if(smallAabbIndex >= numSmallAabbRigids) return;\n" -" \n" -" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" -" for(int i = 0; i < numLargeAabbRigids; ++i)\n" -" {\n" -" b3AabbCL largeAabb = largeAabbs[i];\n" -" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" -" {\n" -" int4 pair;\n" -" pair.x = largeAabb.m_minIndices[3];\n" -" pair.y = smallAabb.m_minIndices[3];\n" -" pair.z = NEW_PAIR_MARKER;\n" -" pair.w = NEW_PAIR_MARKER;\n" -" \n" -" int pairIndex = atomic_inc(out_numPairs);\n" -" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" -" }\n" -" }\n" -"}\n" -"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" -" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n" -" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" -"{\n" -" int rayIndex = get_global_id(0);\n" -" if(rayIndex 
>= numRays) return;\n" -" \n" -" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" -" b3Vector3 rayTo = rays[rayIndex].m_to;\n" -" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" -" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" -" \n" -" for(int i = 0; i < numLargeAabbRigids; ++i)\n" -" {\n" -" b3AabbCL rigidAabb = largeRigidAabbs[i];\n" -" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" -" {\n" -" int2 rayRigidPair;\n" -" rayRigidPair.x = rayIndex;\n" -" rayRigidPair.y = rigidAabb.m_minIndices[3];\n" -" \n" -" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" -" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" -" }\n" -" }\n" -"}\n" -"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" -"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" -"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" -"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" -"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" -"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" -"#define b3Int64 long\n" -"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" -"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" -"{\n" -" //This function only needs to return (i & j) in order for the algorithm to work,\n" -" //but it may help with debugging to mask out the lower bits.\n" -" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" -" b3Int64 sharedBits = i & j;\n" -" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" -" \n" -" return sharedBits & bitmask;\n" -"}\n" -"//Same as computeCommonPrefixLength(), but allows 
for prefixes with different lengths\n" -"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" -"{\n" -" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" -"}\n" -"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global b3Int64* out_commonPrefixes,\n" -" __global int* out_commonPrefixLengths,\n" -" int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if (internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" -" //and the number of internal nodes is always numLeafNodes - 1\n" -" int leftLeafIndex = internalNodeIndex;\n" -" int rightLeafIndex = internalNodeIndex + 1;\n" -" \n" -" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" -" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" -" \n" -" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" -" //Append the index of each leaf node to each morton code so that there are no duplicates.\n" -" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" -" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" -" //\n" -" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" -" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" -" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" -" \n" -" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" -" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" -"}\n" -"__kernel void buildBinaryRadixTreeLeafNodes(__global int* 
commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" -" __global int2* out_childNodes, int numLeafNodes)\n" -"{\n" -" int leafNodeIndex = get_global_id(0);\n" -" if (leafNodeIndex >= numLeafNodes) return;\n" -" \n" -" int numInternalNodes = numLeafNodes - 1;\n" -" \n" -" int leftSplitIndex = leafNodeIndex - 1;\n" -" int rightSplitIndex = leafNodeIndex;\n" -" \n" -" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" \n" -" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" -" //Leaf nodes are considered as having the highest common prefix\n" -" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" -" \n" -" //Handle cases for the edge nodes; the first and last node\n" -" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" -" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" -" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" -" \n" -" int parentNodeIndex = (isLeftHigherCommonPrefix) ? 
leftSplitIndex : rightSplitIndex;\n" -" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" -" \n" -" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" -" \n" -" //out_childNodesAsInt[0] == int2.x == left child\n" -" //out_childNodesAsInt[1] == int2.y == right child\n" -" int isLeaf = 1;\n" -" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" -" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" -"}\n" -"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" -" __global int2* out_childNodes,\n" -" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" -" int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" -" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" -" \n" -"//#define USE_LINEAR_SEARCH\n" -"#ifdef USE_LINEAR_SEARCH\n" -" int leftIndex = -1;\n" -" int rightIndex = -1;\n" -" \n" -" //Find nearest element to left with a lower common prefix\n" -" for(int i = internalNodeIndex - 1; i >= 0; --i)\n" -" {\n" -" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" -" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" -" {\n" -" leftIndex = i;\n" -" break;\n" -" }\n" -" }\n" -" \n" -" //Find nearest element to right with a lower common prefix\n" -" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" -" {\n" -" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" -" if(nodeRightSharedPrefixLength < nodePrefixLength)\n" -" {\n" -" 
rightIndex = i;\n" -" break;\n" -" }\n" -" }\n" -" \n" -"#else //Use binary search\n" -" //Find nearest element to left with a lower common prefix\n" -" int leftIndex = -1;\n" -" {\n" -" int lower = 0;\n" -" int upper = internalNodeIndex - 1;\n" -" \n" -" while(lower <= upper)\n" -" {\n" -" int mid = (lower + upper) / 2;\n" -" b3Int64 midPrefix = commonPrefixes[mid];\n" -" int midPrefixLength = commonPrefixLengths[mid];\n" -" \n" -" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" -" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" int right = mid + 1;\n" -" if(right < internalNodeIndex)\n" -" {\n" -" b3Int64 rightPrefix = commonPrefixes[right];\n" -" int rightPrefixLength = commonPrefixLengths[right];\n" -" \n" -" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" -" if(nodeRightSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" lower = right;\n" -" leftIndex = right;\n" -" }\n" -" else \n" -" {\n" -" leftIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else \n" -" {\n" -" leftIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else upper = mid - 1;\n" -" }\n" -" }\n" -" \n" -" //Find nearest element to right with a lower common prefix\n" -" int rightIndex = -1;\n" -" {\n" -" int lower = internalNodeIndex + 1;\n" -" int upper = numInternalNodes - 1;\n" -" \n" -" while(lower <= upper)\n" -" {\n" -" int mid = (lower + upper) / 2;\n" -" b3Int64 midPrefix = commonPrefixes[mid];\n" -" int midPrefixLength = commonPrefixLengths[mid];\n" -" \n" -" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" -" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" int left = mid - 1;\n" -" if(left > internalNodeIndex)\n" -" {\n" -" b3Int64 leftPrefix = commonPrefixes[left];\n" -" int leftPrefixLength = commonPrefixLengths[left];\n" -" \n" -" int 
nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" -" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" -" {\n" -" upper = left;\n" -" rightIndex = left;\n" -" }\n" -" else \n" -" {\n" -" rightIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else \n" -" {\n" -" rightIndex = mid;\n" -" break;\n" -" }\n" -" }\n" -" else lower = mid + 1;\n" -" }\n" -" }\n" -"#endif\n" -" \n" -" //Select parent\n" -" {\n" -" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" \n" -" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" -" \n" -" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" -" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" -" \n" -" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" -" \n" -" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" -" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? 
parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" -" \n" -" int isLeaf = 0;\n" -" if(!isRootNode)\n" -" {\n" -" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" -" \n" -" //out_childNodesAsInt[0] == int2.x == left child\n" -" //out_childNodesAsInt[1] == int2.y == right child\n" -" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" -" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -" }\n" -" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -" }\n" -"}\n" -"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" -" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" -"{\n" -" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" -" int internalNodeIndex = get_global_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" //\n" -" int distanceFromRoot = 0;\n" -" {\n" -" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" -" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" -" {\n" -" parentIndex = internalNodeParentNodes[parentIndex];\n" -" ++distanceFromRoot;\n" -" }\n" -" }\n" -" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" -" \n" -" //\n" -" __local int localMaxDistanceFromRoot;\n" -" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" -"}\n" -"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global int2* childNodes,\n" -" __global b3AabbCL* 
leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" -" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" int distance = distanceFromRoot[internalNodeIndex];\n" -" \n" -" if(distance == processedDistance)\n" -" {\n" -" int leftChildIndex = childNodes[internalNodeIndex].x;\n" -" int rightChildIndex = childNodes[internalNodeIndex].y;\n" -" \n" -" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" -" int isRightChildLeaf = isLeafNode(rightChildIndex);\n" -" \n" -" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" -" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" -" \n" -" //leftRigidIndex/rightRigidIndex is not used if internal node\n" -" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" -" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" -" \n" -" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" -" b3AabbCL rightChildAabb = (isRightChildLeaf) ? 
leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" -" \n" -" b3AabbCL mergedAabb;\n" -" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" -" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" -" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" -" }\n" -"}\n" -"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" int numLeafNodes = numInternalNodes + 1;\n" -" \n" -" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" -" \n" -" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n" -" \n" -" //Find lowest leaf index covered by this internal node\n" -" {\n" -" int lowestIndex = childNodes.x; //childNodes.x == Left child\n" -" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" -" leafIndexRange.x = lowestIndex;\n" -" }\n" -" \n" -" //Find highest leaf index covered by this internal node\n" -" {\n" -" int highestIndex = childNodes.y; //childNodes.y == Right child\n" -" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" -" leafIndexRange.y = highestIndex;\n" -" }\n" -" \n" -" //\n" -" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" -"}\n" -; +static const char* parallelLinearBvhCL = + "/*\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose,\n" + "including commercial applications, and to alter it and redistribute it freely,\n" + "subject to the following restrictions:\n" + "1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Initial Author Jackson Lee, 2014\n" + "typedef float b3Scalar;\n" + "typedef float4 b3Vector3;\n" + "#define b3Max max\n" + "#define b3Min min\n" + "#define b3Sqrt sqrt\n" + "typedef struct\n" + "{\n" + " unsigned int m_key;\n" + " unsigned int m_value;\n" + "} SortDataCL;\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " };\n" + "} b3AabbCL;\n" + "unsigned int interleaveBits(unsigned int x)\n" + "{\n" + " //........ ........ ......12 3456789A //x\n" + " //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" + " \n" + " //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" + " //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" + " //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" + " \n" + " //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" + " //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" + " //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" + " \n" + " //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" + " //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" + " //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" + " \n" + " //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" + " //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n" + " //....1..2 ..3..4.. 5..6..7. 
.8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" + " \n" + " //........ ........ ......11 11111111 //0x000003FF\n" + " x &= 0x000003FF; //Clear all bits above bit 10\n" + " \n" + " x = (x ^ (x << 16)) & 0xFF0000FF;\n" + " x = (x ^ (x << 8)) & 0x0300F00F;\n" + " x = (x ^ (x << 4)) & 0x030C30C3;\n" + " x = (x ^ (x << 2)) & 0x09249249;\n" + " \n" + " return x;\n" + "}\n" + "unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" + "{\n" + " return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" + "}\n" + "__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" + "{\n" + " int separatedAabbIndex = get_global_id(0);\n" + " if(separatedAabbIndex >= numAabbsToSeparate) return;\n" + " int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" + " out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" + "}\n" + "//Should replace with an optimized parallel reduction\n" + "__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" + "{\n" + " //Each time this kernel is added to the command queue, \n" + " //the number of AABBs needing to be merged is halved\n" + " //\n" + " //Example with 159 AABBs:\n" + " // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" + " // numMergedAabbs == 159 - 80 == 79\n" + " //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" + " \n" + " int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" + " int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" + " \n" + " int aabbIndex = get_global_id(0);\n" + " if(aabbIndex >= numMergedAabbs) return;\n" + " \n" + " int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" + " \n" + " b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" + " b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" + " \n" + " b3AabbCL mergedAabb;\n" + " mergedAabb.m_min = b3Min(aabb.m_min, 
otherAabb.m_min);\n" + " mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" + " out_mergedAabb[aabbIndex] = mergedAabb;\n" + "}\n" + "__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" + " __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" + "{\n" + " int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" + " if(leafNodeIndex >= numAabbs) return;\n" + " \n" + " b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" + " b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" + " b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" + " \n" + " b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" + " b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" + " b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" + " \n" + " //Quantize into integer coordinates\n" + " //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" + " b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" + " \n" + " int4 discretePosition;\n" + " discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" + " discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" + " discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? 
gridPosition.z : floor(gridPosition.z) );\n" + " \n" + " //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" + " discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" + " discretePosition += 512;\n" + " \n" + " //Interleave bits(assign a morton code, also known as a z-curve)\n" + " unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" + " \n" + " //\n" + " SortDataCL mortonCodeIndexPair;\n" + " mortonCodeIndexPair.m_key = mortonCode;\n" + " mortonCodeIndexPair.m_value = leafNodeIndex;\n" + " \n" + " out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" + "}\n" + "#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" + "//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" + "//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" + "//In both cases, the bit should be cleared to access the actual node index.\n" + "int isLeafNode(int index) { return (index >> 31 == 0); }\n" + "int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" + "int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" + "//From sap.cl\n" + "#define NEW_PAIR_MARKER -1\n" + "bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? 
false : overlap;\n" + " return overlap;\n" + "}\n" + "//From sap.cl\n" + "__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" + " __global int* rootNodeIndex, \n" + " __global int2* internalNodeChildIndices, \n" + " __global b3AabbCL* internalNodeAabbs,\n" + " __global int2* internalNodeLeafIndexRanges,\n" + " \n" + " __global SortDataCL* mortonCodesAndAabbIndices,\n" + " __global int* out_numPairs, __global int4* out_overlappingPairs, \n" + " int maxPairs, int numQueryAabbs)\n" + "{\n" + " //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" + " //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" + " int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" + " if(queryBvhNodeIndex >= numQueryAabbs) return;\n" + " \n" + " int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" + " b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" + " \n" + " int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" + " \n" + " int stackSize = 1;\n" + " stack[0] = *rootNodeIndex;\n" + " \n" + " while(stackSize)\n" + " {\n" + " int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" + " --stackSize;\n" + " \n" + " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" + " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" + " \n" + " //Optimization - if the BVH is structured as a binary radix tree, then\n" + " //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" + " //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" + " {\n" + " int highestLeafIndex = (isLeaf) ? 
bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" + " if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" + " }\n" + " \n" + " //bvhRigidIndex is not used if internal node\n" + " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" + " \n" + " b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" + " if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" + " {\n" + " if(isLeaf)\n" + " {\n" + " int4 pair;\n" + " pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" + " pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" + " pair.z = NEW_PAIR_MARKER;\n" + " pair.w = NEW_PAIR_MARKER;\n" + " \n" + " int pairIndex = atomic_inc(out_numPairs);\n" + " if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" + " }\n" + " \n" + " if(!isLeaf) //Internal node\n" + " {\n" + " if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" + " {\n" + " //Error\n" + " }\n" + " else\n" + " {\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" + " }\n" + " }\n" + " }\n" + " \n" + " }\n" + "}\n" + "//From rayCastKernels.cl\n" + "typedef struct\n" + "{\n" + " float4 m_from;\n" + " float4 m_to;\n" + "} b3RayInfo;\n" + "//From rayCastKernels.cl\n" + "b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" + "{\n" + " b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" + " return normalize(normal); //OpenCL normalize == vector4 normalize\n" + "}\n" + "b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" + "b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" + "int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" + "{\n" + " //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" + " //t_min is the point of intersection with 
the closer plane, t_max is the point of intersection with the farther plane.\n" + " //\n" + " //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" + " //and min.x will be the far plane; otherwise, it is reversed.\n" + " //\n" + " //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" + " //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" + " \n" + " int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) ); //isless(x,y) returns (x < y)\n" + " \n" + " //When using vector types, the select() function checks the most signficant bit, \n" + " //but isless() sets the least significant bit.\n" + " isNegative <<= 31;\n" + " //select(b, a, condition) == condition ? a : b\n" + " //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" + " b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" + " b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" + " \n" + " b3Scalar t_min_final = 0.0f;\n" + " b3Scalar t_max_final = rayLength;\n" + " \n" + " //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" + " //Behavior of min()/max() with NaNs is undefined. 
(See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" + " //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" + " t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" + " t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" + " \n" + " return (t_min_final <= t_max_final);\n" + "}\n" + "__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" + " __global int* rootNodeIndex, \n" + " __global int2* internalNodeChildIndices, \n" + " __global b3AabbCL* internalNodeAabbs,\n" + " __global int2* internalNodeLeafIndexRanges,\n" + " __global SortDataCL* mortonCodesAndAabbIndices,\n" + " \n" + " __global b3RayInfo* rays,\n" + " \n" + " __global int* out_numRayRigidPairs, \n" + " __global int2* out_rayRigidPairs,\n" + " int maxRayRigidPairs, int numRays)\n" + "{\n" + " int rayIndex = get_global_id(0);\n" + " if(rayIndex >= numRays) return;\n" + " \n" + " //\n" + " b3Vector3 rayFrom = rays[rayIndex].m_from;\n" + " b3Vector3 rayTo = rays[rayIndex].m_to;\n" + " b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" + " b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" + " \n" + " //\n" + " int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" + " \n" + " int stackSize = 1;\n" + " stack[0] = *rootNodeIndex;\n" + " \n" + " while(stackSize)\n" + " {\n" + " int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" + " --stackSize;\n" + " \n" + " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" + " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" + " \n" + " //bvhRigidIndex is not used if internal node\n" + " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" + " \n" + " b3AabbCL bvhNodeAabb = (isLeaf) ? 
rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" + " if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n" + " {\n" + " if(isLeaf)\n" + " {\n" + " int2 rayRigidPair;\n" + " rayRigidPair.x = rayIndex;\n" + " rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" + " \n" + " int pairIndex = atomic_inc(out_numRayRigidPairs);\n" + " if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" + " }\n" + " \n" + " if(!isLeaf) //Internal node\n" + " {\n" + " if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" + " {\n" + " //Error\n" + " }\n" + " else\n" + " {\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" + " stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" + " __global int* out_numPairs, __global int4* out_overlappingPairs, \n" + " int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" + "{\n" + " int smallAabbIndex = get_global_id(0);\n" + " if(smallAabbIndex >= numSmallAabbRigids) return;\n" + " \n" + " b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" + " for(int i = 0; i < numLargeAabbRigids; ++i)\n" + " {\n" + " b3AabbCL largeAabb = largeAabbs[i];\n" + " if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" + " {\n" + " int4 pair;\n" + " pair.x = largeAabb.m_minIndices[3];\n" + " pair.y = smallAabb.m_minIndices[3];\n" + " pair.z = NEW_PAIR_MARKER;\n" + " pair.w = NEW_PAIR_MARKER;\n" + " \n" + " int pairIndex = atomic_inc(out_numPairs);\n" + " if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" + " }\n" + " }\n" + "}\n" + "__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" + " __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n" + " int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" + "{\n" 
+ " int rayIndex = get_global_id(0);\n" + " if(rayIndex >= numRays) return;\n" + " \n" + " b3Vector3 rayFrom = rays[rayIndex].m_from;\n" + " b3Vector3 rayTo = rays[rayIndex].m_to;\n" + " b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" + " b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" + " \n" + " for(int i = 0; i < numLargeAabbRigids; ++i)\n" + " {\n" + " b3AabbCL rigidAabb = largeRigidAabbs[i];\n" + " if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" + " {\n" + " int2 rayRigidPair;\n" + " rayRigidPair.x = rayIndex;\n" + " rayRigidPair.y = rigidAabb.m_minIndices[3];\n" + " \n" + " int pairIndex = atomic_inc(out_numRayRigidPairs);\n" + " if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" + " }\n" + " }\n" + "}\n" + "//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" + "//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" + "//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" + "//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" + "#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" + "#define B3_PLBVH_ROOT_NODE_MARKER -1\n" + "#define b3Int64 long\n" + "int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" + "b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" + "{\n" + " //This function only needs to return (i & j) in order for the algorithm to work,\n" + " //but it may help with debugging to mask out the lower bits.\n" + " b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" + " b3Int64 sharedBits = i & j;\n" + " b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" + " \n" + " 
return sharedBits & bitmask;\n" + "}\n" + "//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" + "int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" + "{\n" + " return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" + "}\n" + "__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" + " __global b3Int64* out_commonPrefixes,\n" + " __global int* out_commonPrefixLengths,\n" + " int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_global_id(0);\n" + " if (internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" + " //and the number of internal nodes is always numLeafNodes - 1\n" + " int leftLeafIndex = internalNodeIndex;\n" + " int rightLeafIndex = internalNodeIndex + 1;\n" + " \n" + " int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" + " int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" + " \n" + " //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" + " //Append the index of each leaf node to each morton code so that there are no duplicates.\n" + " //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" + " //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" + " //\n" + " //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" + " b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" + " b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" + " \n" + " out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" + " out_commonPrefixLengths[internalNodeIndex] = 
computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" + "}\n" + "__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" + " __global int2* out_childNodes, int numLeafNodes)\n" + "{\n" + " int leafNodeIndex = get_global_id(0);\n" + " if (leafNodeIndex >= numLeafNodes) return;\n" + " \n" + " int numInternalNodes = numLeafNodes - 1;\n" + " \n" + " int leftSplitIndex = leafNodeIndex - 1;\n" + " int rightSplitIndex = leafNodeIndex;\n" + " \n" + " int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " \n" + " //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" + " //Leaf nodes are considered as having the highest common prefix\n" + " int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" + " \n" + " //Handle cases for the edge nodes; the first and last node\n" + " //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" + " if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" + " if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" + " \n" + " int parentNodeIndex = (isLeftHigherCommonPrefix) ? 
leftSplitIndex : rightSplitIndex;\n" + " out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" + " \n" + " int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" + " \n" + " //out_childNodesAsInt[0] == int2.x == left child\n" + " //out_childNodesAsInt[1] == int2.y == right child\n" + " int isLeaf = 1;\n" + " __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" + " out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" + "}\n" + "__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" + " __global int2* out_childNodes,\n" + " __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" + " int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" + " int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" + " \n" + "//#define USE_LINEAR_SEARCH\n" + "#ifdef USE_LINEAR_SEARCH\n" + " int leftIndex = -1;\n" + " int rightIndex = -1;\n" + " \n" + " //Find nearest element to left with a lower common prefix\n" + " for(int i = internalNodeIndex - 1; i >= 0; --i)\n" + " {\n" + " int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" + " if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" + " {\n" + " leftIndex = i;\n" + " break;\n" + " }\n" + " }\n" + " \n" + " //Find nearest element to right with a lower common prefix\n" + " for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" + " {\n" + " int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" + " 
if(nodeRightSharedPrefixLength < nodePrefixLength)\n" + " {\n" + " rightIndex = i;\n" + " break;\n" + " }\n" + " }\n" + " \n" + "#else //Use binary search\n" + " //Find nearest element to left with a lower common prefix\n" + " int leftIndex = -1;\n" + " {\n" + " int lower = 0;\n" + " int upper = internalNodeIndex - 1;\n" + " \n" + " while(lower <= upper)\n" + " {\n" + " int mid = (lower + upper) / 2;\n" + " b3Int64 midPrefix = commonPrefixes[mid];\n" + " int midPrefixLength = commonPrefixLengths[mid];\n" + " \n" + " int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" + " if(nodeMidSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " int right = mid + 1;\n" + " if(right < internalNodeIndex)\n" + " {\n" + " b3Int64 rightPrefix = commonPrefixes[right];\n" + " int rightPrefixLength = commonPrefixLengths[right];\n" + " \n" + " int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" + " if(nodeRightSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " lower = right;\n" + " leftIndex = right;\n" + " }\n" + " else \n" + " {\n" + " leftIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else \n" + " {\n" + " leftIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else upper = mid - 1;\n" + " }\n" + " }\n" + " \n" + " //Find nearest element to right with a lower common prefix\n" + " int rightIndex = -1;\n" + " {\n" + " int lower = internalNodeIndex + 1;\n" + " int upper = numInternalNodes - 1;\n" + " \n" + " while(lower <= upper)\n" + " {\n" + " int mid = (lower + upper) / 2;\n" + " b3Int64 midPrefix = commonPrefixes[mid];\n" + " int midPrefixLength = commonPrefixLengths[mid];\n" + " \n" + " int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" + " if(nodeMidSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " int left = mid - 1;\n" + " if(left > internalNodeIndex)\n" + " {\n" + 
" b3Int64 leftPrefix = commonPrefixes[left];\n" + " int leftPrefixLength = commonPrefixLengths[left];\n" + " \n" + " int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" + " if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" + " {\n" + " upper = left;\n" + " rightIndex = left;\n" + " }\n" + " else \n" + " {\n" + " rightIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else \n" + " {\n" + " rightIndex = mid;\n" + " break;\n" + " }\n" + " }\n" + " else lower = mid + 1;\n" + " }\n" + " }\n" + "#endif\n" + " \n" + " //Select parent\n" + " {\n" + " int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" + " \n" + " int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" + " \n" + " if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" + " else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" + " \n" + " int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" + " \n" + " int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" + " out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? 
parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" + " \n" + " int isLeaf = 0;\n" + " if(!isRootNode)\n" + " {\n" + " int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" + " \n" + " //out_childNodesAsInt[0] == int2.x == left child\n" + " //out_childNodesAsInt[1] == int2.y == right child\n" + " __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" + " out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" + " }\n" + " else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" + " }\n" + "}\n" + "__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" + " __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" + "{\n" + " if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" + " int internalNodeIndex = get_global_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " //\n" + " int distanceFromRoot = 0;\n" + " {\n" + " int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" + " while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" + " {\n" + " parentIndex = internalNodeParentNodes[parentIndex];\n" + " ++distanceFromRoot;\n" + " }\n" + " }\n" + " out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" + " \n" + " //\n" + " __local int localMaxDistanceFromRoot;\n" + " if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" + "}\n" + "__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" + " __global int2* 
childNodes,\n" + " __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" + " int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_global_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " int distance = distanceFromRoot[internalNodeIndex];\n" + " \n" + " if(distance == processedDistance)\n" + " {\n" + " int leftChildIndex = childNodes[internalNodeIndex].x;\n" + " int rightChildIndex = childNodes[internalNodeIndex].y;\n" + " \n" + " int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" + " int isRightChildLeaf = isLeafNode(rightChildIndex);\n" + " \n" + " leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" + " rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" + " \n" + " //leftRigidIndex/rightRigidIndex is not used if internal node\n" + " int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" + " int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" + " \n" + " b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" + " b3AabbCL rightChildAabb = (isRightChildLeaf) ? 
leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" + " \n" + " b3AabbCL mergedAabb;\n" + " mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" + " mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" + " internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" + " }\n" + "}\n" + "__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" + "{\n" + " int internalNodeIndex = get_global_id(0);\n" + " if(internalNodeIndex >= numInternalNodes) return;\n" + " \n" + " int numLeafNodes = numInternalNodes + 1;\n" + " \n" + " int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" + " \n" + " int2 leafIndexRange; //x == min leaf index, y == max leaf index\n" + " \n" + " //Find lowest leaf index covered by this internal node\n" + " {\n" + " int lowestIndex = childNodes.x; //childNodes.x == Left child\n" + " while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" + " leafIndexRange.x = lowestIndex;\n" + " }\n" + " \n" + " //Find highest leaf index covered by this internal node\n" + " {\n" + " int highestIndex = childNodes.y; //childNodes.y == Right child\n" + " while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" + " leafIndexRange.y = highestIndex;\n" + " }\n" + " \n" + " //\n" + " out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h index 04d40fcf26..d6999b94cb 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h @@ -1,342 +1,341 @@ //this file is autogenerated using stringify.bat (premake 
--stringify) in the build folder of this project -static const char* sapCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Erwin Coumans\n" -"#define NEW_PAIR_MARKER -1\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} btAabbCL;\n" -"/// conservative test for overlap between two aabbs\n" -"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? 
false : overlap;\n" -" return overlap;\n" -"}\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -" return overlap;\n" -"}\n" -"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? 
false : overlap;\n" -" return overlap;\n" -"}\n" -"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numUnsortedAabbs)\n" -" return;\n" -" int j = get_global_id(1);\n" -" if (j>=numUnSortedAabbs2)\n" -" return;\n" -" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" -" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" -" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" -" {\n" -" int4 myPair;\n" -" \n" -" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" -" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" -" if (xIndex>yIndex)\n" -" {\n" -" int tmp = xIndex;\n" -" xIndex=yIndex;\n" -" yIndex=tmp;\n" -" }\n" -" \n" -" myPair.x = xIndex;\n" -" myPair.y = yIndex;\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -"}\n" -"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" for (int j=i+1;j<numObjects;j++)\n" -" {\n" -" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = aabbs[i].m_minIndices[3];\n" -" myPair.y = aabbs[j].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main 
memory\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" for (int j=i+1;j<numObjects;j++)\n" -" {\n" -" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" -" {\n" -" break;\n" -" }\n" -" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = aabbs[i].m_minIndices[3];\n" -" myPair.y = aabbs[j].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" int localId = get_local_id(0);\n" -" __local int numActiveWgItems[1];\n" -" __local int breakRequest[1];\n" -" if (localId==0)\n" -" {\n" -" numActiveWgItems[0] = 0;\n" -" breakRequest[0] = 0;\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" atomic_inc(numActiveWgItems);\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" int localBreak = 0;\n" -" int j=i+1;\n" -" do\n" -" {\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j<numObjects)\n" -" {\n" -" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" -" {\n" -" if (!localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" }\n" -" }\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j>=numObjects && !localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (!localBreak)\n" -" {\n" -" if 
(TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = aabbs[i].m_minIndices[3];\n" -" myPair.y = aabbs[j].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -" }\n" -" j++;\n" -" } while (breakRequest[0]<numActiveWgItems[0]);\n" -"}\n" -"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" int localId = get_local_id(0);\n" -" __local int numActiveWgItems[1];\n" -" __local int breakRequest[1];\n" -" __local btAabbCL localAabbs[128];// = aabbs[i];\n" -" \n" -" btAabbCL myAabb;\n" -" \n" -" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" -" float testValue = myAabb.m_maxElems[axis];\n" -" \n" -" if (localId==0)\n" -" {\n" -" numActiveWgItems[0] = 0;\n" -" breakRequest[0] = 0;\n" -" }\n" -" int localCount=0;\n" -" int block=0;\n" -" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" -" localAabbs[localId+64] = (i+block+64)<numObjects? 
aabbs[i+block+64]: aabbs[0];\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" atomic_inc(numActiveWgItems);\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" int localBreak = 0;\n" -" \n" -" int j=i+1;\n" -" do\n" -" {\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j<numObjects)\n" -" {\n" -" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" -" {\n" -" if (!localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" }\n" -" }\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (j>=numObjects && !localBreak)\n" -" {\n" -" atomic_inc(breakRequest);\n" -" localBreak = 1;\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" \n" -" if (!localBreak)\n" -" {\n" -" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" -" {\n" -" int4 myPair;\n" -" myPair.x = myAabb.m_minIndices[3];\n" -" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" -" myPair.z = NEW_PAIR_MARKER;\n" -" myPair.w = NEW_PAIR_MARKER;\n" -" int curPair = atomic_inc (pairCount);\n" -" if (curPair<maxPairs)\n" -" {\n" -" pairsOut[curPair] = myPair; //flush to main memory\n" -" }\n" -" }\n" -" }\n" -" \n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" localCount++;\n" -" if (localCount==64)\n" -" {\n" -" localCount = 0;\n" -" block+=64; \n" -" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" -" localAabbs[localId+64] = ((i+64+block)<numObjects) ? 
aabbs[i+block+64] : aabbs[0];\n" -" }\n" -" j++;\n" -" \n" -" } while (breakRequest[0]<numActiveWgItems[0]);\n" -" \n" -"}\n" -"//http://stereopsis.com/radix.html\n" -"unsigned int FloatFlip(float fl);\n" -"unsigned int FloatFlip(float fl)\n" -"{\n" -" unsigned int f = *(unsigned int*)&fl;\n" -" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" -" return f ^ mask;\n" -"}\n" -"float IFloatFlip(unsigned int f);\n" -"float IFloatFlip(unsigned int f)\n" -"{\n" -" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" -" unsigned int fl = f ^ mask;\n" -" return *(float*)&fl;\n" -"}\n" -"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" int src = destAabbs[i].m_maxIndices[3];\n" -" destAabbs[i] = allAabbs[src];\n" -" destAabbs[i].m_maxIndices[3] = src;\n" -"}\n" -"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" \n" -" \n" -" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" -" sortData[i].y = i;\n" -" \n" -"}\n" -"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numObjects)\n" -" return;\n" -" \n" -" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" -"}\n" -"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numAabbs)\n" -" return;\n" -" \n" -" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" -" \n" -" float4 s;\n" -" s = 
(smallAabb.m_max+smallAabb.m_min)*0.5f;\n" -" sum[i]=s;\n" -" sum2[i]=s*s; \n" -"}\n" -; +static const char* sapCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Erwin Coumans\n" + "#define NEW_PAIR_MARKER -1\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " };\n" + "} btAabbCL;\n" + "/// conservative test for overlap between two aabbs\n" + "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" + "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? 
false : overlap;\n" + " return overlap;\n" + "}\n" + "bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" + "bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" + " return overlap;\n" + "}\n" + "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" + "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" + " overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" + " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? 
false : overlap;\n" + " return overlap;\n" + "}\n" + "__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numUnsortedAabbs)\n" + " return;\n" + " int j = get_global_id(1);\n" + " if (j>=numUnSortedAabbs2)\n" + " return;\n" + " __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" + " __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" + " if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" + " {\n" + " int4 myPair;\n" + " \n" + " int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" + " int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" + " if (xIndex>yIndex)\n" + " {\n" + " int tmp = xIndex;\n" + " xIndex=yIndex;\n" + " yIndex=tmp;\n" + " }\n" + " \n" + " myPair.x = xIndex;\n" + " myPair.y = yIndex;\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + "}\n" + "__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " for (int j=i+1;j<numObjects;j++)\n" + " {\n" + " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = aabbs[i].m_minIndices[3];\n" + " myPair.y = aabbs[j].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " 
{\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " for (int j=i+1;j<numObjects;j++)\n" + " {\n" + " if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" + " {\n" + " break;\n" + " }\n" + " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = aabbs[i].m_minIndices[3];\n" + " myPair.y = aabbs[j].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " int localId = get_local_id(0);\n" + " __local int numActiveWgItems[1];\n" + " __local int breakRequest[1];\n" + " if (localId==0)\n" + " {\n" + " numActiveWgItems[0] = 0;\n" + " breakRequest[0] = 0;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " atomic_inc(numActiveWgItems);\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " int localBreak = 0;\n" + " int j=i+1;\n" + " do\n" + " {\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j<numObjects)\n" + " {\n" + " if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" + " {\n" + " if (!localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " }\n" + " }\n" + " }\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j>=numObjects && !localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " 
}\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (!localBreak)\n" + " {\n" + " if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = aabbs[i].m_minIndices[3];\n" + " myPair.y = aabbs[j].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + " j++;\n" + " } while (breakRequest[0]<numActiveWgItems[0]);\n" + "}\n" + "__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " int localId = get_local_id(0);\n" + " __local int numActiveWgItems[1];\n" + " __local int breakRequest[1];\n" + " __local btAabbCL localAabbs[128];// = aabbs[i];\n" + " \n" + " btAabbCL myAabb;\n" + " \n" + " myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" + " float testValue = myAabb.m_maxElems[axis];\n" + " \n" + " if (localId==0)\n" + " {\n" + " numActiveWgItems[0] = 0;\n" + " breakRequest[0] = 0;\n" + " }\n" + " int localCount=0;\n" + " int block=0;\n" + " localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" + " localAabbs[localId+64] = (i+block+64)<numObjects? 
aabbs[i+block+64]: aabbs[0];\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " atomic_inc(numActiveWgItems);\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " int localBreak = 0;\n" + " \n" + " int j=i+1;\n" + " do\n" + " {\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j<numObjects)\n" + " {\n" + " if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" + " {\n" + " if (!localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " }\n" + " }\n" + " }\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (j>=numObjects && !localBreak)\n" + " {\n" + " atomic_inc(breakRequest);\n" + " localBreak = 1;\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " \n" + " if (!localBreak)\n" + " {\n" + " if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" + " {\n" + " int4 myPair;\n" + " myPair.x = myAabb.m_minIndices[3];\n" + " myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" + " myPair.z = NEW_PAIR_MARKER;\n" + " myPair.w = NEW_PAIR_MARKER;\n" + " int curPair = atomic_inc (pairCount);\n" + " if (curPair<maxPairs)\n" + " {\n" + " pairsOut[curPair] = myPair; //flush to main memory\n" + " }\n" + " }\n" + " }\n" + " \n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + " localCount++;\n" + " if (localCount==64)\n" + " {\n" + " localCount = 0;\n" + " block+=64; \n" + " localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" + " localAabbs[localId+64] = ((i+64+block)<numObjects) ? 
aabbs[i+block+64] : aabbs[0];\n" + " }\n" + " j++;\n" + " \n" + " } while (breakRequest[0]<numActiveWgItems[0]);\n" + " \n" + "}\n" + "//http://stereopsis.com/radix.html\n" + "unsigned int FloatFlip(float fl);\n" + "unsigned int FloatFlip(float fl)\n" + "{\n" + " unsigned int f = *(unsigned int*)&fl;\n" + " unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" + " return f ^ mask;\n" + "}\n" + "float IFloatFlip(unsigned int f);\n" + "float IFloatFlip(unsigned int f)\n" + "{\n" + " unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" + " unsigned int fl = f ^ mask;\n" + " return *(float*)&fl;\n" + "}\n" + "__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " int src = destAabbs[i].m_maxIndices[3];\n" + " destAabbs[i] = allAabbs[src];\n" + " destAabbs[i].m_maxIndices[3] = src;\n" + "}\n" + "__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " \n" + " \n" + " sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" + " sortData[i].y = i;\n" + " \n" + "}\n" + "__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numObjects)\n" + " return;\n" + " \n" + " sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" + "}\n" + "__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numAabbs)\n" + " return;\n" + " \n" + " btAabbCL smallAabb = 
allAabbs[smallAabbMapping[i]];\n" + " \n" + " float4 s;\n" + " s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" + " sum[i]=s;\n" + " sum2[i]=s*s; \n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLInclude.h b/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLInclude.h index e79182d7cb..6146538263 100644 --- a/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLInclude.h +++ b/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLInclude.h @@ -17,7 +17,7 @@ subject to the following restrictions: #define B3_OPENCL_INCLUDE_H #ifdef B3_USE_CLEW - #include "clew/clew.h" +#include "clew/clew.h" #else #ifdef __APPLE__ @@ -25,7 +25,7 @@ subject to the following restrictions: #include <MiniCL/cl.h> #else #include <OpenCL/cl.h> -#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE +#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE #endif #else #ifdef USE_MINICL @@ -34,15 +34,18 @@ subject to the following restrictions: #include <CL/cl.h> #ifdef _WIN32 #include "CL/cl_gl.h" -#endif //_WIN32 +#endif //_WIN32 #endif -#endif //__APPLE__ -#endif //B3_USE_CLEW +#endif //__APPLE__ +#endif //B3_USE_CLEW #include <assert.h> #include <stdio.h> -#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); } - - -#endif //B3_OPENCL_INCLUDE_H - +#define oclCHECKERROR(a, b) \ + if ((a) != (b)) \ + { \ + printf("OCL Error : %d\n", (a)); \ + assert((a) == (b)); \ + } + +#endif //B3_OPENCL_INCLUDE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp b/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp index dd194fc7ba..fe54ea5ec9 100644 --- a/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp @@ -16,7 +16,6 @@ subject to the following restrictions: //Original author: Roman Ponomarev //Mostly Reimplemented by Erwin Coumans - bool gDebugForceLoadingFromSource = false; bool gDebugSkipLoadingBinary = false; @@ -25,7 +24,7 
@@ bool gDebugSkipLoadingBinary = false; #include <string.h> #ifdef _WIN32 -#pragma warning (disable:4996) +#pragma warning(disable : 4996) #endif #include "b3OpenCLUtils.h" //#include "b3OpenCLInclude.h" @@ -33,7 +32,7 @@ bool gDebugSkipLoadingBinary = false; #include <stdio.h> #include <stdlib.h> -#define B3_MAX_CL_DEVICES 16 //who needs 16 devices? +#define B3_MAX_CL_DEVICES 16 //who needs 16 devices? #ifdef _WIN32 #include <windows.h> @@ -46,53 +45,49 @@ bool gDebugSkipLoadingBinary = false; #endif -static const char* sCachedBinaryPath="cache"; - +static const char* sCachedBinaryPath = "cache"; //Set the preferred platform vendor using the OpenCL SDK static const char* spPlatformVendor = #if defined(CL_PLATFORM_MINI_CL) -"MiniCL, SCEA"; + "MiniCL, SCEA"; #elif defined(CL_PLATFORM_AMD) -"Advanced Micro Devices, Inc."; + "Advanced Micro Devices, Inc."; #elif defined(CL_PLATFORM_NVIDIA) -"NVIDIA Corporation"; + "NVIDIA Corporation"; #elif defined(CL_PLATFORM_INTEL) -"Intel(R) Corporation"; + "Intel(R) Corporation"; #elif defined(B3_USE_CLEW) -"clew (OpenCL Extension Wrangler library)"; + "clew (OpenCL Extension Wrangler library)"; #else -"Unknown Vendor"; + "Unknown Vendor"; #endif #ifndef CL_PLATFORM_MINI_CL #ifdef _WIN32 #ifndef B3_USE_CLEW #include "CL/cl_gl.h" -#endif //B3_USE_CLEW -#endif //_WIN32 +#endif //B3_USE_CLEW +#endif //_WIN32 #endif - -void MyFatalBreakAPPLE( const char * errstr , - const void * private_info , - size_t cb , - void * user_data ) +void MyFatalBreakAPPLE(const char* errstr, + const void* private_info, + size_t cb, + void* user_data) { - - - const char* patloc = strstr(errstr, "Warning"); - //find out if it is a warning or error, exit if error + const char* patloc = strstr(errstr, "Warning"); + //find out if it is a warning or error, exit if error - if (patloc) - { + if (patloc) + { b3Warning("Warning: %s\n", errstr); - } else - { + } + else + { b3Error("Error: %s\n", errstr); - b3Assert(0); - } - + b3Assert(0); + } } #ifdef B3_USE_CLEW 
@@ -102,30 +97,31 @@ int b3OpenCLUtils_clewInit() int result = -1; #ifdef _WIN32 - const char* cl = "OpenCL.dll"; + const char* cl = "OpenCL.dll"; #elif defined __APPLE__ - const char* cl = "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL"; -#else//presumable Linux? - //linux (tested on Ubuntu 12.10 with Catalyst 13.4 beta drivers, not that there is no symbolic link from libOpenCL.so - const char* cl = "libOpenCL.so.1"; - result = clewInit(cl); - if (result != CLEW_SUCCESS) - { - cl = "libOpenCL.so"; - } else - { - clewExit(); - } + const char* cl = "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL"; +#else //presumable Linux? \ + //linux (tested on Ubuntu 12.10 with Catalyst 13.4 beta drivers, not that there is no symbolic link from libOpenCL.so + const char* cl = "libOpenCL.so.1"; + result = clewInit(cl); + if (result != CLEW_SUCCESS) + { + cl = "libOpenCL.so"; + } + else + { + clewExit(); + } #endif - result = clewInit(cl); - if (result!=CLEW_SUCCESS) - { - b3Error("clewInit failed with error code %d\n",result); - } - else - { - b3Printf("clewInit succesfull using %s\n",cl); - } + result = clewInit(cl); + if (result != CLEW_SUCCESS) + { + b3Error("clewInit failed with error code %d\n", result); + } + else + { + b3Printf("clewInit succesfull using %s\n", cl); + } return result; } #endif @@ -136,19 +132,18 @@ int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum) b3OpenCLUtils_clewInit(); #endif - cl_platform_id pPlatforms[10] = { 0 }; + cl_platform_id pPlatforms[10] = {0}; - cl_uint numPlatforms = 0; - cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms); + cl_uint numPlatforms = 0; + cl_int ciErrNum = clGetPlatformIDs(10, pPlatforms, &numPlatforms); //cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); - if(ciErrNum != CL_SUCCESS) + if (ciErrNum != CL_SUCCESS) { - if(pErrNum != NULL) + if (pErrNum != NULL) *pErrNum = ciErrNum; } return numPlatforms; - } const char* b3OpenCLUtils_getSdkVendorName() @@ 
-164,28 +159,28 @@ void b3OpenCLUtils_setCachePath(const char* path) cl_platform_id b3OpenCLUtils_getPlatform(int platformIndex0, cl_int* pErrNum) { #ifdef B3_USE_CLEW - b3OpenCLUtils_clewInit(); + b3OpenCLUtils_clewInit(); #endif cl_platform_id platform = 0; - unsigned int platformIndex = (unsigned int )platformIndex0; + unsigned int platformIndex = (unsigned int)platformIndex0; cl_uint numPlatforms; cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); - if (platformIndex<numPlatforms) + if (platformIndex < numPlatforms) { - cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms); ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL); - if(ciErrNum != CL_SUCCESS) + if (ciErrNum != CL_SUCCESS) { - if(pErrNum != NULL) + if (pErrNum != NULL) *pErrNum = ciErrNum; return platform; } platform = platforms[platformIndex]; - free (platforms); + free(platforms); } return platform; @@ -195,30 +190,28 @@ void b3OpenCLUtils::getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInf { b3Assert(platform); cl_int ciErrNum; - ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_VENDOR,B3_MAX_STRING_LENGTH,platformInfo->m_platformVendor,NULL); - oclCHECKERROR(ciErrNum,CL_SUCCESS); - ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_NAME,B3_MAX_STRING_LENGTH,platformInfo->m_platformName,NULL); - oclCHECKERROR(ciErrNum,CL_SUCCESS); - ciErrNum = clGetPlatformInfo( platform,CL_PLATFORM_VERSION,B3_MAX_STRING_LENGTH,platformInfo->m_platformVersion,NULL); - oclCHECKERROR(ciErrNum,CL_SUCCESS); + ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, B3_MAX_STRING_LENGTH, platformInfo->m_platformVendor, NULL); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + ciErrNum = clGetPlatformInfo(platform, CL_PLATFORM_NAME, B3_MAX_STRING_LENGTH, platformInfo->m_platformName, NULL); + oclCHECKERROR(ciErrNum, CL_SUCCESS); + ciErrNum = clGetPlatformInfo(platform, 
CL_PLATFORM_VERSION, B3_MAX_STRING_LENGTH, platformInfo->m_platformVersion, NULL); + oclCHECKERROR(ciErrNum, CL_SUCCESS); } -void b3OpenCLUtils_printPlatformInfo( cl_platform_id platform) +void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform) { b3OpenCLPlatformInfo platformInfo; - b3OpenCLUtils::getPlatformInfo (platform, &platformInfo); + b3OpenCLUtils::getPlatformInfo(platform, &platformInfo); b3Printf("Platform info:\n"); - b3Printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor); - b3Printf(" CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName); - b3Printf(" CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion); + b3Printf(" CL_PLATFORM_VENDOR: \t\t\t%s\n", platformInfo.m_platformVendor); + b3Printf(" CL_PLATFORM_NAME: \t\t\t%s\n", platformInfo.m_platformName); + b3Printf(" CL_PLATFORM_VERSION: \t\t\t%s\n", platformInfo.m_platformVersion); } - - cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex) { cl_context retContext = 0; - cl_int ciErrNum=0; + cl_int ciErrNum = 0; cl_uint num_entries; cl_device_id devices[B3_MAX_CL_DEVICES]; cl_uint num_devices; @@ -228,7 +221,7 @@ cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_d * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. 
*/ - cl_context_properties cps[7] = {0,0,0,0,0,0,0}; + cl_context_properties cps[7] = {0, 0, 0, 0, 0, 0, 0}; cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)platform; #ifdef _WIN32 @@ -240,25 +233,24 @@ cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_d cps[4] = CL_WGL_HDC_KHR; cps[5] = (cl_context_properties)pGLDC; } -#endif //B3_USE_CLEW -#endif //_WIN32 +#endif //B3_USE_CLEW +#endif //_WIN32 num_entries = B3_MAX_CL_DEVICES; - - num_devices=-1; + num_devices = -1; ciErrNum = clGetDeviceIDs( platform, deviceType, - num_entries, - devices, - &num_devices); - - if (ciErrNum<0) - { - b3Printf("clGetDeviceIDs returned %d\n",ciErrNum); - return 0; - } + num_entries, + devices, + &num_devices); + + if (ciErrNum < 0) + { + b3Printf("clGetDeviceIDs returned %d\n", ciErrNum); + return 0; + } cprops = (NULL == platform) ? NULL : cps; if (!num_devices) @@ -268,32 +260,33 @@ cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_d { //search for the GPU that relates to the OpenCL context unsigned int i; - for (i=0;i<num_devices;i++) + for (i = 0; i < num_devices; i++) { - retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum); - if (ciErrNum==CL_SUCCESS) + retContext = clCreateContext(cprops, 1, &devices[i], NULL, NULL, &ciErrNum); + if (ciErrNum == CL_SUCCESS) break; } } else { - if (preferredDeviceIndex>=0 && (unsigned int)preferredDeviceIndex<num_devices) + if (preferredDeviceIndex >= 0 && (unsigned int)preferredDeviceIndex < num_devices) { //create a context of the preferred device index - retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum); - } else + retContext = clCreateContext(cprops, 1, &devices[preferredDeviceIndex], NULL, NULL, &ciErrNum); + } + else { //create a context of all devices -#if defined (__APPLE__) - retContext = clCreateContext(cprops,num_devices,devices,MyFatalBreakAPPLE,NULL,&ciErrNum); +#if defined(__APPLE__) + retContext = 
clCreateContext(cprops, num_devices, devices, MyFatalBreakAPPLE, NULL, &ciErrNum); #else - b3Printf("numDevices=%d\n",num_devices); + b3Printf("numDevices=%d\n", num_devices); - retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum); + retContext = clCreateContext(cprops, num_devices, devices, NULL, NULL, &ciErrNum); #endif } } - if(pErrNum != NULL) + if (pErrNum != NULL) { *pErrNum = ciErrNum; }; @@ -301,60 +294,58 @@ cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_d return retContext; } -cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId) +cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* retPlatformId) { #ifdef B3_USE_CLEW - b3OpenCLUtils_clewInit(); + b3OpenCLUtils_clewInit(); #endif - cl_uint numPlatforms; cl_context retContext = 0; unsigned int i; cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms); - if(ciErrNum != CL_SUCCESS) + if (ciErrNum != CL_SUCCESS) { - if(pErrNum != NULL) *pErrNum = ciErrNum; + if (pErrNum != NULL) *pErrNum = ciErrNum; return NULL; } - if(numPlatforms > 0) + if (numPlatforms > 0) { - cl_platform_id* platforms = (cl_platform_id*) malloc (sizeof(cl_platform_id)*numPlatforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms); ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL); - if(ciErrNum != CL_SUCCESS) + if (ciErrNum != CL_SUCCESS) { - if(pErrNum != NULL) + if (pErrNum != NULL) *pErrNum = ciErrNum; free(platforms); return NULL; } - - - for ( i = 0; i < numPlatforms; ++i) + for (i = 0; i < numPlatforms; ++i) { char pbuf[128]; - ciErrNum = clGetPlatformInfo( platforms[i], - CL_PLATFORM_VENDOR, - sizeof(pbuf), - pbuf, - NULL); - 
if(ciErrNum != CL_SUCCESS) + ciErrNum = clGetPlatformInfo(platforms[i], + CL_PLATFORM_VENDOR, + sizeof(pbuf), + pbuf, + NULL); + if (ciErrNum != CL_SUCCESS) { - if(pErrNum != NULL) *pErrNum = ciErrNum; + if (pErrNum != NULL) *pErrNum = ciErrNum; return NULL; } - if (preferredPlatformIndex>=0 && i==preferredPlatformIndex) + if (preferredPlatformIndex >= 0 && i == preferredPlatformIndex) { cl_platform_id tmpPlatform = platforms[0]; platforms[0] = platforms[i]; platforms[i] = tmpPlatform; break; - } else + } + else { - if(!strcmp(pbuf, spPlatformVendor)) + if (!strcmp(pbuf, spPlatformVendor)) { cl_platform_id tmpPlatform = platforms[0]; platforms[0] = platforms[i]; @@ -368,11 +359,11 @@ cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int cl_platform_id platform = platforms[i]; assert(platform); - retContext = b3OpenCLUtils_createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex,preferredPlatformIndex); + retContext = b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLContext, pGLDC, preferredDeviceIndex, preferredPlatformIndex); if (retContext) { -// printf("OpenCL platform details:\n"); + // printf("OpenCL platform details:\n"); b3OpenCLPlatformInfo platformInfo; b3OpenCLUtils::getPlatformInfo(platform, &platformInfo); @@ -384,12 +375,11 @@ cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int } } - free (platforms); + free(platforms); } return retContext; } - ////////////////////////////////////////////////////////////////////////////// //! Gets the id of the nth device from the context //! 
@@ -403,16 +393,17 @@ cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int deviceIndex) size_t szParmDataBytes; cl_device_id* cdDevices; - cl_device_id device ; + cl_device_id device; // get the list of devices associated with context clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); - if( szParmDataBytes / sizeof(cl_device_id) < (unsigned int)deviceIndex ) { + if (szParmDataBytes / sizeof(cl_device_id) < (unsigned int)deviceIndex) + { return (cl_device_id)-1; } - cdDevices = (cl_device_id*) malloc(szParmDataBytes); + cdDevices = (cl_device_id*)malloc(szParmDataBytes); clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL); @@ -427,12 +418,10 @@ int b3OpenCLUtils_getNumDevices(cl_context cxMainContext) size_t szParamDataBytes; int device_count; clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes); - device_count = (int) szParamDataBytes/ sizeof(cl_device_id); + device_count = (int)szParamDataBytes / sizeof(cl_device_id); return device_count; } - - void b3OpenCLUtils::getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info) { // CL_DEVICE_NAME @@ -514,23 +503,22 @@ void b3OpenCLUtils::getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info) clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info->m_vecWidthDouble, NULL); } - void b3OpenCLUtils_printDeviceInfo(cl_device_id device) { b3OpenCLDeviceInfo info; - b3OpenCLUtils::getDeviceInfo(device,&info); + b3OpenCLUtils::getDeviceInfo(device, &info); b3Printf("Device Info:\n"); b3Printf(" CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName); b3Printf(" CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor); b3Printf(" CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion); - if( info.m_deviceType & CL_DEVICE_TYPE_CPU ) + if (info.m_deviceType & CL_DEVICE_TYPE_CPU) b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU"); - if( info.m_deviceType & CL_DEVICE_TYPE_GPU ) + if 
(info.m_deviceType & CL_DEVICE_TYPE_GPU) b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU"); - if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR ) + if (info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR) b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR"); - if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT ) + if (info.m_deviceType & CL_DEVICE_TYPE_DEFAULT) b3Printf(" CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT"); b3Printf(" CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits); @@ -539,15 +527,15 @@ void b3OpenCLUtils_printDeviceInfo(cl_device_id device) b3Printf(" CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", info.m_workgroupSize); b3Printf(" CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency); b3Printf(" CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits); - b3Printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024))); - b3Printf(" CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024))); - b3Printf(" CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no"); + b3Printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize / (1024 * 1024))); + b3Printf(" CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize / (1024 * 1024))); + b3Printf(" CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport == CL_TRUE ? "yes" : "no"); b3Printf(" CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? 
"local" : "global"); b3Printf(" CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024)); b3Printf(" CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024)); - if( info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ) + if (info.m_queueProperties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) b3Printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE"); - if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE ) + if (info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE) b3Printf(" CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE"); b3Printf(" CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport); @@ -562,7 +550,7 @@ void b3OpenCLUtils_printDeviceInfo(cl_device_id device) b3Printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", info.m_image3dMaxDepth); if (*info.m_deviceExtensions != 0) { - b3Printf("\n CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions); + b3Printf("\n CL_DEVICE_EXTENSIONS:%s\n", info.m_deviceExtensions); } else { @@ -570,36 +558,33 @@ void b3OpenCLUtils_printDeviceInfo(cl_device_id device) } b3Printf(" CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t"); b3Printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n", - info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble); - - + info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong, info.m_vecWidthFloat, info.m_vecWidthDouble); } - static const char* strip2(const char* name, const char* pattern) { - size_t const patlen = strlen(pattern); - size_t patcnt = 0; - const char * oriptr; - const char * patloc; - // find how many times the pattern occurs in the original string - for (oriptr = name; (patloc = strstr(oriptr, pattern)); oriptr = patloc + patlen) - { + size_t const patlen = strlen(pattern); + size_t patcnt = 0; + const char* oriptr; + const char* patloc; + // find how many times the 
pattern occurs in the original string + for (oriptr = name; (patloc = strstr(oriptr, pattern)); oriptr = patloc + patlen) + { patcnt++; - } - return oriptr; + } + return oriptr; } -cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg , const char* clFileNameForCaching, bool disableBinaryCaching) +cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSourceOrg, cl_int* pErrNum, const char* additionalMacrosArg, const char* clFileNameForCaching, bool disableBinaryCaching) { - const char* additionalMacros = additionalMacrosArg?additionalMacrosArg:""; + const char* additionalMacros = additionalMacrosArg ? additionalMacrosArg : ""; if (disableBinaryCaching) { //kernelSourceOrg = 0; } - cl_program m_cpProgram=0; + cl_program m_cpProgram = 0; cl_int status; char binaryFileName[B3_MAX_STRING_LENGTH]; @@ -609,67 +594,64 @@ cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_dev const char* strippedName; int fileUpToDate = 0; #ifdef _WIN32 - int binaryFileValid=0; -#endif + int binaryFileValid = 0; +#endif if (!disableBinaryCaching && clFileNameForCaching) { clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL); clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL); - - strippedName = strip2(clFileNameForCaching,"\\"); - strippedName = strip2(strippedName,"/"); - -#ifdef _MSVC_VER - sprintf_s(binaryFileName,B3_MAX_STRING_LENGTH,"%s/%s.%s.%s.bin",sCachedBinaryPath,strippedName, deviceName,driverVersion ); + + strippedName = strip2(clFileNameForCaching, "\\"); + strippedName = strip2(strippedName, "/"); + +#ifdef _MSC_VER + sprintf_s(binaryFileName, B3_MAX_STRING_LENGTH, "%s/%s.%s.%s.bin", sCachedBinaryPath, strippedName, deviceName, driverVersion); #else - sprintf(binaryFileName,"%s/%s.%s.%s.bin",sCachedBinaryPath,strippedName, 
deviceName,driverVersion ); + sprintf(binaryFileName, "%s/%s.%s.%s.bin", sCachedBinaryPath, strippedName, deviceName, driverVersion); #endif } - if (clFileNameForCaching && !(disableBinaryCaching || gDebugSkipLoadingBinary||gDebugForceLoadingFromSource) ) + if (clFileNameForCaching && !(disableBinaryCaching || gDebugSkipLoadingBinary || gDebugForceLoadingFromSource)) { - #ifdef _WIN32 - char* bla=0; - - + char* bla = 0; //printf("searching for %s\n", binaryFileName); - FILETIME modtimeBinary; - CreateDirectoryA(sCachedBinaryPath,0); + CreateDirectoryA(sCachedBinaryPath, 0); { - - HANDLE binaryFileHandle = CreateFileA(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); - if (binaryFileHandle ==INVALID_HANDLE_VALUE) + HANDLE binaryFileHandle = CreateFileA(binaryFileName, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + if (binaryFileHandle == INVALID_HANDLE_VALUE) { DWORD errorCode; errorCode = GetLastError(); switch (errorCode) { - case ERROR_FILE_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: { b3Warning("\nCached file not found %s\n", binaryFileName); break; } - case ERROR_PATH_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: { b3Warning("\nCached file path not found %s\n", binaryFileName); break; } - default: + default: { b3Warning("\nFailed reading cached file with errorCode = %d\n", errorCode); } } - } else + } + else { - if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0) + if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary) == 0) { DWORD errorCode; errorCode = GetLastError(); b3Warning("\nGetFileTime errorCode = %d\n", errorCode); - } else + } + else { binaryFileValid = 1; } @@ -678,37 +660,35 @@ cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_dev if (binaryFileValid) { - HANDLE srcFileHandle = CreateFileA(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + HANDLE srcFileHandle = CreateFileA(clFileNameForCaching, GENERIC_READ, 0, 0, OPEN_EXISTING, 
FILE_ATTRIBUTE_NORMAL, 0); - if (srcFileHandle==INVALID_HANDLE_VALUE) + if (srcFileHandle == INVALID_HANDLE_VALUE) { - const char* prefix[]={"./","../","../../","../../../","../../../../"}; - for (int i=0;(srcFileHandle==INVALID_HANDLE_VALUE) && i<5;i++) + const char* prefix[] = {"./", "../", "../../", "../../../", "../../../../"}; + for (int i = 0; (srcFileHandle == INVALID_HANDLE_VALUE) && i < 5; i++) { char relativeFileName[1024]; - sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching); - srcFileHandle = CreateFileA(relativeFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0); + sprintf(relativeFileName, "%s%s", prefix[i], clFileNameForCaching); + srcFileHandle = CreateFileA(relativeFileName, GENERIC_READ, 0, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); } - } - - if (srcFileHandle!=INVALID_HANDLE_VALUE) + if (srcFileHandle != INVALID_HANDLE_VALUE) { FILETIME modtimeSrc; - if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0) + if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc) == 0) { DWORD errorCode; errorCode = GetLastError(); b3Warning("\nGetFileTime errorCode = %d\n", errorCode); } - if ( ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime) - ||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime))) + if ((modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime) || ((modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime) && (modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime))) { - fileUpToDate=1; - } else + fileUpToDate = 1; + } + else { - b3Warning("\nCached binary file out-of-date (%s)\n",binaryFileName); + b3Warning("\nCached binary file out-of-date (%s)\n", binaryFileName); } CloseHandle(srcFileHandle); } @@ -719,25 +699,25 @@ cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_dev errorCode = GetLastError(); switch (errorCode) { - case ERROR_FILE_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: { b3Warning("\nSrc file not 
found %s\n", clFileNameForCaching); break; } - case ERROR_PATH_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: { b3Warning("\nSrc path not found %s\n", clFileNameForCaching); break; } - default: + default: { b3Warning("\nnSrc file reading errorCode = %d\n", errorCode); } } //we should make sure the src file exists so we can verify the timestamp with binary -// assert(0); - b3Warning("Warning: cannot find OpenCL kernel %s to verify timestamp of binary cached kernel %s\n",clFileNameForCaching, binaryFileName); + // assert(0); + b3Warning("Warning: cannot find OpenCL kernel %s to verify timestamp of binary cached kernel %s\n", clFileNameForCaching, binaryFileName); fileUpToDate = true; #else //if we cannot find the source, assume it is OK in release builds @@ -745,126 +725,109 @@ cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_dev #endif } } - - } - - #else - fileUpToDate = true; - if (mkdir(sCachedBinaryPath,0777) == -1) - { - } - else - { - b3Printf("Succesfully created cache directory: %s\n", sCachedBinaryPath); - } -#endif //_WIN32 + fileUpToDate = true; + if (mkdir(sCachedBinaryPath, 0777) == -1) + { + } + else + { + b3Printf("Succesfully created cache directory: %s\n", sCachedBinaryPath); + } +#endif //_WIN32 } - - - if( fileUpToDate) + + if (fileUpToDate) { #ifdef _MSC_VER FILE* file; - if (fopen_s(&file,binaryFileName, "rb")!=0) - file=0; + if (fopen_s(&file, binaryFileName, "rb") != 0) + file = 0; #else FILE* file = fopen(binaryFileName, "rb"); #endif - + if (file) { - size_t binarySize=0; - char* binary =0; - - fseek( file, 0L, SEEK_END ); - binarySize = ftell( file ); - rewind( file ); - binary = (char*)malloc(sizeof(char)*binarySize); + size_t binarySize = 0; + char* binary = 0; + + fseek(file, 0L, SEEK_END); + binarySize = ftell(file); + rewind(file); + binary = (char*)malloc(sizeof(char) * binarySize); int bytesRead; - bytesRead = fread( binary, sizeof(char), binarySize, file ); - fclose( file ); - - m_cpProgram = 
clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status ); - b3Assert( status == CL_SUCCESS ); - status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 ); - b3Assert( status == CL_SUCCESS ); - - if( status != CL_SUCCESS ) + bytesRead = fread(binary, sizeof(char), binarySize, file); + fclose(file); + + m_cpProgram = clCreateProgramWithBinary(clContext, 1, &device, &binarySize, (const unsigned char**)&binary, 0, &status); + b3Assert(status == CL_SUCCESS); + status = clBuildProgram(m_cpProgram, 1, &device, additionalMacros, 0, 0); + b3Assert(status == CL_SUCCESS); + + if (status != CL_SUCCESS) { - char *build_log; + char* build_log; size_t ret_val_size; clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - build_log = (char*)malloc(sizeof(char)*(ret_val_size+1)); + build_log = (char*)malloc(sizeof(char) * (ret_val_size + 1)); clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); build_log[ret_val_size] = '\0'; b3Error("%s\n", build_log); - free (build_log); + free(build_log); b3Assert(0); m_cpProgram = 0; - b3Warning("clBuildProgram reported failure on cached binary: %s\n",binaryFileName); - - } else + b3Warning("clBuildProgram reported failure on cached binary: %s\n", binaryFileName); + } + else { - b3Printf("clBuildProgram successfully compiled cached binary: %s\n",binaryFileName); + b3Printf("clBuildProgram successfully compiled cached binary: %s\n", binaryFileName); } - free (binary); - - } else + free(binary); + } + else { - b3Warning("Cannot open cached binary: %s\n",binaryFileName); + b3Warning("Cannot open cached binary: %s\n", binaryFileName); } } - - - - - - - - - + if (!m_cpProgram) { - cl_int localErrNum; char* compileFlags; int flagsize; - - const char* kernelSource = kernelSourceOrg; if (!kernelSourceOrg || gDebugForceLoadingFromSource) { if (clFileNameForCaching) { - FILE* file = fopen(clFileNameForCaching, 
"rb"); //in many cases the relative path is a few levels up the directory hierarchy, so try it if (!file) { - const char* prefix[]={"../","../../","../../../","../../../../"}; - for (int i=0;!file && i<3;i++) + const char* prefix[] = {"../", "../../", "../../../", "../../../../"}; + for (int i = 0; !file && i < 3; i++) { char relativeFileName[1024]; - sprintf(relativeFileName,"%s%s",prefix[i],clFileNameForCaching); + sprintf(relativeFileName, "%s%s", prefix[i], clFileNameForCaching); file = fopen(relativeFileName, "rb"); } } if (file) { - char* kernelSrc=0; - fseek( file, 0L, SEEK_END ); - int kernelSize = ftell( file ); - rewind( file ); - kernelSrc = (char*)malloc(kernelSize+1); + char* kernelSrc = 0; + fseek(file, 0L, SEEK_END); + int kernelSize = ftell(file); + rewind(file); + kernelSrc = (char*)malloc(kernelSize + 1); int readBytes; - readBytes = fread((void*)kernelSrc,1,kernelSize, file); + readBytes = fread((void*)kernelSrc, 1, kernelSize, file); kernelSrc[kernelSize] = 0; fclose(file); kernelSource = kernelSrc; @@ -873,15 +836,14 @@ cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_dev } size_t program_length = kernelSource ? strlen(kernelSource) : 0; -#ifdef MAC //or __APPLE__? +#ifdef MAC //or __APPLE__? 
char* flags = "-cl-mad-enable -DMAC "; #else const char* flags = ""; #endif - m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum); - if (localErrNum!= CL_SUCCESS) + if (localErrNum != CL_SUCCESS) { if (pErrNum) *pErrNum = localErrNum; @@ -890,108 +852,100 @@ cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_dev // Build the program with 'mad' Optimization option - - - flagsize = sizeof(char)*(strlen(additionalMacros) + strlen(flags) + 5); - compileFlags = (char*) malloc(flagsize); + flagsize = sizeof(char) * (strlen(additionalMacros) + strlen(flags) + 5); + compileFlags = (char*)malloc(flagsize); #ifdef _MSC_VER - sprintf_s(compileFlags,flagsize, "%s %s", flags, additionalMacros); + sprintf_s(compileFlags, flagsize, "%s %s", flags, additionalMacros); #else sprintf(compileFlags, "%s %s", flags, additionalMacros); #endif localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL); - if (localErrNum!= CL_SUCCESS) + if (localErrNum != CL_SUCCESS) { - char *build_log; + char* build_log; size_t ret_val_size; clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - build_log = (char*) malloc(sizeof(char)*(ret_val_size+1)); + build_log = (char*)malloc(sizeof(char) * (ret_val_size + 1)); clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = '\0'; - b3Error("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log); - free (build_log); + free(build_log); if (pErrNum) *pErrNum = localErrNum; return 0; } - - if( !disableBinaryCaching && clFileNameForCaching ) - { // write to binary + if (!disableBinaryCaching && clFileNameForCaching) + { // write to binary cl_uint numAssociatedDevices; - status = 
clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 ); - b3Assert( status == CL_SUCCESS ); - if (numAssociatedDevices==1) + status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0); + b3Assert(status == CL_SUCCESS); + if (numAssociatedDevices == 1) { - size_t binarySize; - char* binary ; + char* binary; - status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 ); - b3Assert( status == CL_SUCCESS ); + status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0); + b3Assert(status == CL_SUCCESS); - binary = (char*)malloc(sizeof(char)*binarySize); + binary = (char*)malloc(sizeof(char) * binarySize); - status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 ); - b3Assert( status == CL_SUCCESS ); + status = clGetProgramInfo(m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0); + b3Assert(status == CL_SUCCESS); { - FILE* file=0; + FILE* file = 0; #ifdef _MSC_VER - if (fopen_s(&file,binaryFileName, "wb")!=0) - file=0; + if (fopen_s(&file, binaryFileName, "wb") != 0) + file = 0; #else file = fopen(binaryFileName, "wb"); #endif if (file) { - fwrite( binary, sizeof(char), binarySize, file ); - fclose( file ); - } else + fwrite(binary, sizeof(char), binarySize, file); + fclose(file); + } + else { b3Warning("cannot write file %s\n", binaryFileName); } } - free (binary); + free(binary); } } free(compileFlags); - } return m_cpProgram; } - -cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros ) +cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros) { - cl_kernel kernel; 
cl_int localErrNum; cl_program m_cpProgram = prog; - b3Printf("compiling kernel %s ",kernelName); + b3Printf("compiling kernel %s ", kernelName); if (!m_cpProgram) { - m_cpProgram = b3OpenCLUtils_compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros,0, false); + m_cpProgram = b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, 0, false); } - // Create the kernel kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum); if (localErrNum != CL_SUCCESS) { b3Error("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName); - assert(0); + assert(0); if (pErrNum) *pErrNum = localErrNum; return 0; @@ -1003,9 +957,7 @@ cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_devic } b3Printf("ready. \n"); - if (pErrNum) - *pErrNum = CL_SUCCESS; + *pErrNum = CL_SUCCESS; return kernel; - } diff --git a/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.h b/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.h index db6466e76b..6c82eed2a6 100644 --- a/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.h +++ b/thirdparty/bullet/Bullet3OpenCL/Initialize/b3OpenCLUtils.h @@ -22,42 +22,41 @@ subject to the following restrictions: #include "b3OpenCLInclude.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif + ///C API for OpenCL utilities: convenience functions, see below for C++ API -///C API for OpenCL utilities: convenience functions, see below for C++ API + /// CL Context optionally takes a GL context. This is a generic type because we don't really want this code + /// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise. + cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* platformId); -/// CL Context optionally takes a GL context. 
This is a generic type because we don't really want this code -/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise. -cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId); - -int b3OpenCLUtils_getNumDevices(cl_context cxMainContext); + int b3OpenCLUtils_getNumDevices(cl_context cxMainContext); -cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr); + cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr); -void b3OpenCLUtils_printDeviceInfo(cl_device_id device); + void b3OpenCLUtils_printDeviceInfo(cl_device_id device); -cl_kernel b3OpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros); + cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros); -//optional -cl_program b3OpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching, bool disableBinaryCaching); + //optional + cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros, const char* srcFileNameForCaching, bool disableBinaryCaching); -//the following optional APIs provide access using specific platform information -int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum); + //the following optional APIs provide access using specific platform information + int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum); + + ///get the nr'th platform, where nr is in the range [0..getNumPlatforms) 
+ cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum); -///get the nr'th platform, where nr is in the range [0..getNumPlatforms) -cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum); + void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform); + const char* b3OpenCLUtils_getSdkVendorName(); -void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform); + ///set the path (directory/folder) where the compiled OpenCL kernel are stored + void b3OpenCLUtils_setCachePath(const char* path); -const char* b3OpenCLUtils_getSdkVendorName(); - -///set the path (directory/folder) where the compiled OpenCL kernel are stored -void b3OpenCLUtils_setCachePath(const char* path); - -cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex); + cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex); #ifdef __cplusplus } @@ -71,37 +70,35 @@ typedef struct char m_driverVersion[B3_MAX_STRING_LENGTH]; char m_deviceExtensions[B3_MAX_STRING_LENGTH]; - cl_device_type m_deviceType; - cl_uint m_computeUnits; - size_t m_workitemDims; - size_t m_workItemSize[3]; - size_t m_image2dMaxWidth; - size_t m_image2dMaxHeight; - size_t m_image3dMaxWidth; - size_t m_image3dMaxHeight; - size_t m_image3dMaxDepth; - size_t m_workgroupSize; - cl_uint m_clockFrequency; - cl_ulong m_constantBufferSize; - cl_ulong m_localMemSize; - cl_ulong m_globalMemSize; - cl_bool m_errorCorrectionSupport; + cl_device_type m_deviceType; + cl_uint m_computeUnits; + size_t m_workitemDims; + size_t m_workItemSize[3]; + size_t m_image2dMaxWidth; + size_t m_image2dMaxHeight; + size_t m_image3dMaxWidth; + size_t m_image3dMaxHeight; + size_t m_image3dMaxDepth; + size_t m_workgroupSize; + cl_uint m_clockFrequency; + 
cl_ulong m_constantBufferSize; + cl_ulong m_localMemSize; + cl_ulong m_globalMemSize; + cl_bool m_errorCorrectionSupport; cl_device_local_mem_type m_localMemType; - cl_uint m_maxReadImageArgs; - cl_uint m_maxWriteImageArgs; + cl_uint m_maxReadImageArgs; + cl_uint m_maxWriteImageArgs; - - - cl_uint m_addressBits; - cl_ulong m_maxMemAllocSize; + cl_uint m_addressBits; + cl_ulong m_maxMemAllocSize; cl_command_queue_properties m_queueProperties; - cl_bool m_imageSupport; - cl_uint m_vecWidthChar; - cl_uint m_vecWidthShort; - cl_uint m_vecWidthInt; - cl_uint m_vecWidthLong; - cl_uint m_vecWidthFloat; - cl_uint m_vecWidthDouble; + cl_bool m_imageSupport; + cl_uint m_vecWidthChar; + cl_uint m_vecWidthShort; + cl_uint m_vecWidthInt; + cl_uint m_vecWidthLong; + cl_uint m_vecWidthFloat; + cl_uint m_vecWidthDouble; } b3OpenCLDeviceInfo; @@ -110,33 +107,32 @@ struct b3OpenCLPlatformInfo char m_platformVendor[B3_MAX_STRING_LENGTH]; char m_platformName[B3_MAX_STRING_LENGTH]; char m_platformVersion[B3_MAX_STRING_LENGTH]; - + b3OpenCLPlatformInfo() { - m_platformVendor[0]=0; - m_platformName[0]=0; - m_platformVersion[0]=0; + m_platformVendor[0] = 0; + m_platformName[0] = 0; + m_platformVersion[0] = 0; } }; - ///C++ API for OpenCL utilities: convenience functions struct b3OpenCLUtils { /// CL Context optionally takes a GL context. This is a generic type because we don't really want this code /// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise. 
- static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0) + static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1, cl_platform_id* platformId = 0) { - return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId); + return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex, platformId); } - + static inline int getNumDevices(cl_context cxMainContext) { return b3OpenCLUtils_getNumDevices(cxMainContext); } static inline cl_device_id getDevice(cl_context cxMainContext, int nr) { - return b3OpenCLUtils_getDevice(cxMainContext,nr); + return b3OpenCLUtils_getDevice(cxMainContext, nr); } static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info); @@ -146,28 +142,28 @@ struct b3OpenCLUtils b3OpenCLUtils_printDeviceInfo(device); } - static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" ) + static inline cl_kernel compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum = 0, cl_program prog = 0, const char* additionalMacros = "") { - return b3OpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource, kernelName, pErrNum, prog,additionalMacros); + return b3OpenCLUtils_compileCLKernelFromString(clContext, device, kernelSource, kernelName, pErrNum, prog, additionalMacros); } //optional - static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const 
char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0, bool disableBinaryCaching=false) + static inline cl_program compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum = 0, const char* additionalMacros = "", const char* srcFileNameForCaching = 0, bool disableBinaryCaching = false) { - return b3OpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching, disableBinaryCaching); + return b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, srcFileNameForCaching, disableBinaryCaching); } //the following optional APIs provide access using specific platform information - static inline int getNumPlatforms(cl_int* pErrNum=0) + static inline int getNumPlatforms(cl_int* pErrNum = 0) { return b3OpenCLUtils_getNumPlatforms(pErrNum); } ///get the nr'th platform, where nr is in the range [0..getNumPlatforms) - static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0) + static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum = 0) { - return b3OpenCLUtils_getPlatform(nr,pErrNum); + return b3OpenCLUtils_getPlatform(nr, pErrNum); } - + static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo); static inline void printPlatformInfo(cl_platform_id platform) @@ -179,9 +175,9 @@ struct b3OpenCLUtils { return b3OpenCLUtils_getSdkVendorName(); } - static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1) + static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1) { - return 
b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex); + return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex); } static void setCachePath(const char* path) { @@ -189,6 +185,6 @@ struct b3OpenCLUtils } }; -#endif //__cplusplus +#endif //__cplusplus -#endif // B3_OPENCL_UTILS_H +#endif // B3_OPENCL_UTILS_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h index 872f039506..27835bb747 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3BvhInfo.h @@ -5,14 +5,13 @@ struct b3BvhInfo { - b3Vector3 m_aabbMin; - b3Vector3 m_aabbMax; - b3Vector3 m_quantization; - int m_numNodes; - int m_numSubTrees; - int m_nodeOffset; - int m_subTreeOffset; - + b3Vector3 m_aabbMin; + b3Vector3 m_aabbMax; + b3Vector3 m_quantization; + int m_numNodes; + int m_numSubTrees; + int m_nodeOffset; + int m_subTreeOffset; }; -#endif //B3_BVH_INFO_H
\ No newline at end of file +#endif //B3_BVH_INFO_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp index cb30ee939b..4db717f8c3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp @@ -15,7 +15,6 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #include "b3ContactCache.h" #include "Bullet3Common/b3Transform.h" @@ -69,7 +68,7 @@ int b3ContactCache::sortCachedPoints(const b3Vector3& pt) maxPenetration = m_pointCache[i].getDistance(); } } -#endif //KEEP_DEEPEST_POINT +#endif //KEEP_DEEPEST_POINT b3Scalar res0(b3Scalar(0.)),res1(b3Scalar(0.)),res2(b3Scalar(0.)),res3(b3Scalar(0.)); @@ -251,8 +250,4 @@ void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transfo } - - - - #endif diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h index d6c9b0a07e..a15fd0b2a9 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h @@ -17,17 +17,13 @@ subject to the following restrictions: #ifndef B3_CONTACT_CACHE_H #define B3_CONTACT_CACHE_H - #include "Bullet3Common/b3Vector3.h" #include "Bullet3Common/b3Transform.h" #include "Bullet3Common/b3AlignedAllocator.h" - ///maximum contact breaking and merging threshold extern b3Scalar gContactBreakingThreshold; - - #define MANIFOLD_CACHE_SIZE 4 ///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase. 
@@ -37,24 +33,16 @@ extern b3Scalar gContactBreakingThreshold; ///reduces the cache to 4 points, when more then 4 points are added, using following rules: ///the contact point with deepest penetration is always kept, and it tries to maximuze the area covered by the points ///note that some pairs of objects might have more then one contact manifold. -B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache +B3_ATTRIBUTE_ALIGNED16(class) +b3ContactCache { - - - - /// sort cached points so most isolated points come first - int sortCachedPoints(const b3Vector3& pt); - - + int sortCachedPoints(const b3Vector3& pt); public: - B3_DECLARE_ALIGNED_ALLOCATOR(); - - - int addManifoldPoint( const b3Vector3& newPoint); + int addManifoldPoint(const b3Vector3& newPoint); /*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex) { @@ -63,18 +51,12 @@ public: } */ - - static bool validContactDistance(const b3Vector3& pt); - - /// calculated new worldspace coordinates and depth, and reject points that exceed the collision margin - static void refreshContactPoints( const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& newContactCache); - static void removeContactPoint(struct b3Contact4Data& newContactCache,int i); - + /// calculated new worldspace coordinates and depth, and reject points that exceed the collision margin + static void refreshContactPoints(const b3Transform& trA, const b3Transform& trB, struct b3Contact4Data& newContactCache); + static void removeContactPoint(struct b3Contact4Data & newContactCache, int i); }; - - -#endif //B3_CONTACT_CACHE_H +#endif //B3_CONTACT_CACHE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp index fb435aa7fd..54a104c5c8 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp @@ -16,19 +16,18 
@@ subject to the following restrictions: bool findSeparatingAxisOnGpu = true; bool splitSearchSepAxisConcave = false; bool splitSearchSepAxisConvex = true; -bool useMprGpu = true;//use mpr for edge-edge (+contact point) or sat. Needs testing on main OpenCL platforms, before enabling... +bool useMprGpu = true; //use mpr for edge-edge (+contact point) or sat. Needs testing on main OpenCL platforms, before enabling... bool bvhTraversalKernelGPU = true; bool findConcaveSeparatingAxisKernelGPU = true; -bool clipConcaveFacesAndFindContactsCPU = false;//false;//true; -bool clipConvexFacesAndFindContactsCPU = false;//false;//true; -bool reduceConcaveContactsOnGPU = true;//false; -bool reduceConvexContactsOnGPU = true;//false; +bool clipConcaveFacesAndFindContactsCPU = false; //false;//true; +bool clipConvexFacesAndFindContactsCPU = false; //false;//true; +bool reduceConcaveContactsOnGPU = true; //false; +bool reduceConvexContactsOnGPU = true; //false; bool findConvexClippingFacesGPU = true; -bool useGjk = false;///option for CPU/host testing, when findSeparatingAxisOnGpu = false -bool useGjkContacts = false;//////option for CPU/host testing when findSeparatingAxisOnGpu = false +bool useGjk = false; ///option for CPU/host testing, when findSeparatingAxisOnGpu = false +bool useGjkContacts = false; //////option for CPU/host testing when findSeparatingAxisOnGpu = false - -static int myframecount=0;///for testing +static int myframecount = 0; ///for testing ///This file was written by Erwin Coumans ///Separating axis rest based on work from Pierre Terdiman, see @@ -42,10 +41,10 @@ static int myframecount=0;///for testing //#define PERSISTENT_CONTACTS_HOST #endif -int b3g_actualSATPairTests=0; +int b3g_actualSATPairTests = 0; #include "b3ConvexHullContact.h" -#include <string.h>//memcpy +#include <string.h> //memcpy #include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h" @@ -54,8 
+53,7 @@ int b3g_actualSATPairTests=0; typedef b3AlignedObjectArray<b3Vector3> b3VertexArray; - -#include <float.h> //for FLT_MAX +#include <float.h> //for FLT_MAX #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" //#include "AdlQuaternion.h" @@ -69,7 +67,6 @@ typedef b3AlignedObjectArray<b3Vector3> b3VertexArray; #include "kernels/bvhTraversal.h" #include "kernels/primitiveContacts.h" - #include "Bullet3Geometry/b3AabbUtil.h" #define BT_NARROWPHASE_SAT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl" @@ -77,12 +74,10 @@ typedef b3AlignedObjectArray<b3Vector3> b3VertexArray; #define BT_NARROWPHASE_MPR_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl" - #define BT_NARROWPHASE_CLIPHULL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl" #define BT_NARROWPHASE_BVH_TRAVERSAL_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.cl" #define BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH "src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl" - #ifndef __global #define __global #endif @@ -91,204 +86,184 @@ typedef b3AlignedObjectArray<b3Vector3> b3VertexArray; #define __kernel #endif - #include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h" - - #define dot3F4 b3Dot -GpuSatCollision::GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue q ) -:m_context(ctx), -m_device(device), -m_queue(q), +GpuSatCollision::GpuSatCollision(cl_context ctx, cl_device_id device, cl_command_queue q) + : m_context(ctx), + m_device(device), + m_queue(q), -m_findSeparatingAxisKernel(0), -m_findSeparatingAxisVertexFaceKernel(0), -m_findSeparatingAxisEdgeEdgeKernel(0), -m_unitSphereDirections(m_context,m_queue), + 
m_findSeparatingAxisKernel(0), + m_findSeparatingAxisVertexFaceKernel(0), + m_findSeparatingAxisEdgeEdgeKernel(0), + m_unitSphereDirections(m_context, m_queue), -m_totalContactsOut(m_context, m_queue), -m_sepNormals(m_context, m_queue), -m_dmins(m_context,m_queue), + m_totalContactsOut(m_context, m_queue), + m_sepNormals(m_context, m_queue), + m_dmins(m_context, m_queue), -m_hasSeparatingNormals(m_context, m_queue), -m_concaveSepNormals(m_context, m_queue), -m_concaveHasSeparatingNormals(m_context,m_queue), -m_numConcavePairsOut(m_context, m_queue), + m_hasSeparatingNormals(m_context, m_queue), + m_concaveSepNormals(m_context, m_queue), + m_concaveHasSeparatingNormals(m_context, m_queue), + m_numConcavePairsOut(m_context, m_queue), + m_gpuCompoundPairs(m_context, m_queue), -m_gpuCompoundPairs(m_context, m_queue), + m_gpuCompoundSepNormals(m_context, m_queue), + m_gpuHasCompoundSepNormals(m_context, m_queue), - -m_gpuCompoundSepNormals(m_context, m_queue), -m_gpuHasCompoundSepNormals(m_context, m_queue), - -m_numCompoundPairsOut(m_context, m_queue) + m_numCompoundPairsOut(m_context, m_queue) { m_totalContactsOut.push_back(0); - - cl_int errNum=0; + + cl_int errNum = 0; if (1) { const char* mprSrc = mprKernelsCL; - + const char* srcConcave = satConcaveKernelsCL; - char flags[1024]={0}; -//#ifdef CL_PLATFORM_INTEL -// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl"); -//#endif - m_mprPenetrationKernel = 0; + char flags[1024] = {0}; + //#ifdef CL_PLATFORM_INTEL + // sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl"); + //#endif + m_mprPenetrationKernel = 0; m_findSeparatingAxisUnitSphereKernel = 0; if (useMprGpu) { - cl_program mprProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,mprSrc,&errNum,flags,BT_NARROWPHASE_MPR_PATH); - b3Assert(errNum==CL_SUCCESS); - - m_mprPenetrationKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, 
m_device,mprSrc, "mprPenetrationKernel",&errNum,mprProg ); + cl_program mprProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, mprSrc, &errNum, flags, BT_NARROWPHASE_MPR_PATH); + b3Assert(errNum == CL_SUCCESS); + + m_mprPenetrationKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, mprSrc, "mprPenetrationKernel", &errNum, mprProg); b3Assert(m_mprPenetrationKernel); - b3Assert(errNum==CL_SUCCESS); + b3Assert(errNum == CL_SUCCESS); - m_findSeparatingAxisUnitSphereKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "findSeparatingAxisUnitSphereKernel",&errNum,mprProg ); + m_findSeparatingAxisUnitSphereKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, mprSrc, "findSeparatingAxisUnitSphereKernel", &errNum, mprProg); b3Assert(m_findSeparatingAxisUnitSphereKernel); - b3Assert(errNum==CL_SUCCESS); + b3Assert(errNum == CL_SUCCESS); - - int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3); + int numDirections = sizeof(unitSphere162) / sizeof(b3Vector3); m_unitSphereDirections.resize(numDirections); - m_unitSphereDirections.copyFromHostPointer(unitSphere162,numDirections,0,true); - - + m_unitSphereDirections.copyFromHostPointer(unitSphere162, numDirections, 0, true); } + cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, satKernelsCL, &errNum, flags, BT_NARROWPHASE_SAT_PATH); + b3Assert(errNum == CL_SUCCESS); - cl_program satProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,satKernelsCL,&errNum,flags,BT_NARROWPHASE_SAT_PATH); - b3Assert(errNum==CL_SUCCESS); - - cl_program satConcaveProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcConcave,&errNum,flags,BT_NARROWPHASE_SAT_CONCAVE_PATH); - b3Assert(errNum==CL_SUCCESS); + cl_program satConcaveProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, srcConcave, &errNum, flags, BT_NARROWPHASE_SAT_CONCAVE_PATH); + b3Assert(errNum == CL_SUCCESS); - 
m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisKernel",&errNum,satProg ); + m_findSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, satKernelsCL, "findSeparatingAxisKernel", &errNum, satProg); b3Assert(m_findSeparatingAxisKernel); - b3Assert(errNum==CL_SUCCESS); + b3Assert(errNum == CL_SUCCESS); - - m_findSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisVertexFaceKernel",&errNum,satProg ); + m_findSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, satKernelsCL, "findSeparatingAxisVertexFaceKernel", &errNum, satProg); b3Assert(m_findSeparatingAxisVertexFaceKernel); - m_findSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findSeparatingAxisEdgeEdgeKernel",&errNum,satProg ); + m_findSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, satKernelsCL, "findSeparatingAxisEdgeEdgeKernel", &errNum, satProg); b3Assert(m_findSeparatingAxisVertexFaceKernel); - - m_findConcaveSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findConcaveSeparatingAxisKernel",&errNum,satProg ); + m_findConcaveSeparatingAxisKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, satKernelsCL, "findConcaveSeparatingAxisKernel", &errNum, satProg); b3Assert(m_findConcaveSeparatingAxisKernel); - b3Assert(errNum==CL_SUCCESS); - - m_findConcaveSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisVertexFaceKernel",&errNum,satConcaveProg ); + b3Assert(errNum == CL_SUCCESS); + + m_findConcaveSeparatingAxisVertexFaceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcConcave, "findConcaveSeparatingAxisVertexFaceKernel", 
&errNum, satConcaveProg); b3Assert(m_findConcaveSeparatingAxisVertexFaceKernel); - b3Assert(errNum==CL_SUCCESS); - - m_findConcaveSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcConcave, "findConcaveSeparatingAxisEdgeEdgeKernel",&errNum,satConcaveProg ); + b3Assert(errNum == CL_SUCCESS); + + m_findConcaveSeparatingAxisEdgeEdgeKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcConcave, "findConcaveSeparatingAxisEdgeEdgeKernel", &errNum, satConcaveProg); b3Assert(m_findConcaveSeparatingAxisEdgeEdgeKernel); - b3Assert(errNum==CL_SUCCESS); - - - - - m_findCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "findCompoundPairsKernel",&errNum,satProg ); + b3Assert(errNum == CL_SUCCESS); + + m_findCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, satKernelsCL, "findCompoundPairsKernel", &errNum, satProg); b3Assert(m_findCompoundPairsKernel); - b3Assert(errNum==CL_SUCCESS); - m_processCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,satKernelsCL, "processCompoundPairsKernel",&errNum,satProg ); + b3Assert(errNum == CL_SUCCESS); + m_processCompoundPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, satKernelsCL, "processCompoundPairsKernel", &errNum, satProg); b3Assert(m_processCompoundPairsKernel); - b3Assert(errNum==CL_SUCCESS); + b3Assert(errNum == CL_SUCCESS); } if (1) { const char* srcClip = satClipKernelsCL; - char flags[1024]={0}; -//#ifdef CL_PLATFORM_INTEL -// sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl"); -//#endif + char flags[1024] = {0}; + //#ifdef CL_PLATFORM_INTEL + // sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/satClipHullContacts.cl"); + //#endif - cl_program satClipContactsProg = 
b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,flags,BT_NARROWPHASE_CLIPHULL_PATH); - b3Assert(errNum==CL_SUCCESS); + cl_program satClipContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, srcClip, &errNum, flags, BT_NARROWPHASE_CLIPHULL_PATH); + b3Assert(errNum == CL_SUCCESS); - m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg); - b3Assert(errNum==CL_SUCCESS); + m_clipHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcClip, "clipHullHullKernel", &errNum, satClipContactsProg); + b3Assert(errNum == CL_SUCCESS); - m_clipCompoundsHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipCompoundsHullHullKernel",&errNum,satClipContactsProg); - b3Assert(errNum==CL_SUCCESS); - + m_clipCompoundsHullHullKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcClip, "clipCompoundsHullHullKernel", &errNum, satClipContactsProg); + b3Assert(errNum == CL_SUCCESS); - m_findClippingFacesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "findClippingFacesKernel",&errNum,satClipContactsProg); - b3Assert(errNum==CL_SUCCESS); + m_findClippingFacesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcClip, "findClippingFacesKernel", &errNum, satClipContactsProg); + b3Assert(errNum == CL_SUCCESS); - m_clipFacesAndFindContacts = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipFacesAndFindContactsKernel",&errNum,satClipContactsProg); - b3Assert(errNum==CL_SUCCESS); + m_clipFacesAndFindContacts = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcClip, "clipFacesAndFindContactsKernel", &errNum, satClipContactsProg); + b3Assert(errNum == CL_SUCCESS); - m_clipHullHullConcaveConvexKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, 
"clipHullHullConcaveConvexKernel",&errNum,satClipContactsProg); - b3Assert(errNum==CL_SUCCESS); + m_clipHullHullConcaveConvexKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcClip, "clipHullHullConcaveConvexKernel", &errNum, satClipContactsProg); + b3Assert(errNum == CL_SUCCESS); -// m_extractManifoldAndAddContactKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "extractManifoldAndAddContactKernel",&errNum,satClipContactsProg); - // b3Assert(errNum==CL_SUCCESS); + // m_extractManifoldAndAddContactKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "extractManifoldAndAddContactKernel",&errNum,satClipContactsProg); + // b3Assert(errNum==CL_SUCCESS); - m_newContactReductionKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, - "newContactReductionKernel",&errNum,satClipContactsProg); - b3Assert(errNum==CL_SUCCESS); + m_newContactReductionKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcClip, + "newContactReductionKernel", &errNum, satClipContactsProg); + b3Assert(errNum == CL_SUCCESS); } - else + else { - m_clipHullHullKernel=0; + m_clipHullHullKernel = 0; m_clipCompoundsHullHullKernel = 0; - m_findClippingFacesKernel = 0; - m_newContactReductionKernel=0; - m_clipFacesAndFindContacts = 0; + m_findClippingFacesKernel = 0; + m_newContactReductionKernel = 0; + m_clipFacesAndFindContacts = 0; m_clipHullHullConcaveConvexKernel = 0; -// m_extractManifoldAndAddContactKernel = 0; + // m_extractManifoldAndAddContactKernel = 0; } - if (1) + if (1) { const char* srcBvh = bvhTraversalKernelCL; - cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"",BT_NARROWPHASE_BVH_TRAVERSAL_PATH); - b3Assert(errNum==CL_SUCCESS); - - m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg,""); - b3Assert(errNum==CL_SUCCESS); + 
cl_program bvhTraversalProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, srcBvh, &errNum, "", BT_NARROWPHASE_BVH_TRAVERSAL_PATH); + b3Assert(errNum == CL_SUCCESS); + m_bvhTraversalKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, srcBvh, "bvhTraversalKernel", &errNum, bvhTraversalProg, ""); + b3Assert(errNum == CL_SUCCESS); } - - { - const char* primitiveContactsSrc = primitiveContactsKernelsCL; - cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,primitiveContactsSrc,&errNum,"",BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH); - b3Assert(errNum==CL_SUCCESS); - m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "primitiveContactsKernel",&errNum,primitiveContactsProg,""); - b3Assert(errNum==CL_SUCCESS); + { + const char* primitiveContactsSrc = primitiveContactsKernelsCL; + cl_program primitiveContactsProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, primitiveContactsSrc, &errNum, "", BT_NARROWPHASE_PRIMITIVE_CONTACT_PATH); + b3Assert(errNum == CL_SUCCESS); + + m_primitiveContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, primitiveContactsSrc, "primitiveContactsKernel", &errNum, primitiveContactsProg, ""); + b3Assert(errNum == CL_SUCCESS); - m_findConcaveSphereContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, "findConcaveSphereContactsKernel",&errNum,primitiveContactsProg ); - b3Assert(errNum==CL_SUCCESS); + m_findConcaveSphereContactsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, primitiveContactsSrc, "findConcaveSphereContactsKernel", &errNum, primitiveContactsProg); + b3Assert(errNum == CL_SUCCESS); b3Assert(m_findConcaveSphereContactsKernel); - m_processCompoundPairsPrimitivesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,primitiveContactsSrc, 
"processCompoundPairsPrimitivesKernel",&errNum,primitiveContactsProg,""); - b3Assert(errNum==CL_SUCCESS); + m_processCompoundPairsPrimitivesKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, primitiveContactsSrc, "processCompoundPairsPrimitivesKernel", &errNum, primitiveContactsProg, ""); + b3Assert(errNum == CL_SUCCESS); b3Assert(m_processCompoundPairsPrimitivesKernel); - - } - - + } } GpuSatCollision::~GpuSatCollision() { - if (m_findSeparatingAxisVertexFaceKernel) clReleaseKernel(m_findSeparatingAxisVertexFaceKernel); @@ -301,17 +276,15 @@ GpuSatCollision::~GpuSatCollision() if (m_mprPenetrationKernel) clReleaseKernel(m_mprPenetrationKernel); - if (m_findSeparatingAxisKernel) clReleaseKernel(m_findSeparatingAxisKernel); - if (m_findConcaveSeparatingAxisVertexFaceKernel) - clReleaseKernel(m_findConcaveSeparatingAxisVertexFaceKernel); + if (m_findConcaveSeparatingAxisVertexFaceKernel) + clReleaseKernel(m_findConcaveSeparatingAxisVertexFaceKernel); + + if (m_findConcaveSeparatingAxisEdgeEdgeKernel) + clReleaseKernel(m_findConcaveSeparatingAxisEdgeEdgeKernel); - - if (m_findConcaveSeparatingAxisEdgeEdgeKernel) - clReleaseKernel(m_findConcaveSeparatingAxisEdgeEdgeKernel); - if (m_findConcaveSeparatingAxisKernel) clReleaseKernel(m_findConcaveSeparatingAxisKernel); @@ -320,17 +293,17 @@ GpuSatCollision::~GpuSatCollision() if (m_processCompoundPairsKernel) clReleaseKernel(m_processCompoundPairsKernel); - - if (m_findClippingFacesKernel) - clReleaseKernel(m_findClippingFacesKernel); - - if (m_clipFacesAndFindContacts) - clReleaseKernel(m_clipFacesAndFindContacts); - if (m_newContactReductionKernel) - clReleaseKernel(m_newContactReductionKernel); + + if (m_findClippingFacesKernel) + clReleaseKernel(m_findClippingFacesKernel); + + if (m_clipFacesAndFindContacts) + clReleaseKernel(m_clipFacesAndFindContacts); + if (m_newContactReductionKernel) + clReleaseKernel(m_newContactReductionKernel); if (m_primitiveContactsKernel) 
clReleaseKernel(m_primitiveContactsKernel); - + if (m_findConcaveSphereContactsKernel) clReleaseKernel(m_findConcaveSphereContactsKernel); @@ -344,12 +317,11 @@ GpuSatCollision::~GpuSatCollision() if (m_clipHullHullConcaveConvexKernel) clReleaseKernel(m_clipHullHullConcaveConvexKernel); -// if (m_extractManifoldAndAddContactKernel) + // if (m_extractManifoldAndAddContactKernel) // clReleaseKernel(m_extractManifoldAndAddContactKernel); if (m_bvhTraversalKernel) clReleaseKernel(m_bvhTraversalKernel); - } struct MyTriangleCallback : public b3NodeOverlapCallback @@ -359,14 +331,13 @@ struct MyTriangleCallback : public b3NodeOverlapCallback virtual void processNode(int subPart, int triangleIndex) { - printf("bodyIndexA %d, bodyIndexB %d\n",m_bodyIndexA,m_bodyIndexB); + printf("bodyIndexA %d, bodyIndexB %d\n", m_bodyIndexA, m_bodyIndexB); printf("triangleIndex %d\n", triangleIndex); } }; - #define float4 b3Vector3 -#define make_float4(x,y,z,w) b3MakeVector3(x,y,z,w) +#define make_float4(x, y, z, w) b3MakeVector3(x, y, z, w) float signedDistanceFromPointToPlane(const float4& point, const float4& planeEqn, float4* closestPointOnFace) { @@ -377,9 +348,7 @@ float signedDistanceFromPointToPlane(const float4& point, const float4& planeEqn return dist; } - - -#define cross3(a,b) (a.cross(b)) +#define cross3(a, b) (a.cross(b)) b3Vector3 transform(const b3Vector3* v, const b3Vector3* pos, const b3Quaternion* orn) { b3Transform tr; @@ -390,184 +359,170 @@ b3Vector3 transform(const b3Vector3* v, const b3Vector3* pos, const b3Quaternion return res; } - -inline bool IsPointInPolygon(const float4& p, - const b3GpuFace* face, +inline bool IsPointInPolygon(const float4& p, + const b3GpuFace* face, const float4* baseVertex, - const int* convexIndices, - float4* out) + const int* convexIndices, + float4* out) { - float4 a; - float4 b; - float4 ab; - float4 ap; - float4 v; + float4 a; + float4 b; + float4 ab; + float4 ap; + float4 v; - float4 plane = 
b3MakeVector3(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f); - - if (face->m_numIndices<2) + float4 plane = b3MakeVector3(face->m_plane.x, face->m_plane.y, face->m_plane.z, 0.f); + + if (face->m_numIndices < 2) return false; - - float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]]; + float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices - 1]]; b = v0; - for(unsigned i=0; i != face->m_numIndices; ++i) - { + for (unsigned i = 0; i != face->m_numIndices; ++i) + { a = b; float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]]; b = vi; - ab = b-a; - ap = p-a; - v = cross3(ab,plane); - - if (b3Dot(ap, v) > 0.f) - { - float ab_m2 = b3Dot(ab, ab); - float rt = ab_m2 != 0.f ? b3Dot(ab, ap) / ab_m2 : 0.f; - if (rt <= 0.f) - { - *out = a; - } - else if (rt >= 1.f) - { - *out = b; - } - else - { - float s = 1.f - rt; + ab = b - a; + ap = p - a; + v = cross3(ab, plane); + + if (b3Dot(ap, v) > 0.f) + { + float ab_m2 = b3Dot(ab, ab); + float rt = ab_m2 != 0.f ? 
b3Dot(ab, ap) / ab_m2 : 0.f; + if (rt <= 0.f) + { + *out = a; + } + else if (rt >= 1.f) + { + *out = b; + } + else + { + float s = 1.f - rt; out[0].x = s * a.x + rt * b.x; out[0].y = s * a.y + rt * b.y; out[0].z = s * a.z + rt * b.z; - } - return false; - } - } - return true; + } + return false; + } + } + return true; } #define normalize3(a) (a.normalize()) - -int extractManifoldSequentialGlobal( const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx) +int extractManifoldSequentialGlobal(const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx) { - if( nPoints == 0 ) - return 0; - - if (nPoints <=4) - return nPoints; - - - if (nPoints >64) - nPoints = 64; - - float4 center = b3MakeVector3(0,0,0,0); + if (nPoints == 0) + return 0; + + if (nPoints <= 4) + return nPoints; + + if (nPoints > 64) + nPoints = 64; + + float4 center = b3MakeVector3(0, 0, 0, 0); { - - for (int i=0;i<nPoints;i++) + for (int i = 0; i < nPoints; i++) center += p[i]; center /= (float)nPoints; } - - - + // sample 4 directions - - float4 aVector = p[0] - center; - float4 u = cross3( nearNormal, aVector ); - float4 v = cross3( nearNormal, u ); - u = normalize3( u ); - v = normalize3( v ); - - - //keep point with deepest penetration - float minW= FLT_MAX; - - int minIndex=-1; - - float4 maxDots; - maxDots.x = FLT_MIN; - maxDots.y = FLT_MIN; - maxDots.z = FLT_MIN; - maxDots.w = FLT_MIN; - - // idx, distance - for(int ie = 0; ie<nPoints; ie++ ) - { - if (p[ie].w<minW) - { - minW = p[ie].w; - minIndex=ie; - } - float f; - float4 r = p[ie]-center; - f = dot3F4( u, r ); - if (f<maxDots.x) - { - maxDots.x = f; - contactIdx[0].x = ie; - } - - f = dot3F4( -u, r ); - if (f<maxDots.y) - { - maxDots.y = f; - contactIdx[0].y = ie; - } - - - f = dot3F4( v, r ); - if (f<maxDots.z) - { - maxDots.z = f; - contactIdx[0].z = ie; - } - - f = dot3F4( -v, r ); - if (f<maxDots.w) - { - maxDots.w = f; - contactIdx[0].w = ie; - } - - } - - if (contactIdx[0].x != minIndex && 
contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) - { - //replace the first contact with minimum (todo: replace contact with least penetration) - contactIdx[0].x = minIndex; - } - - return 4; - -} + float4 aVector = p[0] - center; + float4 u = cross3(nearNormal, aVector); + float4 v = cross3(nearNormal, u); + u = normalize3(u); + v = normalize3(v); + //keep point with deepest penetration + float minW = FLT_MAX; -#define MAX_VERTS 1024 + int minIndex = -1; + + float4 maxDots; + maxDots.x = FLT_MIN; + maxDots.y = FLT_MIN; + maxDots.z = FLT_MIN; + maxDots.w = FLT_MIN; + + // idx, distance + for (int ie = 0; ie < nPoints; ie++) + { + if (p[ie].w < minW) + { + minW = p[ie].w; + minIndex = ie; + } + float f; + float4 r = p[ie] - center; + f = dot3F4(u, r); + if (f < maxDots.x) + { + maxDots.x = f; + contactIdx[0].x = ie; + } + + f = dot3F4(-u, r); + if (f < maxDots.y) + { + maxDots.y = f; + contactIdx[0].y = ie; + } + + f = dot3F4(v, r); + if (f < maxDots.z) + { + maxDots.z = f; + contactIdx[0].z = ie; + } + + f = dot3F4(-v, r); + if (f < maxDots.w) + { + maxDots.w = f; + contactIdx[0].w = ie; + } + } + + if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) + { + //replace the first contact with minimum (todo: replace contact with least penetration) + contactIdx[0].x = minIndex; + } + + return 4; +} +#define MAX_VERTS 1024 -inline void project(const b3ConvexPolyhedronData& hull, const float4& pos, const b3Quaternion& orn, const float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max) +inline void project(const b3ConvexPolyhedronData& hull, const float4& pos, const b3Quaternion& orn, const float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max) { min = FLT_MAX; max = -FLT_MAX; int numVerts = hull.m_numVertices; - const float4 localDir = b3QuatRotate(orn.inverse(),dir); + const float4 
localDir = b3QuatRotate(orn.inverse(), dir); - b3Scalar offset = dot3F4(pos,dir); + b3Scalar offset = dot3F4(pos, dir); - for(int i=0;i<numVerts;i++) + for (int i = 0; i < numVerts; i++) { //b3Vector3 pt = trans * vertices[m_vertexOffset+i]; //b3Scalar dp = pt.dot(dir); //b3Vector3 vertex = vertices[hull.m_vertexOffset+i]; - b3Scalar dp = dot3F4((float4&)vertices[hull.m_vertexOffset+i],localDir); + b3Scalar dp = dot3F4((float4&)vertices[hull.m_vertexOffset + i], localDir); //b3Assert(dp==dpL); - if(dp < min) min = dp; - if(dp > max) max = dp; + if (dp < min) min = dp; + if (dp > max) max = dp; } - if(min>max) + if (min > max) { b3Scalar tmp = min; min = max; @@ -577,50 +532,48 @@ inline void project(const b3ConvexPolyhedronData& hull, const float4& pos, cons max += offset; } - -static bool TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, - const float4& posA,const b3Quaternion& ornA, - const float4& posB,const b3Quaternion& ornB, - const float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB,b3Scalar& depth) +static bool TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, + const float4& posA, const b3Quaternion& ornA, + const float4& posB, const b3Quaternion& ornB, + const float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB, b3Scalar& depth) { - b3Scalar Min0,Max0; - b3Scalar Min1,Max1; - project(hullA,posA,ornA,sep_axis,verticesA, Min0, Max0); - project(hullB,posB,ornB, sep_axis,verticesB, Min1, Max1); + b3Scalar Min0, Max0; + b3Scalar Min1, Max1; + project(hullA, posA, ornA, sep_axis, verticesA, Min0, Max0); + project(hullB, posB, ornB, sep_axis, verticesB, Min1, Max1); - if(Max0<Min1 || Max1<Min0) + if (Max0 < Min1 || Max1 < Min0) return false; b3Scalar d0 = Max0 - Min1; - assert(d0>=0.0f); + assert(d0 >= 0.0f); b3Scalar d1 = Max1 - Min0; - assert(d1>=0.0f); - depth = 
d0<d1 ? d0:d1; + assert(d1 >= 0.0f); + depth = d0 < d1 ? d0 : d1; return true; } inline bool IsAlmostZero(const b3Vector3& v) { - if(fabsf(v.x)>1e-6 || fabsf(v.y)>1e-6 || fabsf(v.z)>1e-6) return false; + if (fabsf(v.x) > 1e-6 || fabsf(v.y) > 1e-6 || fabsf(v.z) > 1e-6) return false; return true; } - -static bool findSeparatingAxis( const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, - const float4& posA1, - const b3Quaternion& ornA, - const float4& posB1, - const b3Quaternion& ornB, - const b3AlignedObjectArray<b3Vector3>& verticesA, - const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, - const b3AlignedObjectArray<b3GpuFace>& facesA, - const b3AlignedObjectArray<int>& indicesA, - const b3AlignedObjectArray<b3Vector3>& verticesB, - const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, - const b3AlignedObjectArray<b3GpuFace>& facesB, - const b3AlignedObjectArray<int>& indicesB, - - b3Vector3& sep) +static bool findSeparatingAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, + const float4& posA1, + const b3Quaternion& ornA, + const float4& posB1, + const b3Quaternion& ornB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, + const b3AlignedObjectArray<b3GpuFace>& facesA, + const b3AlignedObjectArray<int>& indicesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, + const b3AlignedObjectArray<b3GpuFace>& facesB, + const b3AlignedObjectArray<int>& indicesB, + + b3Vector3& sep) { B3_PROFILE("findSeparatingAxis"); @@ -629,41 +582,40 @@ static bool findSeparatingAxis( const b3ConvexPolyhedronData& hullA, const b3Con posA.w = 0.f; float4 posB = posB1; posB.w = 0.f; -//#ifdef TEST_INTERNAL_OBJECTS + //#ifdef TEST_INTERNAL_OBJECTS float4 c0local = (float4&)hullA.m_localCenter; float4 c0 = transform(&c0local, &posA, &ornA); float4 c1local = (float4&)hullB.m_localCenter; - float4 c1 = 
transform(&c1local,&posB,&ornB); + float4 c1 = transform(&c1local, &posB, &ornB); const float4 deltaC2 = c0 - c1; -//#endif + //#endif b3Scalar dmin = FLT_MAX; - int curPlaneTests=0; + int curPlaneTests = 0; int numFacesA = hullA.m_numFaces; // Test normals from hullA - for(int i=0;i<numFacesA;i++) + for (int i = 0; i < numFacesA; i++) { - const float4& normal = (float4&)facesA[hullA.m_faceOffset+i].m_plane; - float4 faceANormalWS = b3QuatRotate(ornA,normal); + const float4& normal = (float4&)facesA[hullA.m_faceOffset + i].m_plane; + float4 faceANormalWS = b3QuatRotate(ornA, normal); - if (dot3F4(deltaC2,faceANormalWS)<0) - faceANormalWS*=-1.f; + if (dot3F4(deltaC2, faceANormalWS) < 0) + faceANormalWS *= -1.f; curPlaneTests++; #ifdef TEST_INTERNAL_OBJECTS gExpectedNbTests++; - if(gUseInternalObject && !TestInternalObjects(transA,transB, DeltaC2, faceANormalWS, hullA, hullB, dmin)) + if (gUseInternalObject && !TestInternalObjects(transA, transB, DeltaC2, faceANormalWS, hullA, hullB, dmin)) continue; gActualNbTests++; #endif - b3Scalar d; - if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,faceANormalWS, verticesA, verticesB,d)) + if (!TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, faceANormalWS, verticesA, verticesB, d)) return false; - if(d<dmin) + if (d < dmin) { dmin = d; sep = (b3Vector3&)faceANormalWS; @@ -672,28 +624,28 @@ static bool findSeparatingAxis( const b3ConvexPolyhedronData& hullA, const b3Con int numFacesB = hullB.m_numFaces; // Test normals from hullB - for(int i=0;i<numFacesB;i++) + for (int i = 0; i < numFacesB; i++) { - float4 normal = (float4&)facesB[hullB.m_faceOffset+i].m_plane; + float4 normal = (float4&)facesB[hullB.m_faceOffset + i].m_plane; float4 WorldNormal = b3QuatRotate(ornB, normal); - if (dot3F4(deltaC2,WorldNormal)<0) + if (dot3F4(deltaC2, WorldNormal) < 0) { - WorldNormal*=-1.f; + WorldNormal *= -1.f; } curPlaneTests++; #ifdef TEST_INTERNAL_OBJECTS gExpectedNbTests++; - if(gUseInternalObject && 
!TestInternalObjects(transA,transB,DeltaC2, WorldNormal, hullA, hullB, dmin)) + if (gUseInternalObject && !TestInternalObjects(transA, transB, DeltaC2, WorldNormal, hullA, hullB, dmin)) continue; gActualNbTests++; #endif b3Scalar d; - if(!TestSepAxis(hullA, hullB,posA,ornA,posB,ornB,WorldNormal,verticesA,verticesB,d)) + if (!TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, WorldNormal, verticesA, verticesB, d)) return false; - if(d<dmin) + if (d < dmin) { dmin = d; sep = (b3Vector3&)WorldNormal; @@ -702,70 +654,65 @@ static bool findSeparatingAxis( const b3ConvexPolyhedronData& hullA, const b3Con int curEdgeEdge = 0; // Test edges - for(int e0=0;e0<hullA.m_numUniqueEdges;e0++) + for (int e0 = 0; e0 < hullA.m_numUniqueEdges; e0++) { - const float4& edge0 = (float4&) uniqueEdgesA[hullA.m_uniqueEdgesOffset+e0]; - float4 edge0World = b3QuatRotate(ornA,(float4&)edge0); + const float4& edge0 = (float4&)uniqueEdgesA[hullA.m_uniqueEdgesOffset + e0]; + float4 edge0World = b3QuatRotate(ornA, (float4&)edge0); - for(int e1=0;e1<hullB.m_numUniqueEdges;e1++) + for (int e1 = 0; e1 < hullB.m_numUniqueEdges; e1++) { - const b3Vector3 edge1 = uniqueEdgesB[hullB.m_uniqueEdgesOffset+e1]; - float4 edge1World = b3QuatRotate(ornB,(float4&)edge1); - + const b3Vector3 edge1 = uniqueEdgesB[hullB.m_uniqueEdgesOffset + e1]; + float4 edge1World = b3QuatRotate(ornB, (float4&)edge1); - float4 crossje = cross3(edge0World,edge1World); + float4 crossje = cross3(edge0World, edge1World); curEdgeEdge++; - if(!IsAlmostZero((b3Vector3&)crossje)) + if (!IsAlmostZero((b3Vector3&)crossje)) { crossje = normalize3(crossje); - if (dot3F4(deltaC2,crossje)<0) - crossje*=-1.f; - + if (dot3F4(deltaC2, crossje) < 0) + crossje *= -1.f; #ifdef TEST_INTERNAL_OBJECTS gExpectedNbTests++; - if(gUseInternalObject && !TestInternalObjects(transA,transB,DeltaC2, Cross, hullA, hullB, dmin)) + if (gUseInternalObject && !TestInternalObjects(transA, transB, DeltaC2, Cross, hullA, hullB, dmin)) continue; gActualNbTests++; 
#endif b3Scalar dist; - if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,crossje, verticesA,verticesB,dist)) + if (!TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, crossje, verticesA, verticesB, dist)) return false; - if(dist<dmin) + if (dist < dmin) { dmin = dist; sep = (b3Vector3&)crossje; } } } - } - - if((dot3F4(-deltaC2,(float4&)sep))>0.0f) + if ((dot3F4(-deltaC2, (float4&)sep)) > 0.0f) sep = -sep; return true; } - -bool findSeparatingAxisEdgeEdge( __global const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, - const b3Float4& posA1, - const b3Quat& ornA, - const b3Float4& posB1, - const b3Quat& ornB, - const b3Float4& DeltaC2, - __global const b3AlignedObjectArray<float4>& vertices, - __global const b3AlignedObjectArray<float4>& uniqueEdges, - __global const b3AlignedObjectArray<b3GpuFace>& faces, - __global const b3AlignedObjectArray<int>& indices, - float4* sep, - float* dmin) +bool findSeparatingAxisEdgeEdge(__global const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB, + const b3Float4& posA1, + const b3Quat& ornA, + const b3Float4& posB1, + const b3Quat& ornB, + const b3Float4& DeltaC2, + __global const b3AlignedObjectArray<float4>& vertices, + __global const b3AlignedObjectArray<float4>& uniqueEdges, + __global const b3AlignedObjectArray<b3GpuFace>& faces, + __global const b3AlignedObjectArray<int>& indices, + float4* sep, + float* dmin) { -// int i = get_global_id(0); + // int i = get_global_id(0); float4 posA = posA1; posA.w = 0.f; @@ -776,97 +723,89 @@ bool findSeparatingAxisEdgeEdge( __global const b3ConvexPolyhedronData* hullA, _ int curEdgeEdge = 0; // Test edges - for(int e0=0;e0<hullA->m_numUniqueEdges;e0++) + for (int e0 = 0; e0 < hullA->m_numUniqueEdges; e0++) { - const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0]; - float4 edge0World = b3QuatRotate(ornA,edge0); + const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset + e0]; + float4 edge0World = 
b3QuatRotate(ornA, edge0); - for(int e1=0;e1<hullB->m_numUniqueEdges;e1++) + for (int e1 = 0; e1 < hullB->m_numUniqueEdges; e1++) { - const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1]; - float4 edge1World = b3QuatRotate(ornB,edge1); + const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset + e1]; + float4 edge1World = b3QuatRotate(ornB, edge1); - - float4 crossje = cross3(edge0World,edge1World); + float4 crossje = cross3(edge0World, edge1World); curEdgeEdge++; - if(!IsAlmostZero(crossje)) + if (!IsAlmostZero(crossje)) { crossje = normalize3(crossje); - if (dot3F4(DeltaC2,crossje)<0) - crossje*=-1.f; - + if (dot3F4(DeltaC2, crossje) < 0) + crossje *= -1.f; + float dist; bool result = true; { - float Min0,Max0; - float Min1,Max1; - project(*hullA,posA,ornA,crossje,vertices, Min0, Max0); - project(*hullB,posB,ornB,crossje,vertices, Min1, Max1); - - if(Max0<Min1 || Max1<Min0) + float Min0, Max0; + float Min1, Max1; + project(*hullA, posA, ornA, crossje, vertices, Min0, Max0); + project(*hullB, posB, ornB, crossje, vertices, Min1, Max1); + + if (Max0 < Min1 || Max1 < Min0) result = false; - + float d0 = Max0 - Min1; float d1 = Max1 - Min0; - dist = d0<d1 ? d0:d1; + dist = d0 < d1 ? 
d0 : d1; result = true; - } - - if(dist<*dmin) + if (dist < *dmin) { *dmin = dist; *sep = crossje; } } } - } - - if((dot3F4(-DeltaC2,*sep))>0.0f) + if ((dot3F4(-DeltaC2, *sep)) > 0.0f) { *sep = -(*sep); } return true; } - -__inline float4 lerp3(const float4& a,const float4& b, float t) +__inline float4 lerp3(const float4& a, const float4& b, float t) { - return b3MakeVector3( a.x + (b.x - a.x) * t, - a.y + (b.y - a.y) * t, - a.z + (b.z - a.z) * t, - 0.f); + return b3MakeVector3(a.x + (b.x - a.x) * t, + a.y + (b.y - a.y) * t, + a.z + (b.z - a.z) * t, + 0.f); } - // Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut -int clipFace(const float4* pVtxIn, int numVertsIn, float4& planeNormalWS,float planeEqWS, float4* ppVtxOut) +int clipFace(const float4* pVtxIn, int numVertsIn, float4& planeNormalWS, float planeEqWS, float4* ppVtxOut) { - int ve; float ds, de; int numVertsOut = 0; if (numVertsIn < 2) return 0; - float4 firstVertex=pVtxIn[numVertsIn-1]; + float4 firstVertex = pVtxIn[numVertsIn - 1]; float4 endVertex = pVtxIn[0]; - - ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS; + + ds = dot3F4(planeNormalWS, firstVertex) + planeEqWS; for (ve = 0; ve < numVertsIn; ve++) { - endVertex=pVtxIn[ve]; + endVertex = pVtxIn[ve]; - de = dot3F4(planeNormalWS,endVertex)+planeEqWS; + de = dot3F4(planeNormalWS, endVertex) + planeEqWS; - if (ds<0) + if (ds < 0) { - if (de<0) + if (de < 0) { // Start < 0, end < 0, so output endVertex ppVtxOut[numVertsOut++] = endVertex; @@ -874,15 +813,15 @@ int clipFace(const float4* pVtxIn, int numVertsIn, float4& planeNormalWS,float p else { // Start < 0, end >= 0, so output intersection - ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) ); + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex, (ds * 1.f / (ds - de))); } } else { - if (de<0) + if (de < 0) { // Start >= 0, end < 0 so output intersection and end - ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 
1.f/(ds - de)) ); + ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex, (ds * 1.f / (ds - de))); ppVtxOut[numVertsOut++] = endVertex; } } @@ -892,36 +831,35 @@ int clipFace(const float4* pVtxIn, int numVertsIn, float4& planeNormalWS,float p return numVertsOut; } - -int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedronData* hullA, - const float4& posA, const b3Quaternion& ornA, float4* worldVertsB1, int numWorldVertsB1, - float4* worldVertsB2, int capacityWorldVertsB2, - const float minDist, float maxDist, - const b3AlignedObjectArray<float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA, - //const float4* verticesB, const b3GpuFace* facesB, const int* indicesB, - float4* contactsOut, - int contactCapacity) +int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedronData* hullA, + const float4& posA, const b3Quaternion& ornA, float4* worldVertsB1, int numWorldVertsB1, + float4* worldVertsB2, int capacityWorldVertsB2, + const float minDist, float maxDist, + const b3AlignedObjectArray<float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA, + //const float4* verticesB, const b3GpuFace* facesB, const int* indicesB, + float4* contactsOut, + int contactCapacity) { int numContactsOut = 0; float4* pVtxIn = worldVertsB1; float4* pVtxOut = worldVertsB2; - + int numVertsIn = numWorldVertsB1; int numVertsOut = 0; - int closestFaceA=-1; + int closestFaceA = -1; { float dmin = FLT_MAX; - for(int face=0;face<hullA->m_numFaces;face++) + for (int face = 0; face < hullA->m_numFaces; face++) { const float4 Normal = b3MakeVector3( - facesA[hullA->m_faceOffset+face].m_plane.x, - facesA[hullA->m_faceOffset+face].m_plane.y, - facesA[hullA->m_faceOffset+face].m_plane.z,0.f); - const float4 faceANormalWS = b3QuatRotate(ornA,Normal); - - float d = dot3F4(faceANormalWS,separatingNormal); + facesA[hullA->m_faceOffset + face].m_plane.x, 
+ facesA[hullA->m_faceOffset + face].m_plane.y, + facesA[hullA->m_faceOffset + face].m_plane.z, 0.f); + const float4 faceANormalWS = b3QuatRotate(ornA, Normal); + + float d = dot3F4(faceANormalWS, separatingNormal); if (d < dmin) { dmin = d; @@ -929,33 +867,33 @@ int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedron } } } - if (closestFaceA<0) + if (closestFaceA < 0) return numContactsOut; - b3GpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA]; + b3GpuFace polyA = facesA[hullA->m_faceOffset + closestFaceA]; // clip polygon to back of planes of all faces of hull A that are adjacent to witness face -// int numContacts = numWorldVertsB1; + // int numContacts = numWorldVertsB1; int numVerticesA = polyA.m_numIndices; - for(int e0=0;e0<numVerticesA;e0++) + for (int e0 = 0; e0 < numVerticesA; e0++) { - const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]]; - const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]]; + const float4 a = verticesA[hullA->m_vertexOffset + indicesA[polyA.m_indexOffset + e0]]; + const float4 b = verticesA[hullA->m_vertexOffset + indicesA[polyA.m_indexOffset + ((e0 + 1) % numVerticesA)]]; const float4 edge0 = a - b; - const float4 WorldEdge0 = b3QuatRotate(ornA,edge0); - float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); - float4 worldPlaneAnormal1 = b3QuatRotate(ornA,planeNormalA); + const float4 WorldEdge0 = b3QuatRotate(ornA, edge0); + float4 planeNormalA = make_float4(polyA.m_plane.x, polyA.m_plane.y, polyA.m_plane.z, 0.f); + float4 worldPlaneAnormal1 = b3QuatRotate(ornA, planeNormalA); + + float4 planeNormalWS1 = -cross3(WorldEdge0, worldPlaneAnormal1); + float4 worldA1 = transform(&a, &posA, &ornA); + float planeEqWS1 = -dot3F4(worldA1, planeNormalWS1); - float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1); - float4 worldA1 = transform(&a,&posA,&ornA); - float planeEqWS1 = 
-dot3F4(worldA1,planeNormalWS1); - float4 planeNormalWS = planeNormalWS1; - float planeEqWS=planeEqWS1; - + float planeEqWS = planeEqWS1; + //clip face //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS); - numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut); + numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS, planeEqWS, pVtxOut); //btSwap(pVtxIn,pVtxOut); float4* tmp = pVtxOut; @@ -965,32 +903,32 @@ int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedron numVertsOut = 0; } - // only keep points that are behind the witness face { - float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f); + float4 localPlaneNormal = make_float4(polyA.m_plane.x, polyA.m_plane.y, polyA.m_plane.z, 0.f); float localPlaneEq = polyA.m_plane.w; - float4 planeNormalWS = b3QuatRotate(ornA,localPlaneNormal); - float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA); - for (int i=0;i<numVertsIn;i++) + float4 planeNormalWS = b3QuatRotate(ornA, localPlaneNormal); + float planeEqWS = localPlaneEq - dot3F4(planeNormalWS, posA); + for (int i = 0; i < numVertsIn; i++) { - float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; - if (depth <=minDist) + float depth = dot3F4(planeNormalWS, pVtxIn[i]) + planeEqWS; + if (depth <= minDist) { depth = minDist; } - if (numContactsOut<contactCapacity) + if (numContactsOut < contactCapacity) { - if (depth <=maxDist) + if (depth <= maxDist) { float4 pointInWorld = pVtxIn[i]; //resultOut.addContactPoint(separatingNormal,point,depth); - contactsOut[numContactsOut++] = b3MakeVector3(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); + contactsOut[numContactsOut++] = b3MakeVector3(pointInWorld.x, pointInWorld.y, pointInWorld.z, depth); //printf("depth=%f\n",depth); } - } else + } + else { - b3Error("exceeding contact capacity (%d,%df)\n", numContactsOut,contactCapacity); + b3Error("exceeding contact capacity (%d,%df)\n", numContactsOut, contactCapacity); } } } @@ 
-998,62 +936,60 @@ int clipFaceAgainstHull(const float4& separatingNormal, const b3ConvexPolyhedron return numContactsOut; } +static int clipHullAgainstHull(const float4& separatingNormal, + const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, + const float4& posA, const b3Quaternion& ornA, const float4& posB, const b3Quaternion& ornB, + float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts, + const float minDist, float maxDist, + const b3AlignedObjectArray<float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA, + const b3AlignedObjectArray<float4>& verticesB, const b3AlignedObjectArray<b3GpuFace>& facesB, const b3AlignedObjectArray<int>& indicesB, - -static int clipHullAgainstHull(const float4& separatingNormal, - const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB, - const float4& posA, const b3Quaternion& ornA,const float4& posB, const b3Quaternion& ornB, - float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts, - const float minDist, float maxDist, - const b3AlignedObjectArray<float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA, - const b3AlignedObjectArray<float4>& verticesB, const b3AlignedObjectArray<b3GpuFace>& facesB, const b3AlignedObjectArray<int>& indicesB, - - float4* contactsOut, - int contactCapacity) + float4* contactsOut, + int contactCapacity) { int numContactsOut = 0; - int numWorldVertsB1= 0; - + int numWorldVertsB1 = 0; + B3_PROFILE("clipHullAgainstHull"); -// float curMaxDist=maxDist; - int closestFaceB=-1; + // float curMaxDist=maxDist; + int closestFaceB = -1; float dmax = -FLT_MAX; { //B3_PROFILE("closestFaceB"); - if (hullB.m_numFaces!=1) + if (hullB.m_numFaces != 1) { //printf("wtf\n"); } static bool once = true; //printf("separatingNormal=%f,%f,%f\n",separatingNormal.x,separatingNormal.y,separatingNormal.z); - - for(int face=0;face<hullB.m_numFaces;face++) + + 
for (int face = 0; face < hullB.m_numFaces; face++) { #ifdef BT_DEBUG_SAT_FACE if (once) - printf("face %d\n",face); - const b3GpuFace* faceB = &facesB[hullB.m_faceOffset+face]; + printf("face %d\n", face); + const b3GpuFace* faceB = &facesB[hullB.m_faceOffset + face]; if (once) { - for (int i=0;i<faceB->m_numIndices;i++) + for (int i = 0; i < faceB->m_numIndices; i++) { - float4 vert = verticesB[hullB.m_vertexOffset+indicesB[faceB->m_indexOffset+i]]; - printf("vert[%d] = %f,%f,%f\n",i,vert.x,vert.y,vert.z); + float4 vert = verticesB[hullB.m_vertexOffset + indicesB[faceB->m_indexOffset + i]]; + printf("vert[%d] = %f,%f,%f\n", i, vert.x, vert.y, vert.z); } } -#endif //BT_DEBUG_SAT_FACE - //if (facesB[hullB.m_faceOffset+face].m_numIndices>2) +#endif //BT_DEBUG_SAT_FACE \ + //if (facesB[hullB.m_faceOffset+face].m_numIndices>2) { - const float4 Normal = b3MakeVector3(facesB[hullB.m_faceOffset+face].m_plane.x, - facesB[hullB.m_faceOffset+face].m_plane.y, facesB[hullB.m_faceOffset+face].m_plane.z,0.f); + const float4 Normal = b3MakeVector3(facesB[hullB.m_faceOffset + face].m_plane.x, + facesB[hullB.m_faceOffset + face].m_plane.y, facesB[hullB.m_faceOffset + face].m_plane.z, 0.f); const float4 WorldNormal = b3QuatRotate(ornB, Normal); #ifdef BT_DEBUG_SAT_FACE if (once) - printf("faceNormal = %f,%f,%f\n",Normal.x,Normal.y,Normal.z); + printf("faceNormal = %f,%f,%f\n", Normal.x, Normal.y, Normal.z); #endif - float d = dot3F4(WorldNormal,separatingNormal); + float d = dot3F4(WorldNormal, separatingNormal); if (d > dmax) { dmax = d; @@ -1064,184 +1000,176 @@ static int clipHullAgainstHull(const float4& separatingNormal, once = false; } - - b3Assert(closestFaceB>=0); + b3Assert(closestFaceB >= 0); { //B3_PROFILE("worldVertsB1"); - const b3GpuFace& polyB = facesB[hullB.m_faceOffset+closestFaceB]; + const b3GpuFace& polyB = facesB[hullB.m_faceOffset + closestFaceB]; const int numVertices = polyB.m_numIndices; - for(int e0=0;e0<numVertices;e0++) + for (int e0 = 0; e0 < 
numVertices; e0++) { - const float4& b = verticesB[hullB.m_vertexOffset+indicesB[polyB.m_indexOffset+e0]]; - worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB); + const float4& b = verticesB[hullB.m_vertexOffset + indicesB[polyB.m_indexOffset + e0]]; + worldVertsB1[numWorldVertsB1++] = transform(&b, &posB, &ornB); } } - if (closestFaceB>=0) + if (closestFaceB >= 0) { //B3_PROFILE("clipFaceAgainstHull"); - numContactsOut = clipFaceAgainstHull((float4&)separatingNormal, &hullA, - posA,ornA, - worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist, - verticesA, facesA, indicesA, - contactsOut,contactCapacity); + numContactsOut = clipFaceAgainstHull((float4&)separatingNormal, &hullA, + posA, ornA, + worldVertsB1, numWorldVertsB1, worldVertsB2, capacityWorldVerts, minDist, maxDist, + verticesA, facesA, indicesA, + contactsOut, contactCapacity); } return numContactsOut; } +#define PARALLEL_SUM(v, n) \ + for (int j = 1; j < n; j++) v[0] += v[j]; +#define PARALLEL_DO(execution, n) \ + for (int ie = 0; ie < n; ie++) \ + { \ + execution; \ + } +#define REDUCE_MAX(v, n) \ + { \ + int i = 0; \ + for (int offset = 0; offset < n; offset++) v[i] = (v[i].y > v[i + offset].y) ? v[i] : v[i + offset]; \ + } +#define REDUCE_MIN(v, n) \ + { \ + int i = 0; \ + for (int offset = 0; offset < n; offset++) v[i] = (v[i].y < v[i + offset].y) ? v[i] : v[i + offset]; \ + } +int extractManifold(const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx) +{ + if (nPoints == 0) + return 0; + if (nPoints <= 4) + return nPoints; + if (nPoints > 64) + nPoints = 64; - -#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j]; -#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;} -#define REDUCE_MAX(v, n) {int i=0;\ -for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; } -#define REDUCE_MIN(v, n) {int i=0;\ -for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? 
v[i]: v[i+offset]; } - -int extractManifold(const float4* p, int nPoints, const float4& nearNormal, b3Int4* contactIdx) -{ - if( nPoints == 0 ) - return 0; - - if (nPoints <=4) - return nPoints; - - - if (nPoints >64) - nPoints = 64; - - float4 center = make_float4(0,0,0,0); + float4 center = make_float4(0, 0, 0, 0); { - - for (int i=0;i<nPoints;i++) + for (int i = 0; i < nPoints; i++) center += p[i]; center /= (float)nPoints; } - - - + // sample 4 directions - - float4 aVector = p[0] - center; - float4 u = cross3( nearNormal, aVector ); - float4 v = cross3( nearNormal, u ); - u = normalize3( u ); - v = normalize3( v ); - - - //keep point with deepest penetration - float minW= FLT_MAX; - - int minIndex=-1; - - float4 maxDots; - maxDots.x = FLT_MIN; - maxDots.y = FLT_MIN; - maxDots.z = FLT_MIN; - maxDots.w = FLT_MIN; - - // idx, distance - for(int ie = 0; ie<nPoints; ie++ ) - { - if (p[ie].w<minW) - { - minW = p[ie].w; - minIndex=ie; - } - float f; - float4 r = p[ie]-center; - f = dot3F4( u, r ); - if (f<maxDots.x) - { - maxDots.x = f; - contactIdx[0].x = ie; - } - - f = dot3F4( -u, r ); - if (f<maxDots.y) - { - maxDots.y = f; - contactIdx[0].y = ie; - } - - - f = dot3F4( v, r ); - if (f<maxDots.z) - { - maxDots.z = f; - contactIdx[0].z = ie; - } - - f = dot3F4( -v, r ); - if (f<maxDots.w) - { - maxDots.w = f; - contactIdx[0].w = ie; - } - - } - - if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) - { - //replace the first contact with minimum (todo: replace contact with least penetration) - contactIdx[0].x = minIndex; - } - - return 4; - -} + float4 aVector = p[0] - center; + float4 u = cross3(nearNormal, aVector); + float4 v = cross3(nearNormal, u); + u = normalize3(u); + v = normalize3(v); + + //keep point with deepest penetration + float minW = FLT_MAX; + int minIndex = -1; + float4 maxDots; + maxDots.x = FLT_MIN; + maxDots.y = FLT_MIN; + maxDots.z = FLT_MIN; + maxDots.w = FLT_MIN; + + // 
idx, distance + for (int ie = 0; ie < nPoints; ie++) + { + if (p[ie].w < minW) + { + minW = p[ie].w; + minIndex = ie; + } + float f; + float4 r = p[ie] - center; + f = dot3F4(u, r); + if (f < maxDots.x) + { + maxDots.x = f; + contactIdx[0].x = ie; + } + + f = dot3F4(-u, r); + if (f < maxDots.y) + { + maxDots.y = f; + contactIdx[0].y = ie; + } + + f = dot3F4(v, r); + if (f < maxDots.z) + { + maxDots.z = f; + contactIdx[0].z = ie; + } + + f = dot3F4(-v, r); + if (f < maxDots.w) + { + maxDots.w = f; + contactIdx[0].w = ie; + } + } + + if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex) + { + //replace the first contact with minimum (todo: replace contact with least penetration) + contactIdx[0].x = minIndex; + } + + return 4; +} int clipHullHullSingle( - int bodyIndexA, int bodyIndexB, - const float4& posA, - const b3Quaternion& ornA, - const float4& posB, - const b3Quaternion& ornB, + int bodyIndexA, int bodyIndexB, + const float4& posA, + const b3Quaternion& ornA, + const float4& posB, + const b3Quaternion& ornB, - int collidableIndexA, int collidableIndexB, + int collidableIndexA, int collidableIndexB, - const b3AlignedObjectArray<b3RigidBodyData>* bodyBuf, - b3AlignedObjectArray<b3Contact4>* globalContactOut, - int& nContacts, - - const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataA, - const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataB, - - const b3AlignedObjectArray<b3Vector3>& verticesA, - const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, - const b3AlignedObjectArray<b3GpuFace>& facesA, - const b3AlignedObjectArray<int>& indicesA, - - const b3AlignedObjectArray<b3Vector3>& verticesB, - const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, - const b3AlignedObjectArray<b3GpuFace>& facesB, - const b3AlignedObjectArray<int>& indicesB, - - const b3AlignedObjectArray<b3Collidable>& hostCollidablesA, - const b3AlignedObjectArray<b3Collidable>& hostCollidablesB, - 
const b3Vector3& sepNormalWorldSpace, - int maxContactCapacity ) + const b3AlignedObjectArray<b3RigidBodyData>* bodyBuf, + b3AlignedObjectArray<b3Contact4>* globalContactOut, + int& nContacts, + + const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataA, + const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataB, + + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA, + const b3AlignedObjectArray<b3GpuFace>& facesA, + const b3AlignedObjectArray<int>& indicesA, + + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB, + const b3AlignedObjectArray<b3GpuFace>& facesB, + const b3AlignedObjectArray<int>& indicesB, + + const b3AlignedObjectArray<b3Collidable>& hostCollidablesA, + const b3AlignedObjectArray<b3Collidable>& hostCollidablesB, + const b3Vector3& sepNormalWorldSpace, + int maxContactCapacity) { int contactIndex = -1; b3ConvexPolyhedronData hullA, hullB; - - b3Collidable colA = hostCollidablesA[collidableIndexA]; - hullA = hostConvexDataA[colA.m_shapeIndex]; - //printf("numvertsA = %d\n",hullA.m_numVertices); - - - b3Collidable colB = hostCollidablesB[collidableIndexB]; - hullB = hostConvexDataB[colB.m_shapeIndex]; - //printf("numvertsB = %d\n",hullB.m_numVertices); - - + + b3Collidable colA = hostCollidablesA[collidableIndexA]; + hullA = hostConvexDataA[colA.m_shapeIndex]; + //printf("numvertsA = %d\n",hullA.m_numVertices); + + b3Collidable colB = hostCollidablesB[collidableIndexB]; + hullB = hostConvexDataB[colB.m_shapeIndex]; + //printf("numvertsB = %d\n",hullB.m_numVertices); + float4 contactsOut[MAX_VERTS]; int localContactCapacity = MAX_VERTS; @@ -1249,133 +1177,125 @@ int clipHullHullSingle( b3Assert(_finite(bodyBuf->at(bodyIndexA).m_pos.x)); b3Assert(_finite(bodyBuf->at(bodyIndexB).m_pos.x)); #endif - - + { - float4 worldVertsB1[MAX_VERTS]; float4 worldVertsB2[MAX_VERTS]; int capacityWorldVerts = MAX_VERTS; - float4 hostNormal = 
make_float4(sepNormalWorldSpace.x,sepNormalWorldSpace.y,sepNormalWorldSpace.z,0.f); + float4 hostNormal = make_float4(sepNormalWorldSpace.x, sepNormalWorldSpace.y, sepNormalWorldSpace.z, 0.f); int shapeA = hostCollidablesA[collidableIndexA].m_shapeIndex; int shapeB = hostCollidablesB[collidableIndexB].m_shapeIndex; b3Scalar minDist = -1; b3Scalar maxDist = 0.; - - - b3Transform trA,trB; + b3Transform trA, trB; { - //B3_PROFILE("transform computation"); - //trA.setIdentity(); - trA.setOrigin(b3MakeVector3(posA.x,posA.y,posA.z)); - trA.setRotation(b3Quaternion(ornA.x,ornA.y,ornA.z,ornA.w)); - - //trB.setIdentity(); - trB.setOrigin(b3MakeVector3(posB.x,posB.y,posB.z)); - trB.setRotation(b3Quaternion(ornB.x,ornB.y,ornB.z,ornB.w)); + //B3_PROFILE("transform computation"); + //trA.setIdentity(); + trA.setOrigin(b3MakeVector3(posA.x, posA.y, posA.z)); + trA.setRotation(b3Quaternion(ornA.x, ornA.y, ornA.z, ornA.w)); + + //trB.setIdentity(); + trB.setOrigin(b3MakeVector3(posB.x, posB.y, posB.z)); + trB.setRotation(b3Quaternion(ornB.x, ornB.y, ornB.z, ornB.w)); } b3Quaternion trAorn = trA.getRotation(); - b3Quaternion trBorn = trB.getRotation(); - - int numContactsOut = clipHullAgainstHull(hostNormal, - hostConvexDataA.at(shapeA), - hostConvexDataB.at(shapeB), - (float4&)trA.getOrigin(), (b3Quaternion&)trAorn, - (float4&)trB.getOrigin(), (b3Quaternion&)trBorn, - worldVertsB1,worldVertsB2,capacityWorldVerts, - minDist, maxDist, - verticesA, facesA,indicesA, - verticesB, facesB,indicesB, - - contactsOut,localContactCapacity); - - if (numContactsOut>0) + b3Quaternion trBorn = trB.getRotation(); + + int numContactsOut = clipHullAgainstHull(hostNormal, + hostConvexDataA.at(shapeA), + hostConvexDataB.at(shapeB), + (float4&)trA.getOrigin(), (b3Quaternion&)trAorn, + (float4&)trB.getOrigin(), (b3Quaternion&)trBorn, + worldVertsB1, worldVertsB2, capacityWorldVerts, + minDist, maxDist, + verticesA, facesA, indicesA, + verticesB, facesB, indicesB, + + contactsOut, localContactCapacity); 
+ + if (numContactsOut > 0) { B3_PROFILE("overlap"); float4 normalOnSurfaceB = (float4&)hostNormal; - + b3Int4 contactIdx; contactIdx.x = 0; contactIdx.y = 1; contactIdx.z = 2; contactIdx.w = 3; - + int numPoints = 0; - + { - // B3_PROFILE("extractManifold"); - numPoints = extractManifold(contactsOut, numContactsOut, normalOnSurfaceB, &contactIdx); + // B3_PROFILE("extractManifold"); + numPoints = extractManifold(contactsOut, numContactsOut, normalOnSurfaceB, &contactIdx); } - + b3Assert(numPoints); - - if (nContacts<maxContactCapacity) + + if (nContacts < maxContactCapacity) { contactIndex = nContacts; globalContactOut->expand(); b3Contact4& contact = globalContactOut->at(nContacts); - contact.m_batchIdx = 0;//i; - contact.m_bodyAPtrAndSignBit = (bodyBuf->at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA; - contact.m_bodyBPtrAndSignBit = (bodyBuf->at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB; + contact.m_batchIdx = 0; //i; + contact.m_bodyAPtrAndSignBit = (bodyBuf->at(bodyIndexA).m_invMass == 0) ? -bodyIndexA : bodyIndexA; + contact.m_bodyBPtrAndSignBit = (bodyBuf->at(bodyIndexB).m_invMass == 0) ? 
-bodyIndexB : bodyIndexB; contact.m_frictionCoeffCmp = 45874; contact.m_restituitionCoeffCmp = 0; - - // float distance = 0.f; - for (int p=0;p<numPoints;p++) + + // float distance = 0.f; + for (int p = 0; p < numPoints; p++) { - contact.m_worldPosB[p] = contactsOut[contactIdx.s[p]];//check if it is actually on B - contact.m_worldNormalOnB = normalOnSurfaceB; + contact.m_worldPosB[p] = contactsOut[contactIdx.s[p]]; //check if it is actually on B + contact.m_worldNormalOnB = normalOnSurfaceB; } //printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints); contact.m_worldNormalOnB.w = (b3Scalar)numPoints; nContacts++; - } else + } + else { - b3Error("Error: exceeding contact capacity (%d/%d)\n", nContacts,maxContactCapacity); + b3Error("Error: exceeding contact capacity (%d/%d)\n", nContacts, maxContactCapacity); } } } return contactIndex; } - - - - void computeContactPlaneConvex(int pairIndex, - int bodyIndexA, int bodyIndexB, - int collidableIndexA, int collidableIndexB, - const b3RigidBodyData* rigidBodies, - const b3Collidable* collidables, - const b3ConvexPolyhedronData* convexShapes, - const b3Vector3* convexVertices, - const int* convexIndices, - const b3GpuFace* faces, - b3Contact4* globalContactsOut, - int& nGlobalContactsOut, - int maxContactCapacity) + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3Vector3* convexVertices, + const int* convexIndices, + const b3GpuFace* faces, + b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity) { - - int shapeIndex = collidables[collidableIndexB].m_shapeIndex; + int shapeIndex = collidables[collidableIndexB].m_shapeIndex; const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndex]; - + b3Vector3 posB = rigidBodies[bodyIndexB].m_pos; 
b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat; b3Vector3 posA = rigidBodies[bodyIndexA].m_pos; b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat; -// int numContactsOut = 0; -// int numWorldVertsB1= 0; + // int numContactsOut = 0; + // int numWorldVertsB1= 0; b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; - b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z); - b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal); + b3Vector3 planeNormal = b3MakeVector3(planeEq.x, planeEq.y, planeEq.z); + b3Vector3 planeNormalWorld = b3QuatRotate(ornA, planeNormal); float planeConstant = planeEq.w; b3Transform convexWorldTransform; convexWorldTransform.setIdentity(); @@ -1387,13 +1307,13 @@ void computeContactPlaneConvex(int pairIndex, planeTransform.setRotation(ornA); b3Transform planeInConvex; - planeInConvex= convexWorldTransform.inverse() * planeTransform; + planeInConvex = convexWorldTransform.inverse() * planeTransform; b3Transform convexInPlane; convexInPlane = planeTransform.inverse() * convexWorldTransform; - - b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal; + + b3Vector3 planeNormalInConvex = planeInConvex.getBasis() * -planeNormal; float maxDot = -1e30; - int hitVertex=-1; + int hitVertex = -1; b3Vector3 hitVtx; #define MAX_PLANE_CONVEX_POINTS 64 @@ -1406,54 +1326,52 @@ void computeContactPlaneConvex(int pairIndex, contactIdx.s[1] = 1; contactIdx.s[2] = 2; contactIdx.s[3] = 3; - - for (int i=0;i<hullB->m_numVertices;i++) + + for (int i = 0; i < hullB->m_numVertices; i++) { - b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i]; + b3Vector3 vtx = convexVertices[hullB->m_vertexOffset + i]; float curDot = vtx.dot(planeNormalInConvex); - - if (curDot>maxDot) + if (curDot > maxDot) { - hitVertex=i; - maxDot=curDot; + hitVertex = i; + maxDot = curDot; hitVtx = vtx; //make sure the deepest points is always included - if (numPoints==MAX_PLANE_CONVEX_POINTS) + if (numPoints == 
MAX_PLANE_CONVEX_POINTS) numPoints--; } - if (numPoints<MAX_PLANE_CONVEX_POINTS) + if (numPoints < MAX_PLANE_CONVEX_POINTS) { - b3Vector3 vtxWorld = convexWorldTransform*vtx; - b3Vector3 vtxInPlane = planeTransform.inverse()*vtxWorld; - float dist = planeNormal.dot(vtxInPlane)-planeConstant; - if (dist<0.f) + b3Vector3 vtxWorld = convexWorldTransform * vtx; + b3Vector3 vtxInPlane = planeTransform.inverse() * vtxWorld; + float dist = planeNormal.dot(vtxInPlane) - planeConstant; + if (dist < 0.f) { vtxWorld.w = dist; contactPoints[numPoints] = vtxWorld; numPoints++; } } - } - int numReducedPoints = 0; + int numReducedPoints = 0; numReducedPoints = numPoints; - - if (numPoints>4) + + if (numPoints > 4) { - numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx); + numReducedPoints = extractManifoldSequentialGlobal(contactPoints, numPoints, planeNormalInConvex, &contactIdx); } int dstIdx; -// dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); - - if (numReducedPoints>0) + // dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); + + if (numReducedPoints > 0) { if (nGlobalContactsOut < maxContactCapacity) { - dstIdx=nGlobalContactsOut; + dstIdx = nGlobalContactsOut; nGlobalContactsOut++; b3Contact4* c = &globalContactsOut[dstIdx]; @@ -1462,38 +1380,33 @@ void computeContactPlaneConvex(int pairIndex, c->setRestituitionCoeff(0.f); c->m_batchIdx = pairIndex; - c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; - c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; - for (int i=0;i<numReducedPoints;i++) + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass == 0 ? -bodyIndexA : bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass == 0 ? 
-bodyIndexB : bodyIndexB; + for (int i = 0; i < numReducedPoints; i++) { b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]]; c->m_worldPosB[i] = pOnB1; } c->m_worldNormalOnB.w = (b3Scalar)numReducedPoints; - }//if (dstIdx < numPairs) - } - - + } //if (dstIdx < numPairs) + } -// printf("computeContactPlaneConvex\n"); + // printf("computeContactPlaneConvex\n"); } - - -B3_FORCE_INLINE b3Vector3 MyUnQuantize(const unsigned short* vecIn, const b3Vector3& quantization, const b3Vector3& bvhAabbMin) - { - b3Vector3 vecOut; - vecOut.setValue( - (b3Scalar)(vecIn[0]) / (quantization.x), - (b3Scalar)(vecIn[1]) / (quantization.y), - (b3Scalar)(vecIn[2]) / (quantization.z)); - vecOut += bvhAabbMin; - return vecOut; - } +B3_FORCE_INLINE b3Vector3 MyUnQuantize(const unsigned short* vecIn, const b3Vector3& quantization, const b3Vector3& bvhAabbMin) +{ + b3Vector3 vecOut; + vecOut.setValue( + (b3Scalar)(vecIn[0]) / (quantization.x), + (b3Scalar)(vecIn[1]) / (quantization.y), + (b3Scalar)(vecIn[2]) / (quantization.z)); + vecOut += bvhAabbMin; + return vecOut; +} void traverseTreeTree() { - } #include "Bullet3Common/shared/b3Mat3x3.h" @@ -1503,44 +1416,40 @@ int maxNumAabbChecks = 0; int maxDepth = 0; // work-in-progress -__kernel void findCompoundPairsKernel( +__kernel void findCompoundPairsKernel( int pairIndex, int bodyIndexA, int bodyIndexB, int collidableIndexA, int collidableIndexB, - __global const b3RigidBodyData* rigidBodies, + __global const b3RigidBodyData* rigidBodies, __global const b3Collidable* collidables, - __global const b3ConvexPolyhedronData* convexShapes, + __global const b3ConvexPolyhedronData* convexShapes, __global const b3AlignedObjectArray<b3Float4>& vertices, __global const b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace, __global const b3AlignedObjectArray<b3Aabb>& aabbsLocalSpace, __global const b3GpuChildShape* gpuChildShapes, __global b3Int4* gpuCompoundPairsOut, - __global int* numCompoundPairsOut, + __global int* numCompoundPairsOut, int 
maxNumCompoundPairsCapacity, - b3AlignedObjectArray<b3QuantizedBvhNode>& treeNodesCPU, - b3AlignedObjectArray<b3BvhSubtreeInfo>& subTreesCPU, - b3AlignedObjectArray<b3BvhInfo>& bvhInfoCPU - ) + b3AlignedObjectArray<b3QuantizedBvhNode>& treeNodesCPU, + b3AlignedObjectArray<b3BvhSubtreeInfo>& subTreesCPU, + b3AlignedObjectArray<b3BvhInfo>& bvhInfoCPU) { - numAabbChecks=0; - maxNumAabbChecks=0; -// int i = pairIndex; + numAabbChecks = 0; + maxNumAabbChecks = 0; + // int i = pairIndex; { - - int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; - //once the broadphase avoids static-static pairs, we can remove this test - if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0)) + if ((rigidBodies[bodyIndexA].m_invMass == 0) && (rigidBodies[bodyIndexB].m_invMass == 0)) { return; } - if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + if ((collidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) && (collidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)) { int bvhA = collidables[collidableIndexA].m_compoundBvhIndex; int bvhB = collidables[collidableIndexB].m_compoundBvhIndex; @@ -1548,9 +1457,8 @@ __kernel void findCompoundPairsKernel( int subTreesOffsetA = bvhInfoCPU[bvhA].m_subTreeOffset; int subTreesOffsetB = bvhInfoCPU[bvhB].m_subTreeOffset; - int numSubTreesB = bvhInfoCPU[bvhB].m_numSubTrees; - + float4 posA = rigidBodies[bodyIndexA].m_pos; b3Quat ornA = rigidBodies[bodyIndexA].m_quat; @@ -1567,41 +1475,37 @@ __kernel void findCompoundPairsKernel( transB.setOrigin(posB); transB.setRotation(ornB); - - - for (int p=0;p<numSubTreesA;p++) + for (int p = 0; p < numSubTreesA; p++) { - b3BvhSubtreeInfo subtreeA = subTreesCPU[subTreesOffsetA+p]; + b3BvhSubtreeInfo subtreeA = subTreesCPU[subTreesOffsetA + p]; 
//bvhInfoCPU[bvhA].m_quantization - b3Vector3 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); - b3Vector3 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); + b3Vector3 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin, bvhInfoCPU[bvhA].m_quantization, bvhInfoCPU[bvhA].m_aabbMin); + b3Vector3 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax, bvhInfoCPU[bvhA].m_quantization, bvhInfoCPU[bvhA].m_aabbMin); - b3Vector3 aabbAMinOut,aabbAMaxOut; - float margin=0.f; - b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut); + b3Vector3 aabbAMinOut, aabbAMaxOut; + float margin = 0.f; + b3TransformAabb2(treeAminLocal, treeAmaxLocal, margin, transA.getOrigin(), transA.getRotation(), &aabbAMinOut, &aabbAMaxOut); - for (int q=0;q<numSubTreesB;q++) + for (int q = 0; q < numSubTreesB; q++) { - b3BvhSubtreeInfo subtreeB = subTreesCPU[subTreesOffsetB+q]; + b3BvhSubtreeInfo subtreeB = subTreesCPU[subTreesOffsetB + q]; - b3Vector3 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); - b3Vector3 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); + b3Vector3 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin, bvhInfoCPU[bvhB].m_quantization, bvhInfoCPU[bvhB].m_aabbMin); + b3Vector3 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax, bvhInfoCPU[bvhB].m_quantization, bvhInfoCPU[bvhB].m_aabbMin); - b3Vector3 aabbBMinOut,aabbBMaxOut; - float margin=0.f; - b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut); + b3Vector3 aabbBMinOut, aabbBMaxOut; + float margin = 0.f; + b3TransformAabb2(treeBminLocal, treeBmaxLocal, margin, transB.getOrigin(), transB.getRotation(), 
&aabbBMinOut, &aabbBMaxOut); - - numAabbChecks=0; - bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + numAabbChecks = 0; + bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut, aabbAMaxOut, aabbBMinOut, aabbBMaxOut); if (aabbOverlap) { - - int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfoCPU[bvhA].m_nodeOffset; - // int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize; + int startNodeIndexA = subtreeA.m_rootNodeIndex + bvhInfoCPU[bvhA].m_nodeOffset; + // int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize; - int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfoCPU[bvhB].m_nodeOffset; - // int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize; + int startNodeIndexB = subtreeB.m_rootNodeIndex + bvhInfoCPU[bvhB].m_nodeOffset; + // int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize; b3AlignedObjectArray<b3Int2> nodeStack; b3Int2 node0; @@ -1610,33 +1514,33 @@ __kernel void findCompoundPairsKernel( int maxStackDepth = 1024; nodeStack.resize(maxStackDepth); - int depth=0; - nodeStack[depth++]=node0; + int depth = 0; + nodeStack[depth++] = node0; do { if (depth > maxDepth) { - maxDepth=depth; - printf("maxDepth=%d\n",maxDepth); + maxDepth = depth; + printf("maxDepth=%d\n", maxDepth); } b3Int2 node = nodeStack[--depth]; - - b3Vector3 aMinLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMin,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); - b3Vector3 aMaxLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMax,bvhInfoCPU[bvhA].m_quantization,bvhInfoCPU[bvhA].m_aabbMin); - b3Vector3 bMinLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMin,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); - b3Vector3 bMaxLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMax,bvhInfoCPU[bvhB].m_quantization,bvhInfoCPU[bvhB].m_aabbMin); + b3Vector3 aMinLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMin, bvhInfoCPU[bvhA].m_quantization, 
bvhInfoCPU[bvhA].m_aabbMin); + b3Vector3 aMaxLocal = MyUnQuantize(treeNodesCPU[node.x].m_quantizedAabbMax, bvhInfoCPU[bvhA].m_quantization, bvhInfoCPU[bvhA].m_aabbMin); - float margin=0.f; - b3Vector3 aabbAMinOut,aabbAMaxOut; - b3TransformAabb2(aMinLocal,aMaxLocal, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut); + b3Vector3 bMinLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMin, bvhInfoCPU[bvhB].m_quantization, bvhInfoCPU[bvhB].m_aabbMin); + b3Vector3 bMaxLocal = MyUnQuantize(treeNodesCPU[node.y].m_quantizedAabbMax, bvhInfoCPU[bvhB].m_quantization, bvhInfoCPU[bvhB].m_aabbMin); - b3Vector3 aabbBMinOut,aabbBMaxOut; - b3TransformAabb2(bMinLocal,bMaxLocal, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut); + float margin = 0.f; + b3Vector3 aabbAMinOut, aabbAMaxOut; + b3TransformAabb2(aMinLocal, aMaxLocal, margin, transA.getOrigin(), transA.getRotation(), &aabbAMinOut, &aabbAMaxOut); + + b3Vector3 aabbBMinOut, aabbBMaxOut; + b3TransformAabb2(bMinLocal, bMaxLocal, margin, transB.getOrigin(), transB.getRotation(), &aabbBMinOut, &aabbBMaxOut); numAabbChecks++; - bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut, aabbAMaxOut, aabbBMinOut, aabbBMaxOut); if (nodeOverlap) { bool isLeafA = treeNodesCPU[node.x].isLeafNode(); @@ -1645,23 +1549,23 @@ __kernel void findCompoundPairsKernel( bool isInternalB = !isLeafB; //fail, even though it might hit two leaf nodes - if (depth+4>maxStackDepth && !(isLeafA && isLeafB)) + if (depth + 4 > maxStackDepth && !(isLeafA && isLeafB)) { b3Error("Error: traversal exceeded maxStackDepth\n"); continue; } - if(isInternalA) + if (isInternalA) { - int nodeAleftChild = node.x+1; - bool isNodeALeftChildLeaf = treeNodesCPU[node.x+1].isLeafNode(); - int nodeArightChild = isNodeALeftChildLeaf? 
node.x+2 : node.x+1 + treeNodesCPU[node.x+1].getEscapeIndex(); + int nodeAleftChild = node.x + 1; + bool isNodeALeftChildLeaf = treeNodesCPU[node.x + 1].isLeafNode(); + int nodeArightChild = isNodeALeftChildLeaf ? node.x + 2 : node.x + 1 + treeNodesCPU[node.x + 1].getEscapeIndex(); - if(isInternalB) - { - int nodeBleftChild = node.y+1; - bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode(); - int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex(); + if (isInternalB) + { + int nodeBleftChild = node.y + 1; + bool isNodeBLeftChildLeaf = treeNodesCPU[node.y + 1].isLeafNode(); + int nodeBrightChild = isNodeBLeftChildLeaf ? node.y + 2 : node.y + 1 + treeNodesCPU[node.y + 1].getEscapeIndex(); nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild); nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild); @@ -1670,90 +1574,83 @@ __kernel void findCompoundPairsKernel( } else { - nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y); - nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y); + nodeStack[depth++] = b3MakeInt2(nodeAleftChild, node.y); + nodeStack[depth++] = b3MakeInt2(nodeArightChild, node.y); } } else { - if(isInternalB) + if (isInternalB) { - int nodeBleftChild = node.y+1; - bool isNodeBLeftChildLeaf = treeNodesCPU[node.y+1].isLeafNode(); - int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + treeNodesCPU[node.y+1].getEscapeIndex(); - nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild); - nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild); + int nodeBleftChild = node.y + 1; + bool isNodeBLeftChildLeaf = treeNodesCPU[node.y + 1].isLeafNode(); + int nodeBrightChild = isNodeBLeftChildLeaf ? 
node.y + 2 : node.y + 1 + treeNodesCPU[node.y + 1].getEscapeIndex(); + nodeStack[depth++] = b3MakeInt2(node.x, nodeBleftChild); + nodeStack[depth++] = b3MakeInt2(node.x, nodeBrightChild); } else { int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); - if (compoundPairIdx<maxNumCompoundPairsCapacity) + if (compoundPairIdx < maxNumCompoundPairsCapacity) { int childShapeIndexA = treeNodesCPU[node.x].getTriangleIndex(); int childShapeIndexB = treeNodesCPU[node.y].getTriangleIndex(); - gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB); + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA, bodyIndexB, childShapeIndexA, childShapeIndexB); } } } } } while (depth); - maxNumAabbChecks = b3Max(numAabbChecks,maxNumAabbChecks); + maxNumAabbChecks = b3Max(numAabbChecks, maxNumAabbChecks); } } } - + return; } - if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + if ((collidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) || (collidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)) { - - if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + if (collidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) { - int numChildrenA = collidables[collidableIndexA].m_numChildShapes; - for (int c=0;c<numChildrenA;c++) + for (int c = 0; c < numChildrenA; c++) { - int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c; + int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex + c; int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; float4 posA = rigidBodies[bodyIndexA].m_pos; b3Quat ornA = rigidBodies[bodyIndexA].m_quat; float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; - float4 newPosA = 
b3QuatRotate(ornA,childPosA)+posA; - b3Quat newOrnA = b3QuatMul(ornA,childOrnA); - + float4 newPosA = b3QuatRotate(ornA, childPosA) + posA; + b3Quat newOrnA = b3QuatMul(ornA, childOrnA); - b3Aabb aabbA = aabbsLocalSpace[childColIndexA]; - b3Transform transA; transA.setIdentity(); transA.setOrigin(newPosA); transA.setRotation(newOrnA); - b3Scalar margin=0.0f; + b3Scalar margin = 0.0f; - b3Vector3 aabbAMinOut,aabbAMaxOut; + b3Vector3 aabbAMinOut, aabbAMaxOut; - b3TransformAabb2((const b3Float4&)aabbA.m_min,(const b3Float4&)aabbA.m_max, margin,transA.getOrigin(),transA.getRotation(),&aabbAMinOut,&aabbAMaxOut); + b3TransformAabb2((const b3Float4&)aabbA.m_min, (const b3Float4&)aabbA.m_max, margin, transA.getOrigin(), transA.getRotation(), &aabbAMinOut, &aabbAMaxOut); - if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + if (collidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) { int numChildrenB = collidables[collidableIndexB].m_numChildShapes; - for (int b=0;b<numChildrenB;b++) + for (int b = 0; b < numChildrenB; b++) { - int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b; + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex + b; int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; b3Quat ornB = rigidBodies[bodyIndexB].m_quat; float4 posB = rigidBodies[bodyIndexB].m_pos; float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; - float4 newPosB = transform(&childPosB,&posB,&ornB); - b3Quat newOrnB = b3QuatMul(ornB,childOrnB); - - + float4 newPosB = transform(&childPosB, &posB, &ornB); + b3Quat newOrnB = b3QuatMul(ornB, childOrnB); b3Aabb aabbB = aabbsLocalSpace[childColIndexB]; @@ -1762,11 +1659,11 @@ __kernel void findCompoundPairsKernel( transB.setOrigin(newPosB); transB.setRotation(newOrnB); - b3Vector3 aabbBMinOut,aabbBMaxOut; - b3TransformAabb2((const b3Float4&)aabbB.m_min,(const 
b3Float4&)aabbB.m_max, margin,transB.getOrigin(),transB.getRotation(),&aabbBMinOut,&aabbBMaxOut); + b3Vector3 aabbBMinOut, aabbBMaxOut; + b3TransformAabb2((const b3Float4&)aabbB.m_min, (const b3Float4&)aabbB.m_max, margin, transB.getOrigin(), transB.getRotation(), &aabbBMinOut, &aabbBMaxOut); numAabbChecks++; - bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut); + bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut, aabbAMaxOut, aabbBMinOut, aabbBMaxOut); if (aabbOverlap) { /* @@ -1784,22 +1681,22 @@ __kernel void findCompoundPairsKernel( float4 c1 = transform(&c1local,&posB,&ornB); const float4 DeltaC2 = c0 - c1; */ - {// + { // int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); - if (compoundPairIdx<maxNumCompoundPairsCapacity) + if (compoundPairIdx < maxNumCompoundPairsCapacity) { - gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB); + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA, bodyIndexB, childShapeIndexA, childShapeIndexB); } - }// - }//fi(1) - } //for (int b=0 - }//if (collidables[collidableIndexB]. - else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + } // + } //fi(1) + } //for (int b=0 + } //if (collidables[collidableIndexB]. 
+ else //if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) { if (1) { - // int numFacesA = convexShapes[shapeIndexA].m_numFaces; - // float dmin = FLT_MAX; + // int numFacesA = convexShapes[shapeIndexA].m_numFaces; + // float dmin = FLT_MAX; float4 posA = newPosA; posA.w = 0.f; float4 posB = rigidBodies[bodyIndexB].m_pos; @@ -1811,45 +1708,43 @@ __kernel void findCompoundPairsKernel( float4 c1local = convexShapes[shapeIndexB].m_localCenter; b3Quat ornB = rigidBodies[bodyIndexB].m_quat; float4 c1; - c1 = transform(&c1local,&posB,&ornB); - // const float4 DeltaC2 = c0 - c1; + c1 = transform(&c1local, &posB, &ornB); + // const float4 DeltaC2 = c0 - c1; { int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); - if (compoundPairIdx<maxNumCompoundPairsCapacity) + if (compoundPairIdx < maxNumCompoundPairsCapacity) { - gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,childShapeIndexA,-1); - }//if (compoundPairIdx<maxNumCompoundPairsCapacity) - }// - }//fi (1) - }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) - }//for (int b=0;b<numChildrenB;b++) + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA, bodyIndexB, childShapeIndexA, -1); + } //if (compoundPairIdx<maxNumCompoundPairsCapacity) + } // + } //fi (1) + } //if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + } //for (int b=0;b<numChildrenB;b++) return; - }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) - if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) - && (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + } //if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + if ((collidables[collidableIndexA].m_shapeType != SHAPE_CONCAVE_TRIMESH) && (collidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)) { int numChildrenB = collidables[collidableIndexB].m_numChildShapes; - 
for (int b=0;b<numChildrenB;b++) + for (int b = 0; b < numChildrenB; b++) { - int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b; + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex + b; int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; b3Quat ornB = rigidBodies[bodyIndexB].m_quat; float4 posB = rigidBodies[bodyIndexB].m_pos; float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; - float4 newPosB = b3QuatRotate(ornB,childPosB)+posB; - b3Quat newOrnB = b3QuatMul(ornB,childOrnB); + float4 newPosB = b3QuatRotate(ornB, childPosB) + posB; + b3Quat newOrnB = b3QuatMul(ornB, childOrnB); int shapeIndexB = collidables[childColIndexB].m_shapeIndex; - ////////////////////////////////////// if (1) { - // int numFacesA = convexShapes[shapeIndexA].m_numFaces; - // float dmin = FLT_MAX; + // int numFacesA = convexShapes[shapeIndexA].m_numFaces; + // float dmin = FLT_MAX; float4 posA = rigidBodies[bodyIndexA].m_pos; posA.w = 0.f; float4 posB = newPosB; @@ -1859,99 +1754,96 @@ __kernel void findCompoundPairsKernel( float4 c0; c0 = transform(&c0local, &posA, &ornA); float4 c1local = convexShapes[shapeIndexB].m_localCenter; - b3Quat ornB =newOrnB; + b3Quat ornB = newOrnB; float4 c1; - c1 = transform(&c1local,&posB,&ornB); - // const float4 DeltaC2 = c0 - c1; - {// + c1 = transform(&c1local, &posB, &ornB); + // const float4 DeltaC2 = c0 - c1; + { // int compoundPairIdx = b3AtomicInc(numCompoundPairsOut); - if (compoundPairIdx<maxNumCompoundPairsCapacity) + if (compoundPairIdx < maxNumCompoundPairsCapacity) { - gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA,bodyIndexB,-1,childShapeIndexB); - }//fi (compoundPairIdx<maxNumCompoundPairsCapacity) - }// - }//fi (1) - }//for (int b=0;b<numChildrenB;b++) + gpuCompoundPairsOut[compoundPairIdx] = b3MakeInt4(bodyIndexA, bodyIndexB, -1, childShapeIndexB); + } //fi 
(compoundPairIdx<maxNumCompoundPairsCapacity) + } // + } //fi (1) + } //for (int b=0;b<numChildrenB;b++) return; - }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) + } //if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) return; - }//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) - }//i<numPairs + } //fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)) + } //i<numPairs } - - -__kernel void processCompoundPairsKernel( __global const b3Int4* gpuCompoundPairs, - __global const b3RigidBodyData* rigidBodies, - __global const b3Collidable* collidables, - __global const b3ConvexPolyhedronData* convexShapes, - __global const b3AlignedObjectArray<b3Float4>& vertices, - __global const b3AlignedObjectArray<b3Float4>& uniqueEdges, - __global const b3AlignedObjectArray<b3GpuFace>& faces, - __global const b3AlignedObjectArray<int>& indices, - __global b3Aabb* aabbs, - __global const b3GpuChildShape* gpuChildShapes, - __global b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, - __global b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, - int numCompoundPairs, - int i - ) +__kernel void processCompoundPairsKernel(__global const b3Int4* gpuCompoundPairs, + __global const b3RigidBodyData* rigidBodies, + __global const b3Collidable* collidables, + __global const b3ConvexPolyhedronData* convexShapes, + __global const b3AlignedObjectArray<b3Float4>& vertices, + __global const b3AlignedObjectArray<b3Float4>& uniqueEdges, + __global const b3AlignedObjectArray<b3GpuFace>& faces, + __global const b3AlignedObjectArray<int>& indices, + __global b3Aabb* aabbs, + __global const b3GpuChildShape* gpuChildShapes, + __global b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, + __global 
b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, + int numCompoundPairs, + int i) { - -// int i = get_global_id(0); - if (i<numCompoundPairs) + // int i = get_global_id(0); + if (i < numCompoundPairs) { int bodyIndexA = gpuCompoundPairs[i].x; int bodyIndexB = gpuCompoundPairs[i].y; int childShapeIndexA = gpuCompoundPairs[i].z; int childShapeIndexB = gpuCompoundPairs[i].w; - + int collidableIndexA = -1; int collidableIndexB = -1; - + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; float4 posA = rigidBodies[bodyIndexA].m_pos; - + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; float4 posB = rigidBodies[bodyIndexB].m_pos; - + if (childShapeIndexA >= 0) { collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; - b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; - float4 newPosA = b3QuatRotate(ornA,childPosA)+posA; - b3Quat newOrnA = b3QuatMul(ornA,childOrnA); + b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; + float4 newPosA = b3QuatRotate(ornA, childPosA) + posA; + b3Quat newOrnA = b3QuatMul(ornA, childOrnA); posA = newPosA; ornA = newOrnA; - } else + } + else { collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; } - - if (childShapeIndexB>=0) + + if (childShapeIndexB >= 0) { collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; - float4 newPosB = b3QuatRotate(ornB,childPosB)+posB; - b3Quat newOrnB = b3QuatMul(ornB,childOrnB); + float4 newPosB = b3QuatRotate(ornB, childPosB) + posB; + b3Quat newOrnB = b3QuatMul(ornB, childOrnB); posB = newPosB; ornB = newOrnB; - } else + } + else { - collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; } - + gpuHasCompoundSepNormalsOut[i] = 0; - + int shapeIndexA = 
collidables[collidableIndexA].m_shapeIndex; int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; - + int shapeTypeA = collidables[collidableIndexA].m_shapeType; int shapeTypeB = collidables[collidableIndexB].m_shapeType; - if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL)) { @@ -1959,145 +1851,142 @@ __kernel void processCompoundPairsKernel( __global const b3Int4* gpuCompoundPa } int hasSeparatingAxis = 5; - - // int numFacesA = convexShapes[shapeIndexA].m_numFaces; + + // int numFacesA = convexShapes[shapeIndexA].m_numFaces; float dmin = FLT_MAX; posA.w = 0.f; posB.w = 0.f; float4 c0local = convexShapes[shapeIndexA].m_localCenter; float4 c0 = transform(&c0local, &posA, &ornA); float4 c1local = convexShapes[shapeIndexB].m_localCenter; - float4 c1 = transform(&c1local,&posB,&ornB); + float4 c1 = transform(&c1local, &posB, &ornB); const float4 DeltaC2 = c0 - c1; - float4 sepNormal = make_float4(1,0,0,0); -// bool sepA = findSeparatingAxis( convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); - bool sepA = findSeparatingAxis( convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal);//,&dmin); - + float4 sepNormal = make_float4(1, 0, 0, 0); + // bool sepA = findSeparatingAxis( convexShapes[shapeIndexA], convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); + bool sepA = findSeparatingAxis(convexShapes[shapeIndexA], convexShapes[shapeIndexB], posA, ornA, posB, ornB, vertices, uniqueEdges, faces, indices, vertices, uniqueEdges, faces, indices, sepNormal); //,&dmin); + hasSeparatingAxis = 4; if (!sepA) { hasSeparatingAxis = 0; - } else + } + else { - bool sepB = findSeparatingAxis( 
convexShapes[shapeIndexB],convexShapes[shapeIndexA],posB,ornB,posA,ornA,vertices,uniqueEdges,faces,indices,vertices,uniqueEdges,faces,indices,sepNormal);//,&dmin); + bool sepB = findSeparatingAxis(convexShapes[shapeIndexB], convexShapes[shapeIndexA], posB, ornB, posA, ornA, vertices, uniqueEdges, faces, indices, vertices, uniqueEdges, faces, indices, sepNormal); //,&dmin); if (!sepB) { hasSeparatingAxis = 0; - } else//(!sepB) + } + else //(!sepB) { - bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin); + bool sepEE = findSeparatingAxisEdgeEdge(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB], posA, ornA, posB, ornB, DeltaC2, vertices, uniqueEdges, faces, indices, &sepNormal, &dmin); if (sepEE) { - gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal); - gpuHasCompoundSepNormalsOut[i] = 1; - }//sepEE - }//(!sepB) - }//(!sepA) - - + gpuCompoundSepNormalsOut[i] = sepNormal; //fastNormalize4(sepNormal); + gpuHasCompoundSepNormalsOut[i] = 1; + } //sepEE + } //(!sepB) + } //(!sepA) } - } - -__kernel void clipCompoundsHullHullKernel( __global const b3Int4* gpuCompoundPairs, - __global const b3RigidBodyData* rigidBodies, - __global const b3Collidable* collidables, - __global const b3ConvexPolyhedronData* convexShapes, - __global const b3AlignedObjectArray<b3Float4>& vertices, - __global const b3AlignedObjectArray<b3Float4>& uniqueEdges, - __global const b3AlignedObjectArray<b3GpuFace>& faces, - __global const b3AlignedObjectArray<int>& indices, - __global const b3GpuChildShape* gpuChildShapes, - __global const b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, - __global const b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, - __global struct b3Contact4Data* globalContactsOut, - int* nGlobalContactsOut, - int numCompoundPairs, int maxContactCapacity, int i) +__kernel void clipCompoundsHullHullKernel(__global const 
b3Int4* gpuCompoundPairs, + __global const b3RigidBodyData* rigidBodies, + __global const b3Collidable* collidables, + __global const b3ConvexPolyhedronData* convexShapes, + __global const b3AlignedObjectArray<b3Float4>& vertices, + __global const b3AlignedObjectArray<b3Float4>& uniqueEdges, + __global const b3AlignedObjectArray<b3GpuFace>& faces, + __global const b3AlignedObjectArray<int>& indices, + __global const b3GpuChildShape* gpuChildShapes, + __global const b3AlignedObjectArray<b3Float4>& gpuCompoundSepNormalsOut, + __global const b3AlignedObjectArray<int>& gpuHasCompoundSepNormalsOut, + __global struct b3Contact4Data* globalContactsOut, + int* nGlobalContactsOut, + int numCompoundPairs, int maxContactCapacity, int i) { - -// int i = get_global_id(0); + // int i = get_global_id(0); int pairIndex = i; - + float4 worldVertsB1[64]; float4 worldVertsB2[64]; - int capacityWorldVerts = 64; + int capacityWorldVerts = 64; float4 localContactsOut[64]; - int localContactCapacity=64; - + int localContactCapacity = 64; + float minDist = -1e30f; float maxDist = 0.0f; - if (i<numCompoundPairs) + if (i < numCompoundPairs) { - if (gpuHasCompoundSepNormalsOut[i]) { - int bodyIndexA = gpuCompoundPairs[i].x; int bodyIndexB = gpuCompoundPairs[i].y; - + int childShapeIndexA = gpuCompoundPairs[i].z; int childShapeIndexB = gpuCompoundPairs[i].w; - + int collidableIndexA = -1; int collidableIndexB = -1; - + b3Quat ornA = rigidBodies[bodyIndexA].m_quat; float4 posA = rigidBodies[bodyIndexA].m_pos; - + b3Quat ornB = rigidBodies[bodyIndexB].m_quat; float4 posB = rigidBodies[bodyIndexB].m_pos; - + if (childShapeIndexA >= 0) { collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition; b3Quat childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation; - float4 newPosA = b3QuatRotate(ornA,childPosA)+posA; - b3Quat newOrnA = b3QuatMul(ornA,childOrnA); + float4 newPosA = b3QuatRotate(ornA, childPosA) + posA; + 
b3Quat newOrnA = b3QuatMul(ornA, childOrnA); posA = newPosA; ornA = newOrnA; - } else + } + else { collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx; } - - if (childShapeIndexB>=0) + + if (childShapeIndexB >= 0) { collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex; float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition; - b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; - float4 newPosB = b3QuatRotate(ornB,childPosB)+posB; - b3Quat newOrnB = b3QuatMul(ornB,childOrnB); + b3Quat childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation; + float4 newPosB = b3QuatRotate(ornB, childPosB) + posB; + b3Quat newOrnB = b3QuatMul(ornB, childOrnB); posB = newPosB; ornB = newOrnB; - } else + } + else { - collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; + collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; } - + int shapeIndexA = collidables[collidableIndexA].m_shapeIndex; int shapeIndexB = collidables[collidableIndexB].m_shapeIndex; - + int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i], - convexShapes[shapeIndexA], convexShapes[shapeIndexB], - posA,ornA, - posB,ornB, - worldVertsB1,worldVertsB2,capacityWorldVerts, - minDist, maxDist, - vertices,faces,indices, - vertices,faces,indices, - localContactsOut,localContactCapacity); - - if (numLocalContactsOut>0) - { + convexShapes[shapeIndexA], convexShapes[shapeIndexB], + posA, ornA, + posB, ornB, + worldVertsB1, worldVertsB2, capacityWorldVerts, + minDist, maxDist, + vertices, faces, indices, + vertices, faces, indices, + localContactsOut, localContactCapacity); + + if (numLocalContactsOut > 0) + { float4 normal = -gpuCompoundSepNormalsOut[i]; int nPoints = numLocalContactsOut; float4* pointsIn = localContactsOut; - b3Int4 contactIdx;// = {-1,-1,-1,-1}; + b3Int4 contactIdx; // = {-1,-1,-1,-1}; contactIdx.s[0] = 0; contactIdx.s[1] = 1; @@ -2105,111 +1994,106 @@ __kernel void clipCompoundsHullHullKernel( __global const b3Int4* 
gpuCompoundP contactIdx.s[3] = 3; int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx); - + int dstIdx; - dstIdx = b3AtomicInc( nGlobalContactsOut); - if ((dstIdx+nReducedContacts) < maxContactCapacity) + dstIdx = b3AtomicInc(nGlobalContactsOut); + if ((dstIdx + nReducedContacts) < maxContactCapacity) { - __global struct b3Contact4Data* c = globalContactsOut+ dstIdx; + __global struct b3Contact4Data* c = globalContactsOut + dstIdx; c->m_worldNormalOnB = -normal; - c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff); + c->m_restituitionCoeffCmp = (0.f * 0xffff); + c->m_frictionCoeffCmp = (0.7f * 0xffff); c->m_batchIdx = pairIndex; int bodyA = gpuCompoundPairs[pairIndex].x; int bodyB = gpuCompoundPairs[pairIndex].y; - c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; - c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass == 0 ? -bodyA : bodyA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass == 0 ? 
-bodyB : bodyB; c->m_childIndexA = childShapeIndexA; c->m_childIndexB = childShapeIndexB; - for (int i=0;i<nReducedContacts;i++) + for (int i = 0; i < nReducedContacts; i++) { c->m_worldPosB[i] = pointsIn[contactIdx.s[i]]; } - b3Contact4Data_setNumPoints(c,nReducedContacts); + b3Contact4Data_setNumPoints(c, nReducedContacts); } - - }// if (numContactsOut>0) - }// if (gpuHasCompoundSepNormalsOut[i]) - }// if (i<numCompoundPairs) + } // if (numContactsOut>0) + } // if (gpuHasCompoundSepNormalsOut[i]) + } // if (i<numCompoundPairs) } - void computeContactCompoundCompound(int pairIndex, - int bodyIndexA, int bodyIndexB, - int collidableIndexA, int collidableIndexB, - const b3RigidBodyData* rigidBodies, - const b3Collidable* collidables, - const b3ConvexPolyhedronData* convexShapes, - const b3GpuChildShape* cpuChildShapes, - const b3AlignedObjectArray<b3Aabb>& hostAabbsWorldSpace, - const b3AlignedObjectArray<b3Aabb>& hostAabbsLocalSpace, - - const b3AlignedObjectArray<b3Vector3>& convexVertices, - const b3AlignedObjectArray<b3Vector3>& hostUniqueEdges, - const b3AlignedObjectArray<int>& convexIndices, - const b3AlignedObjectArray<b3GpuFace>& faces, - - b3Contact4* globalContactsOut, - int& nGlobalContactsOut, - int maxContactCapacity, - b3AlignedObjectArray<b3QuantizedBvhNode>& treeNodesCPU, - b3AlignedObjectArray<b3BvhSubtreeInfo>& subTreesCPU, - b3AlignedObjectArray<b3BvhInfo>& bvhInfoCPU - ) + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3GpuChildShape* cpuChildShapes, + const b3AlignedObjectArray<b3Aabb>& hostAabbsWorldSpace, + const b3AlignedObjectArray<b3Aabb>& hostAabbsLocalSpace, + + const b3AlignedObjectArray<b3Vector3>& convexVertices, + const b3AlignedObjectArray<b3Vector3>& hostUniqueEdges, + const b3AlignedObjectArray<int>& convexIndices, + const b3AlignedObjectArray<b3GpuFace>& faces, + + 
b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity, + b3AlignedObjectArray<b3QuantizedBvhNode>& treeNodesCPU, + b3AlignedObjectArray<b3BvhSubtreeInfo>& subTreesCPU, + b3AlignedObjectArray<b3BvhInfo>& bvhInfoCPU) { - int shapeTypeB = collidables[collidableIndexB].m_shapeType; b3Assert(shapeTypeB == SHAPE_COMPOUND_OF_CONVEX_HULLS); b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut; - int numCompoundPairsOut=0; - int maxNumCompoundPairsCapacity = 8192;//1024; + int numCompoundPairsOut = 0; + int maxNumCompoundPairsCapacity = 8192; //1024; cpuCompoundPairsOut.resize(maxNumCompoundPairsCapacity); // work-in-progress - findCompoundPairsKernel( - pairIndex, - bodyIndexA,bodyIndexB, - collidableIndexA,collidableIndexB, - rigidBodies, - collidables, - convexShapes, - convexVertices, - hostAabbsWorldSpace, - hostAabbsLocalSpace, - cpuChildShapes, - &cpuCompoundPairsOut[0], - &numCompoundPairsOut, - maxNumCompoundPairsCapacity , - treeNodesCPU, - subTreesCPU, - bvhInfoCPU - ); - - printf("maxNumAabbChecks=%d\n",maxNumAabbChecks); - if (numCompoundPairsOut>maxNumCompoundPairsCapacity) + findCompoundPairsKernel( + pairIndex, + bodyIndexA, bodyIndexB, + collidableIndexA, collidableIndexB, + rigidBodies, + collidables, + convexShapes, + convexVertices, + hostAabbsWorldSpace, + hostAabbsLocalSpace, + cpuChildShapes, + &cpuCompoundPairsOut[0], + &numCompoundPairsOut, + maxNumCompoundPairsCapacity, + treeNodesCPU, + subTreesCPU, + bvhInfoCPU); + + printf("maxNumAabbChecks=%d\n", maxNumAabbChecks); + if (numCompoundPairsOut > maxNumCompoundPairsCapacity) { - b3Error("numCompoundPairsOut exceeded maxNumCompoundPairsCapacity (%d)\n",maxNumCompoundPairsCapacity); - numCompoundPairsOut=maxNumCompoundPairsCapacity; + b3Error("numCompoundPairsOut exceeded maxNumCompoundPairsCapacity (%d)\n", maxNumCompoundPairsCapacity); + numCompoundPairsOut = maxNumCompoundPairsCapacity; } b3AlignedObjectArray<b3Float4> cpuCompoundSepNormalsOut; b3AlignedObjectArray<int> 
cpuHasCompoundSepNormalsOut; cpuCompoundSepNormalsOut.resize(numCompoundPairsOut); cpuHasCompoundSepNormalsOut.resize(numCompoundPairsOut); - for (int i=0;i<numCompoundPairsOut;i++) + for (int i = 0; i < numCompoundPairsOut; i++) { - - processCompoundPairsKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,0,cpuChildShapes, - cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,numCompoundPairsOut,i); + processCompoundPairsKernel(&cpuCompoundPairsOut[0], rigidBodies, collidables, convexShapes, convexVertices, hostUniqueEdges, faces, convexIndices, 0, cpuChildShapes, + cpuCompoundSepNormalsOut, cpuHasCompoundSepNormalsOut, numCompoundPairsOut, i); } - for (int i=0;i<numCompoundPairsOut;i++) + for (int i = 0; i < numCompoundPairsOut; i++) { - clipCompoundsHullHullKernel(&cpuCompoundPairsOut[0],rigidBodies,collidables,convexShapes,convexVertices,hostUniqueEdges,faces,convexIndices,cpuChildShapes, - cpuCompoundSepNormalsOut,cpuHasCompoundSepNormalsOut,globalContactsOut,&nGlobalContactsOut,numCompoundPairsOut,maxContactCapacity,i); + clipCompoundsHullHullKernel(&cpuCompoundPairsOut[0], rigidBodies, collidables, convexShapes, convexVertices, hostUniqueEdges, faces, convexIndices, cpuChildShapes, + cpuCompoundSepNormalsOut, cpuHasCompoundSepNormalsOut, globalContactsOut, &nGlobalContactsOut, numCompoundPairsOut, maxContactCapacity, i); } - /* + /* int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex; float4 posA = rigidBodies[bodyIndexA].m_pos; @@ -2235,7 +2119,6 @@ void computeContactCompoundCompound(int pairIndex, ); */ - /* if (foundSepAxis) { @@ -2271,8 +2154,8 @@ void computeContactCompoundCompound(int pairIndex, } */ -// return contactIndex; - + // return contactIndex; + /* int numChildrenB = collidables[collidableIndexB].m_numChildShapes; @@ -2294,56 +2177,52 @@ void computeContactCompoundCompound(int pairIndex, } */ - } void computeContactPlaneCompound(int pairIndex, - int bodyIndexA, 
int bodyIndexB, - int collidableIndexA, int collidableIndexB, - const b3RigidBodyData* rigidBodies, - const b3Collidable* collidables, - const b3ConvexPolyhedronData* convexShapes, - const b3GpuChildShape* cpuChildShapes, - const b3Vector3* convexVertices, - const int* convexIndices, - const b3GpuFace* faces, - - b3Contact4* globalContactsOut, - int& nGlobalContactsOut, - int maxContactCapacity) + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3GpuChildShape* cpuChildShapes, + const b3Vector3* convexVertices, + const int* convexIndices, + const b3GpuFace* faces, + + b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity) { - int shapeTypeB = collidables[collidableIndexB].m_shapeType; b3Assert(shapeTypeB == SHAPE_COMPOUND_OF_CONVEX_HULLS); - int numChildrenB = collidables[collidableIndexB].m_numChildShapes; - for (int c=0;c<numChildrenB;c++) + for (int c = 0; c < numChildrenB; c++) { - int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+c; + int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex + c; int childColIndexB = cpuChildShapes[childShapeIndexB].m_shapeIndex; float4 rootPosB = rigidBodies[bodyIndexB].m_pos; b3Quaternion rootOrnB = rigidBodies[bodyIndexB].m_quat; b3Vector3 childPosB = cpuChildShapes[childShapeIndexB].m_childPosition; b3Quaternion childOrnB = cpuChildShapes[childShapeIndexB].m_childOrientation; - float4 posB = b3QuatRotate(rootOrnB,childPosB)+rootPosB; - b3Quaternion ornB = rootOrnB*childOrnB;//b3QuatMul(ornB,childOrnB); + float4 posB = b3QuatRotate(rootOrnB, childPosB) + rootPosB; + b3Quaternion ornB = rootOrnB * childOrnB; //b3QuatMul(ornB,childOrnB); int shapeIndexB = collidables[childColIndexB].m_shapeIndex; const b3ConvexPolyhedronData* hullB = &convexShapes[shapeIndexB]; - - + b3Vector3 posA = rigidBodies[bodyIndexA].m_pos; 
b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat; - // int numContactsOut = 0; - // int numWorldVertsB1= 0; + // int numContactsOut = 0; + // int numWorldVertsB1= 0; b3Vector3 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane; - b3Vector3 planeNormal=b3MakeVector3(planeEq.x,planeEq.y,planeEq.z); - b3Vector3 planeNormalWorld = b3QuatRotate(ornA,planeNormal); + b3Vector3 planeNormal = b3MakeVector3(planeEq.x, planeEq.y, planeEq.z); + b3Vector3 planeNormalWorld = b3QuatRotate(ornA, planeNormal); float planeConstant = planeEq.w; b3Transform convexWorldTransform; convexWorldTransform.setIdentity(); @@ -2355,16 +2234,16 @@ void computeContactPlaneCompound(int pairIndex, planeTransform.setRotation(ornA); b3Transform planeInConvex; - planeInConvex= convexWorldTransform.inverse() * planeTransform; + planeInConvex = convexWorldTransform.inverse() * planeTransform; b3Transform convexInPlane; convexInPlane = planeTransform.inverse() * convexWorldTransform; - - b3Vector3 planeNormalInConvex = planeInConvex.getBasis()*-planeNormal; + + b3Vector3 planeNormalInConvex = planeInConvex.getBasis() * -planeNormal; float maxDot = -1e30; - int hitVertex=-1; + int hitVertex = -1; b3Vector3 hitVtx; - #define MAX_PLANE_CONVEX_POINTS 64 +#define MAX_PLANE_CONVEX_POINTS 64 b3Vector3 contactPoints[MAX_PLANE_CONVEX_POINTS]; int numPoints = 0; @@ -2374,54 +2253,52 @@ void computeContactPlaneCompound(int pairIndex, contactIdx.s[1] = 1; contactIdx.s[2] = 2; contactIdx.s[3] = 3; - - for (int i=0;i<hullB->m_numVertices;i++) + + for (int i = 0; i < hullB->m_numVertices; i++) { - b3Vector3 vtx = convexVertices[hullB->m_vertexOffset+i]; + b3Vector3 vtx = convexVertices[hullB->m_vertexOffset + i]; float curDot = vtx.dot(planeNormalInConvex); - - if (curDot>maxDot) + if (curDot > maxDot) { - hitVertex=i; - maxDot=curDot; + hitVertex = i; + maxDot = curDot; hitVtx = vtx; //make sure the deepest points is always included - if (numPoints==MAX_PLANE_CONVEX_POINTS) + if (numPoints == 
MAX_PLANE_CONVEX_POINTS) numPoints--; } - if (numPoints<MAX_PLANE_CONVEX_POINTS) + if (numPoints < MAX_PLANE_CONVEX_POINTS) { - b3Vector3 vtxWorld = convexWorldTransform*vtx; - b3Vector3 vtxInPlane = planeTransform.inverse()*vtxWorld; - float dist = planeNormal.dot(vtxInPlane)-planeConstant; - if (dist<0.f) + b3Vector3 vtxWorld = convexWorldTransform * vtx; + b3Vector3 vtxInPlane = planeTransform.inverse() * vtxWorld; + float dist = planeNormal.dot(vtxInPlane) - planeConstant; + if (dist < 0.f) { vtxWorld.w = dist; contactPoints[numPoints] = vtxWorld; numPoints++; } } - } - int numReducedPoints = 0; + int numReducedPoints = 0; numReducedPoints = numPoints; - - if (numPoints>4) + + if (numPoints > 4) { - numReducedPoints = extractManifoldSequentialGlobal( contactPoints, numPoints, planeNormalInConvex, &contactIdx); + numReducedPoints = extractManifoldSequentialGlobal(contactPoints, numPoints, planeNormalInConvex, &contactIdx); } int dstIdx; - // dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); - - if (numReducedPoints>0) + // dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); + + if (numReducedPoints > 0) { if (nGlobalContactsOut < maxContactCapacity) { - dstIdx=nGlobalContactsOut; + dstIdx = nGlobalContactsOut; nGlobalContactsOut++; b3Contact4* c = &globalContactsOut[dstIdx]; @@ -2430,48 +2307,37 @@ void computeContactPlaneCompound(int pairIndex, c->setRestituitionCoeff(0.f); c->m_batchIdx = pairIndex; - c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; - c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; - for (int i=0;i<numReducedPoints;i++) + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass == 0 ? -bodyIndexA : bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass == 0 ? 
-bodyIndexB : bodyIndexB; + for (int i = 0; i < numReducedPoints; i++) { b3Vector3 pOnB1 = contactPoints[contactIdx.s[i]]; c->m_worldPosB[i] = pOnB1; } c->m_worldNormalOnB.w = (b3Scalar)numReducedPoints; - }//if (dstIdx < numPairs) - } - + } //if (dstIdx < numPairs) + } } - - } - - - - -void computeContactSphereConvex(int pairIndex, - int bodyIndexA, int bodyIndexB, - int collidableIndexA, int collidableIndexB, - const b3RigidBodyData* rigidBodies, - const b3Collidable* collidables, - const b3ConvexPolyhedronData* convexShapes, - const b3Vector3* convexVertices, - const int* convexIndices, - const b3GpuFace* faces, - b3Contact4* globalContactsOut, - int& nGlobalContactsOut, - int maxContactCapacity) +void computeContactSphereConvex(int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3RigidBodyData* rigidBodies, + const b3Collidable* collidables, + const b3ConvexPolyhedronData* convexShapes, + const b3Vector3* convexVertices, + const int* convexIndices, + const b3GpuFace* faces, + b3Contact4* globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity) { - float radius = collidables[collidableIndexA].m_radius; float4 spherePos1 = rigidBodies[bodyIndexA].m_pos; b3Quaternion sphereOrn = rigidBodies[bodyIndexA].m_quat; - - float4 pos = rigidBodies[bodyIndexB].m_pos; - b3Quaternion quat = rigidBodies[bodyIndexB].m_quat; @@ -2487,64 +2353,65 @@ void computeContactSphereConvex(int pairIndex, int shapeIndex = collidables[collidableIndex].m_shapeIndex; int numFaces = convexShapes[shapeIndex].m_numFaces; float4 closestPnt = b3MakeVector3(0, 0, 0, 0); -// float4 hitNormalWorld = b3MakeVector3(0, 0, 0, 0); - float minDist = -1000000.f; // TODO: What is the largest/smallest float? + // float4 hitNormalWorld = b3MakeVector3(0, 0, 0, 0); + float minDist = -1000000.f; // TODO: What is the largest/smallest float? 
bool bCollide = true; int region = -1; float4 localHitNormal; - for ( int f = 0; f < numFaces; f++ ) + for (int f = 0; f < numFaces; f++) { - b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f]; + b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset + f]; float4 planeEqn; - float4 localPlaneNormal = b3MakeVector3(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f); - float4 n1 = localPlaneNormal;//quatRotate(quat,localPlaneNormal); + float4 localPlaneNormal = b3MakeVector3(face.m_plane.x, face.m_plane.y, face.m_plane.z, 0.f); + float4 n1 = localPlaneNormal; //quatRotate(quat,localPlaneNormal); planeEqn = n1; planeEqn[3] = face.m_plane.w; float4 pntReturn; float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn); - if ( dist > radius) + if (dist > radius) { bCollide = false; break; } - if ( dist > 0 ) + if (dist > 0) { //might hit an edge or vertex b3Vector3 out; bool isInPoly = IsPointInPolygon(spherePos, - &face, - &convexVertices[convexShapes[shapeIndex].m_vertexOffset], - convexIndices, - &out); + &face, + &convexVertices[convexShapes[shapeIndex].m_vertexOffset], + convexIndices, + &out); if (isInPoly) { - if (dist>minDist) + if (dist > minDist) { minDist = dist; closestPnt = pntReturn; localHitNormal = planeEqn; - region=1; + region = 1; } - } else + } + else { - b3Vector3 tmp = spherePos-out; + b3Vector3 tmp = spherePos - out; b3Scalar l2 = tmp.length2(); - if (l2<radius*radius) + if (l2 < radius * radius) { - dist = b3Sqrt(l2); - if (dist>minDist) + dist = b3Sqrt(l2); + if (dist > minDist) { minDist = dist; closestPnt = out; - localHitNormal = tmp/dist; - region=2; + localHitNormal = tmp / dist; + region = 2; } - - } else + } + else { bCollide = false; break; @@ -2553,12 +2420,12 @@ void computeContactSphereConvex(int pairIndex, } else { - if ( dist > minDist ) + if (dist > minDist) { minDist = dist; closestPnt = pntReturn; localHitNormal = planeEqn; - region=3; + region = 3; } } } @@ -2567,128 +2434,113 @@ void 
computeContactSphereConvex(int pairIndex, if (bCollide && minDist > -10000) { - - float4 normalOnSurfaceB1 = tr.getBasis()*localHitNormal;//-hitNormalWorld; + float4 normalOnSurfaceB1 = tr.getBasis() * localHitNormal; //-hitNormalWorld; float4 pOnB1 = tr(closestPnt); //printf("dist ,%f,",minDist); - float actualDepth = minDist-radius; - if (actualDepth<0) + float actualDepth = minDist - radius; + if (actualDepth < 0) { - //printf("actualDepth = ,%f,", actualDepth); - //printf("normalOnSurfaceB1 = ,%f,%f,%f,", normalOnSurfaceB1.x,normalOnSurfaceB1.y,normalOnSurfaceB1.z); - //printf("region=,%d,\n", region); - pOnB1[3] = actualDepth; + //printf("actualDepth = ,%f,", actualDepth); + //printf("normalOnSurfaceB1 = ,%f,%f,%f,", normalOnSurfaceB1.x,normalOnSurfaceB1.y,normalOnSurfaceB1.z); + //printf("region=,%d,\n", region); + pOnB1[3] = actualDepth; - int dstIdx; -// dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); - - if (nGlobalContactsOut < maxContactCapacity) - { - dstIdx=nGlobalContactsOut; - nGlobalContactsOut++; + int dstIdx; + // dstIdx = nGlobalContactsOut++;//AppendInc( nGlobalContactsOut, dstIdx ); - b3Contact4* c = &globalContactsOut[dstIdx]; - c->m_worldNormalOnB = normalOnSurfaceB1; - c->setFrictionCoeff(0.7); - c->setRestituitionCoeff(0.f); + if (nGlobalContactsOut < maxContactCapacity) + { + dstIdx = nGlobalContactsOut; + nGlobalContactsOut++; - c->m_batchIdx = pairIndex; - c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA; - c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB; - c->m_worldPosB[0] = pOnB1; - int numPoints = 1; - c->m_worldNormalOnB.w = (b3Scalar)numPoints; - }//if (dstIdx < numPairs) + b3Contact4* c = &globalContactsOut[dstIdx]; + c->m_worldNormalOnB = normalOnSurfaceB1; + c->setFrictionCoeff(0.7); + c->setRestituitionCoeff(0.f); + + c->m_batchIdx = pairIndex; + c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass == 0 ? 
-bodyIndexA : bodyIndexA; + c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass == 0 ? -bodyIndexB : bodyIndexB; + c->m_worldPosB[0] = pOnB1; + int numPoints = 1; + c->m_worldNormalOnB.w = (b3Scalar)numPoints; + } //if (dstIdx < numPairs) } - }//if (hasCollision) - + } //if (hasCollision) } - - - int computeContactConvexConvex2( - int pairIndex, - int bodyIndexA, int bodyIndexB, - int collidableIndexA, int collidableIndexB, - const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies, - const b3AlignedObjectArray<b3Collidable>& collidables, - const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes, - const b3AlignedObjectArray<b3Vector3>& convexVertices, - const b3AlignedObjectArray<b3Vector3>& uniqueEdges, - const b3AlignedObjectArray<int>& convexIndices, - const b3AlignedObjectArray<b3GpuFace>& faces, - b3AlignedObjectArray<b3Contact4>& globalContactsOut, - int& nGlobalContactsOut, - int maxContactCapacity, - const b3AlignedObjectArray<b3Contact4>& oldContacts - ) + int pairIndex, + int bodyIndexA, int bodyIndexB, + int collidableIndexA, int collidableIndexB, + const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies, + const b3AlignedObjectArray<b3Collidable>& collidables, + const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes, + const b3AlignedObjectArray<b3Vector3>& convexVertices, + const b3AlignedObjectArray<b3Vector3>& uniqueEdges, + const b3AlignedObjectArray<int>& convexIndices, + const b3AlignedObjectArray<b3GpuFace>& faces, + b3AlignedObjectArray<b3Contact4>& globalContactsOut, + int& nGlobalContactsOut, + int maxContactCapacity, + const b3AlignedObjectArray<b3Contact4>& oldContacts) { int contactIndex = -1; b3Vector3 posA = rigidBodies[bodyIndexA].m_pos; b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat; b3Vector3 posB = rigidBodies[bodyIndexB].m_pos; b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat; - b3ConvexPolyhedronData hullA, hullB; - + b3Vector3 sepNormalWorldSpace; - + b3Collidable colA = 
collidables[collidableIndexA]; + hullA = convexShapes[colA.m_shapeIndex]; + //printf("numvertsA = %d\n",hullA.m_numVertices); - b3Collidable colA = collidables[collidableIndexA]; - hullA = convexShapes[colA.m_shapeIndex]; - //printf("numvertsA = %d\n",hullA.m_numVertices); - - - b3Collidable colB = collidables[collidableIndexB]; - hullB = convexShapes[colB.m_shapeIndex]; - //printf("numvertsB = %d\n",hullB.m_numVertices); + b3Collidable colB = collidables[collidableIndexB]; + hullB = convexShapes[colB.m_shapeIndex]; + //printf("numvertsB = %d\n",hullB.m_numVertices); -// int contactCapacity = MAX_VERTS; + // int contactCapacity = MAX_VERTS; //int numContactsOut=0; - #ifdef _WIN32 b3Assert(_finite(rigidBodies[bodyIndexA].m_pos.x)); b3Assert(_finite(rigidBodies[bodyIndexB].m_pos.x)); #endif - - bool foundSepAxis = findSeparatingAxis(hullA,hullB, - posA, - ornA, - posB, - ornB, - convexVertices,uniqueEdges,faces,convexIndices, - convexVertices,uniqueEdges,faces,convexIndices, - - sepNormalWorldSpace - ); + bool foundSepAxis = findSeparatingAxis(hullA, hullB, + posA, + ornA, + posB, + ornB, + + convexVertices, uniqueEdges, faces, convexIndices, + convexVertices, uniqueEdges, faces, convexIndices, + + sepNormalWorldSpace); - if (foundSepAxis) { - - contactIndex = clipHullHullSingle( bodyIndexA, bodyIndexB, - posA,ornA, - posB,ornB, + posA, ornA, + posB, ornB, collidableIndexA, collidableIndexB, - &rigidBodies, + &rigidBodies, &globalContactsOut, nGlobalContactsOut, - + convexShapes, convexShapes, - - convexVertices, - uniqueEdges, + + convexVertices, + uniqueEdges, faces, convexIndices, - + convexVertices, uniqueEdges, faces, @@ -2698,50 +2550,42 @@ int computeContactConvexConvex2( collidables, sepNormalWorldSpace, maxContactCapacity); - } return contactIndex; } - - - - - - -void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs, - const b3OpenCLArray<b3RigidBodyData>* bodyBuf, - b3OpenCLArray<b3Contact4>* contactOut, int& 
nContacts, - const b3OpenCLArray<b3Contact4>* oldContacts, - int maxContactCapacity, - int compoundPairCapacity, - const b3OpenCLArray<b3ConvexPolyhedronData>& convexData, - const b3OpenCLArray<b3Vector3>& gpuVertices, - const b3OpenCLArray<b3Vector3>& gpuUniqueEdges, - const b3OpenCLArray<b3GpuFace>& gpuFaces, - const b3OpenCLArray<int>& gpuIndices, - const b3OpenCLArray<b3Collidable>& gpuCollidables, - const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes, - - const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace, - const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace, - - b3OpenCLArray<b3Vector3>& worldVertsB1GPU, - b3OpenCLArray<b3Int4>& clippingFacesOutGPU, - b3OpenCLArray<b3Vector3>& worldNormalsAGPU, - b3OpenCLArray<b3Vector3>& worldVertsA1GPU, - b3OpenCLArray<b3Vector3>& worldVertsB2GPU, - b3AlignedObjectArray<class b3OptimizedBvh*>& bvhDataUnused, - b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU, - b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU, - b3OpenCLArray<b3BvhInfo>* bvhInfo, - - int numObjects, - int maxTriConvexPairCapacity, - b3OpenCLArray<b3Int4>& triangleConvexPairsOut, - int& numTriConvexPairsOut - ) +void GpuSatCollision::computeConvexConvexContactsGPUSAT(b3OpenCLArray<b3Int4>* pairs, int nPairs, + const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + b3OpenCLArray<b3Contact4>* contactOut, int& nContacts, + const b3OpenCLArray<b3Contact4>* oldContacts, + int maxContactCapacity, + int compoundPairCapacity, + const b3OpenCLArray<b3ConvexPolyhedronData>& convexData, + const b3OpenCLArray<b3Vector3>& gpuVertices, + const b3OpenCLArray<b3Vector3>& gpuUniqueEdges, + const b3OpenCLArray<b3GpuFace>& gpuFaces, + const b3OpenCLArray<int>& gpuIndices, + const b3OpenCLArray<b3Collidable>& gpuCollidables, + const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes, + + const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace, + const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace, + + b3OpenCLArray<b3Vector3>& worldVertsB1GPU, + b3OpenCLArray<b3Int4>& clippingFacesOutGPU, + 
b3OpenCLArray<b3Vector3>& worldNormalsAGPU, + b3OpenCLArray<b3Vector3>& worldVertsA1GPU, + b3OpenCLArray<b3Vector3>& worldVertsB2GPU, + b3AlignedObjectArray<class b3OptimizedBvh*>& bvhDataUnused, + b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU, + b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU, + b3OpenCLArray<b3BvhInfo>* bvhInfo, + + int numObjects, + int maxTriConvexPairCapacity, + b3OpenCLArray<b3Int4>& triangleConvexPairsOut, + int& numTriConvexPairsOut) { myframecount++; @@ -2750,14 +2594,13 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* #ifdef CHECK_ON_HOST - - b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; + b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; treeNodesGPU->copyToHost(treeNodesCPU); - b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; + b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; subTreesGPU->copyToHost(subTreesCPU); - b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; + b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; bvhInfo->copyToHost(bvhInfoCPU); b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; @@ -2772,8 +2615,6 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; bodyBuf->copyToHost(hostBodyBuf); - - b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; convexData.copyToHost(hostConvexData); @@ -2788,10 +2629,9 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* gpuIndices.copyToHost(hostIndices); b3AlignedObjectArray<b3Collidable> hostCollidables; gpuCollidables.copyToHost(hostCollidables); - + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; gpuChildShapes.copyToHost(cpuChildShapes); - b3AlignedObjectArray<b3Int4> hostTriangleConvexPairs; @@ -2802,16 +2642,15 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* } b3AlignedObjectArray<b3Contact4> oldHostContacts; - + if (oldContacts->size()) { oldContacts->copyToHost(oldHostContacts); } - 
hostContacts.resize(maxContactCapacity); - for (int i=0;i<nPairs;i++) + for (int i = 0; i < nPairs; i++) { int bodyIndexA = hostPairs[i].x; int bodyIndexB = hostPairs[i].y; @@ -2821,84 +2660,73 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_SPHERE && hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) { - computeContactSphereConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0], - &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); + computeContactSphereConvex(i, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, &hostBodyBuf[0], + &hostCollidables[0], &hostConvexData[0], &hostVertices[0], &hostIndices[0], &hostFaces[0], &hostContacts[0], nContacts, maxContactCapacity); } if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && hostCollidables[collidableIndexB].m_shapeType == SHAPE_SPHERE) { - computeContactSphereConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], - &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); + computeContactSphereConvex(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, &hostBodyBuf[0], + &hostCollidables[0], &hostConvexData[0], &hostVertices[0], &hostIndices[0], &hostFaces[0], &hostContacts[0], nContacts, maxContactCapacity); //printf("convex-sphere\n"); - } if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE) { - computeContactPlaneConvex(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], - &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); -// printf("convex-plane\n"); - + 
computeContactPlaneConvex(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, &hostBodyBuf[0], + &hostCollidables[0], &hostConvexData[0], &hostVertices[0], &hostIndices[0], &hostFaces[0], &hostContacts[0], nContacts, maxContactCapacity); + // printf("convex-plane\n"); } if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE && hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) { - computeContactPlaneConvex(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0], - &hostCollidables[0],&hostConvexData[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); -// printf("plane-convex\n"); - + computeContactPlaneConvex(i, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, &hostBodyBuf[0], + &hostCollidables[0], &hostConvexData[0], &hostVertices[0], &hostIndices[0], &hostFaces[0], &hostContacts[0], nContacts, maxContactCapacity); + // printf("plane-convex\n"); } - if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS && + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS && hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) { - computeContactCompoundCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], - &hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], hostAabbsWorldSpace,hostAabbsLocalSpace,hostVertices,hostUniqueEdges,hostIndices,hostFaces,&hostContacts[0], - nContacts,maxContactCapacity,treeNodesCPU,subTreesCPU,bvhInfoCPU); -// printf("convex-plane\n"); - + computeContactCompoundCompound(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, &hostBodyBuf[0], + &hostCollidables[0], &hostConvexData[0], &cpuChildShapes[0], hostAabbsWorldSpace, hostAabbsLocalSpace, hostVertices, hostUniqueEdges, hostIndices, hostFaces, &hostContacts[0], + nContacts, maxContactCapacity, treeNodesCPU, subTreesCPU, bvhInfoCPU); + // 
printf("convex-plane\n"); } - - if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS && + if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS && hostCollidables[collidableIndexB].m_shapeType == SHAPE_PLANE) { - computeContactPlaneCompound(i,bodyIndexB,bodyIndexA,collidableIndexB,collidableIndexA,&hostBodyBuf[0], - &hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0], &hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); -// printf("convex-plane\n"); - + computeContactPlaneCompound(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, &hostBodyBuf[0], + &hostCollidables[0], &hostConvexData[0], &cpuChildShapes[0], &hostVertices[0], &hostIndices[0], &hostFaces[0], &hostContacts[0], nContacts, maxContactCapacity); + // printf("convex-plane\n"); } if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_PLANE && hostCollidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS) { - computeContactPlaneCompound(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,&hostBodyBuf[0], - &hostCollidables[0],&hostConvexData[0],&cpuChildShapes[0],&hostVertices[0],&hostIndices[0],&hostFaces[0],&hostContacts[0],nContacts,maxContactCapacity); -// printf("plane-convex\n"); - + computeContactPlaneCompound(i, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, &hostBodyBuf[0], + &hostCollidables[0], &hostConvexData[0], &cpuChildShapes[0], &hostVertices[0], &hostIndices[0], &hostFaces[0], &hostContacts[0], nContacts, maxContactCapacity); + // printf("plane-convex\n"); } if (hostCollidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL && hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL) { //printf("hostPairs[i].z=%d\n",hostPairs[i].z); - int contactIndex = computeContactConvexConvex2( i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, 
hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts); + int contactIndex = computeContactConvexConvex2(i, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, hostBodyBuf, hostCollidables, hostConvexData, hostVertices, hostUniqueEdges, hostIndices, hostFaces, hostContacts, nContacts, maxContactCapacity, oldHostContacts); //int contactIndex = computeContactConvexConvex(hostPairs,i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf,hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts); - - if (contactIndex>=0) + if (contactIndex >= 0) { -// printf("convex convex contactIndex = %d\n",contactIndex); + // printf("convex convex contactIndex = %d\n",contactIndex); hostPairs[i].z = contactIndex; } -// printf("plane-convex\n"); - + // printf("plane-convex\n"); } - - } if (hostPairs.size()) @@ -2908,81 +2736,76 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* hostContacts.resize(nContacts); if (nContacts) - { - - contactOut->copyFromHost(hostContacts); - } else + { + contactOut->copyFromHost(hostContacts); + } + else { contactOut->resize(0); - } + } - m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); - //printf("(HOST) nContacts = %d\n",nContacts); + m_totalContactsOut.copyFromHostPointer(&nContacts, 1, 0, true); + //printf("(HOST) nContacts = %d\n",nContacts); #else { if (nPairs) { - m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); + m_totalContactsOut.copyFromHostPointer(&nContacts, 1, 0, true); B3_PROFILE("primitiveContactsKernel"); b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - 
b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_primitiveContactsKernel,"m_primitiveContactsKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nPairs ); + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_primitiveContactsKernel, "m_primitiveContactsKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nPairs); launcher.setConst(maxContactCapacity); int num = nPairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); - + nContacts = m_totalContactsOut.at(0); contactOut->resize(nContacts); } } - -#endif//CHECK_ON_HOST - +#endif //CHECK_ON_HOST + B3_PROFILE("computeConvexConvexContactsGPUSAT"); - // printf("nContacts = %d\n",nContacts); - - + // printf("nContacts = %d\n",nContacts); + m_sepNormals.resize(nPairs); m_hasSeparatingNormals.resize(nPairs); - - int concaveCapacity=maxTriConvexPairCapacity; + + int concaveCapacity = maxTriConvexPairCapacity; m_concaveSepNormals.resize(concaveCapacity); m_concaveHasSeparatingNormals.resize(concaveCapacity); m_numConcavePairsOut.resize(0); m_numConcavePairsOut.push_back(0); - m_gpuCompoundPairs.resize(compoundPairCapacity); m_gpuCompoundSepNormals.resize(compoundPairCapacity); - - + 
m_gpuHasCompoundSepNormals.resize(compoundPairCapacity); - + m_numCompoundPairsOut.resize(0); m_numCompoundPairsOut.push_back(0); int numCompoundPairs = 0; - int numConcavePairs =0; + int numConcavePairs = 0; { clFinish(m_queue); @@ -2991,33 +2814,30 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* m_dmins.resize(nPairs); if (splitSearchSepAxisConvex) { - - if (useMprGpu) { nContacts = m_totalContactsOut.at(0); { B3_PROFILE("mprPenetrationKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_mprPenetrationKernel,"mprPenetrationKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_mprPenetrationKernel, "mprPenetrationKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(maxContactCapacity); - launcher.setConst( nPairs ); + launcher.setConst(nPairs); int num = nPairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); /* b3AlignedObjectArray<int>hostHasSepAxis; @@ 
-3027,173 +2847,160 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* */ nContacts = m_totalContactsOut.at(0); contactOut->resize(nContacts); - // printf("nContacts (after mprPenetrationKernel) = %d\n",nContacts); - if (nContacts>maxContactCapacity) + // printf("nContacts (after mprPenetrationKernel) = %d\n",nContacts); + if (nContacts > maxContactCapacity) { - b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); nContacts = maxContactCapacity; } - } } - + if (1) { - if (1) { - { - B3_PROFILE("findSeparatingAxisVertexFaceKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( m_dmins.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findSeparatingAxisVertexFaceKernel,"findSeparatingAxisVertexFaceKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nPairs ); + { + B3_PROFILE("findSeparatingAxisVertexFaceKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + 
b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(m_dmins.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisVertexFaceKernel, "findSeparatingAxisVertexFaceKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nPairs); - int num = nPairs; - launcher.launch1D( num); - clFinish(m_queue); - } + int num = nPairs; + launcher.launch1D(num); + clFinish(m_queue); + } + int numDirections = sizeof(unitSphere162) / sizeof(b3Vector3); - int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3); - - { - B3_PROFILE("findSeparatingAxisEdgeEdgeKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( m_dmins.getBufferCL()), - b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true) - - }; - - b3LauncherCL launcher(m_queue, m_findSeparatingAxisEdgeEdgeKernel,"findSeparatingAxisEdgeEdgeKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numDirections); - launcher.setConst( nPairs ); - int num = nPairs; - launcher.launch1D( num); - clFinish(m_queue); + { + B3_PROFILE("findSeparatingAxisEdgeEdgeKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + 
b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(m_dmins.getBufferCL()), + b3BufferInfoCL(m_unitSphereDirections.getBufferCL(), true) - } + }; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisEdgeEdgeKernel, "findSeparatingAxisEdgeEdgeKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numDirections); + launcher.setConst(nPairs); + int num = nPairs; + launcher.launch1D(num); + clFinish(m_queue); + } } if (useMprGpu) { B3_PROFILE("findSeparatingAxisUnitSphereKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( m_dmins.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findSeparatingAxisUnitSphereKernel,"findSeparatingAxisUnitSphereKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3); - launcher.setConst( numDirections); - - launcher.setConst( nPairs ); - + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(m_unitSphereDirections.getBufferCL(), 
true), + b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(m_dmins.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisUnitSphereKernel, "findSeparatingAxisUnitSphereKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + int numDirections = sizeof(unitSphere162) / sizeof(b3Vector3); + launcher.setConst(numDirections); + + launcher.setConst(nPairs); + int num = nPairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); } + } } - - - } else + else { B3_PROFILE("findSeparatingAxisKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findSeparatingAxisKernel,"m_findSeparatingAxisKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nPairs ); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(m_sepNormals.getBufferCL()), + 
b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findSeparatingAxisKernel, "m_findSeparatingAxisKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nPairs); int num = nPairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); } - - } - else - { - + else + { B3_PROFILE("findSeparatingAxisKernel CPU"); - - - b3AlignedObjectArray<b3Int4> hostPairs; - pairs->copyToHost(hostPairs); - b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; - bodyBuf->copyToHost(hostBodyBuf); - - b3AlignedObjectArray<b3Collidable> hostCollidables; - gpuCollidables.copyToHost(hostCollidables); - - b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; - gpuChildShapes.copyToHost(cpuChildShapes); - - b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexShapeData; - convexData.copyToHost(hostConvexShapeData); - - b3AlignedObjectArray<b3Vector3> hostVertices; - gpuVertices.copyToHost(hostVertices); - - b3AlignedObjectArray<int> hostHasSepAxis; - hostHasSepAxis.resize(nPairs); - b3AlignedObjectArray<b3Vector3> hostSepAxis; - hostSepAxis.resize(nPairs); - - b3AlignedObjectArray<b3Vector3> hostUniqueEdges; - gpuUniqueEdges.copyToHost(hostUniqueEdges); - b3AlignedObjectArray<b3GpuFace> hostFaces; - gpuFaces.copyToHost(hostFaces); - - b3AlignedObjectArray<int> hostIndices; - gpuIndices.copyToHost(hostIndices); - + + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; + gpuChildShapes.copyToHost(cpuChildShapes); + + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexShapeData; + convexData.copyToHost(hostConvexShapeData); + + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); + + 
b3AlignedObjectArray<int> hostHasSepAxis; + hostHasSepAxis.resize(nPairs); + b3AlignedObjectArray<b3Vector3> hostSepAxis; + hostSepAxis.resize(nPairs); + + b3AlignedObjectArray<b3Vector3> hostUniqueEdges; + gpuUniqueEdges.copyToHost(hostUniqueEdges); + b3AlignedObjectArray<b3GpuFace> hostFaces; + gpuFaces.copyToHost(hostFaces); + + b3AlignedObjectArray<int> hostIndices; + gpuIndices.copyToHost(hostIndices); + b3AlignedObjectArray<b3Contact4> hostContacts; if (nContacts) { @@ -3201,61 +3008,56 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* } hostContacts.resize(maxContactCapacity); int nGlobalContactsOut = nContacts; - - - for (int i=0;i<nPairs;i++) - { - - int bodyIndexA = hostPairs[i].x; - int bodyIndexB = hostPairs[i].y; - int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; - int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; - - int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex; - int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex; - - hostHasSepAxis[i] = 0; - - //once the broadphase avoids static-static pairs, we can remove this test - if ((hostBodyBuf[bodyIndexA].m_invMass==0) &&(hostBodyBuf[bodyIndexB].m_invMass==0)) - { - continue; - } - - - if ((hostCollidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(hostCollidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL)) - { - continue; - } - - float dmin = FLT_MAX; - - b3ConvexPolyhedronData* convexShapeA = &hostConvexShapeData[shapeIndexA]; - b3ConvexPolyhedronData* convexShapeB = &hostConvexShapeData[shapeIndexB]; - b3Vector3 posA = hostBodyBuf[bodyIndexA].m_pos; - b3Vector3 posB = hostBodyBuf[bodyIndexB].m_pos; - b3Quaternion ornA =hostBodyBuf[bodyIndexA].m_quat; - b3Quaternion ornB =hostBodyBuf[bodyIndexB].m_quat; - - - if (useGjk) + + for (int i = 0; i < nPairs; i++) + { + int bodyIndexA = hostPairs[i].x; + int bodyIndexB = hostPairs[i].y; + int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; 
+ int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; + + int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex; + + hostHasSepAxis[i] = 0; + + //once the broadphase avoids static-static pairs, we can remove this test + if ((hostBodyBuf[bodyIndexA].m_invMass == 0) && (hostBodyBuf[bodyIndexB].m_invMass == 0)) { + continue; + } + if ((hostCollidables[collidableIndexA].m_shapeType != SHAPE_CONVEX_HULL) || (hostCollidables[collidableIndexB].m_shapeType != SHAPE_CONVEX_HULL)) + { + continue; + } + + float dmin = FLT_MAX; + + b3ConvexPolyhedronData* convexShapeA = &hostConvexShapeData[shapeIndexA]; + b3ConvexPolyhedronData* convexShapeB = &hostConvexShapeData[shapeIndexB]; + b3Vector3 posA = hostBodyBuf[bodyIndexA].m_pos; + b3Vector3 posB = hostBodyBuf[bodyIndexB].m_pos; + b3Quaternion ornA = hostBodyBuf[bodyIndexA].m_quat; + b3Quaternion ornB = hostBodyBuf[bodyIndexB].m_quat; + + if (useGjk) + { //first approximate the separating axis, to 'fail-proof' GJK+EPA or MPR { b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter; b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA); b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter; - b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB); + b3Vector3 c1 = b3TransformPoint(c1local, posB, ornB); b3Vector3 DeltaC2 = c0 - c1; - + b3Vector3 sepAxis; - + bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &sepAxis, &dmin); - + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin); + if (hasSepAxisA) { bool hasSepAxisB = b3FindSeparatingAxis(convexShapeB, convexShapeA, 
posB, ornB, posA, ornA, DeltaC2, @@ -3264,11 +3066,11 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* &sepAxis, &dmin); if (hasSepAxisB) { - bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &sepAxis, &dmin,false); - + bool hasEdgeEdge = b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin, false); + if (hasEdgeEdge) { hostHasSepAxis[i] = 1; @@ -3282,163 +3084,150 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* if (hostHasSepAxis[i]) { int pairIndex = i; - + bool useMpr = true; if (useMpr) { - int res=0; + int res = 0; float depth = 0.f; - b3Vector3 sepAxis2 = b3MakeVector3(1,0,0); - b3Vector3 resultPointOnBWorld = b3MakeVector3(0,0,0); + b3Vector3 sepAxis2 = b3MakeVector3(1, 0, 0); + b3Vector3 resultPointOnBWorld = b3MakeVector3(0, 0, 0); - float depthOut; - b3Vector3 dirOut; - b3Vector3 posOut; - + float depthOut; + b3Vector3 dirOut; + b3Vector3 posOut; - //res = b3MprPenetration(bodyIndexA,bodyIndexB,hostBodyBuf,hostConvexShapeData,hostCollidables,hostVertices,&mprConfig,&depthOut,&dirOut,&posOut); - res = b3MprPenetration(pairIndex,bodyIndexA,bodyIndexB,&hostBodyBuf[0],&hostConvexShapeData[0],&hostCollidables[0],&hostVertices[0],&hostSepAxis[0],&hostHasSepAxis[0],&depthOut,&dirOut,&posOut); - depth = depthOut; - sepAxis2 = b3MakeVector3(-dirOut.x,-dirOut.y,-dirOut.z); - resultPointOnBWorld = posOut; - //hostHasSepAxis[i] = 0; + //res = 
b3MprPenetration(bodyIndexA,bodyIndexB,hostBodyBuf,hostConvexShapeData,hostCollidables,hostVertices,&mprConfig,&depthOut,&dirOut,&posOut); + res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB, &hostBodyBuf[0], &hostConvexShapeData[0], &hostCollidables[0], &hostVertices[0], &hostSepAxis[0], &hostHasSepAxis[0], &depthOut, &dirOut, &posOut); + depth = depthOut; + sepAxis2 = b3MakeVector3(-dirOut.x, -dirOut.y, -dirOut.z); + resultPointOnBWorld = posOut; + //hostHasSepAxis[i] = 0; + if (res == 0) + { + //add point? + //printf("depth = %f\n",depth); + //printf("normal = %f,%f,%f\n",dir.v[0],dir.v[1],dir.v[2]); + //qprintf("pos = %f,%f,%f\n",pos.v[0],pos.v[1],pos.v[2]); - if (res==0) - { - //add point? - //printf("depth = %f\n",depth); - //printf("normal = %f,%f,%f\n",dir.v[0],dir.v[1],dir.v[2]); - //qprintf("pos = %f,%f,%f\n",pos.v[0],pos.v[1],pos.v[2]); - - - - float dist=0.f; + float dist = 0.f; - const b3ConvexPolyhedronData& hullA = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexA].m_collidableIdx].m_shapeIndex]; - const b3ConvexPolyhedronData& hullB = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexB].m_collidableIdx].m_shapeIndex]; + const b3ConvexPolyhedronData& hullA = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexA].m_collidableIdx].m_shapeIndex]; + const b3ConvexPolyhedronData& hullB = hostConvexShapeData[hostCollidables[hostBodyBuf[bodyIndexB].m_collidableIdx].m_shapeIndex]; - if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist)) - { - if (depth > dist) + if (b3TestSepAxis(&hullA, &hullB, posA, ornA, posB, ornB, &sepAxis2, &hostVertices[0], &hostVertices[0], &dist)) { - float diff = depth - dist; - - static float maxdiff = 0.f; - if (maxdiff < diff) + if (depth > dist) { - maxdiff = diff; - printf("maxdiff = %20.10f\n",maxdiff); + float diff = depth - dist; + + static float maxdiff = 0.f; + if (maxdiff < diff) + { + maxdiff = diff; + printf("maxdiff = %20.10f\n", 
maxdiff); + } } } - } - if (depth > dmin) - { - b3Vector3 oldAxis = hostSepAxis[i]; - depth = dmin; - sepAxis2 = oldAxis; - } - - + if (depth > dmin) + { + b3Vector3 oldAxis = hostSepAxis[i]; + depth = dmin; + sepAxis2 = oldAxis; + } - if(b3TestSepAxis( &hullA, &hullB, posA,ornA,posB,ornB,&sepAxis2, &hostVertices[0], &hostVertices[0],&dist)) - { - if (depth > dist) + if (b3TestSepAxis(&hullA, &hullB, posA, ornA, posB, ornB, &sepAxis2, &hostVertices[0], &hostVertices[0], &dist)) { - float diff = depth - dist; - //printf("?diff = %f\n",diff ); - static float maxdiff = 0.f; - if (maxdiff < diff) + if (depth > dist) + { + float diff = depth - dist; + //printf("?diff = %f\n",diff ); + static float maxdiff = 0.f; + if (maxdiff < diff) + { + maxdiff = diff; + printf("maxdiff = %20.10f\n", maxdiff); + } + } + //this is used for SAT + //hostHasSepAxis[i] = 1; + //hostSepAxis[i] = sepAxis2; + + //add contact point + + //int contactIndex = nGlobalContactsOut; + b3Contact4& newContact = hostContacts.at(nGlobalContactsOut); + nGlobalContactsOut++; + newContact.m_batchIdx = 0; //i; + newContact.m_bodyAPtrAndSignBit = (hostBodyBuf.at(bodyIndexA).m_invMass == 0) ? -bodyIndexA : bodyIndexA; + newContact.m_bodyBPtrAndSignBit = (hostBodyBuf.at(bodyIndexB).m_invMass == 0) ? 
-bodyIndexB : bodyIndexB; + + newContact.m_frictionCoeffCmp = 45874; + newContact.m_restituitionCoeffCmp = 0; + + static float maxDepth = 0.f; + + if (depth > maxDepth) { - maxdiff = diff; - printf("maxdiff = %20.10f\n",maxdiff); + maxDepth = depth; + printf("MPR maxdepth = %f\n", maxDepth); } + + resultPointOnBWorld.w = -depth; + newContact.m_worldPosB[0] = resultPointOnBWorld; + //b3Vector3 resultPointOnAWorld = resultPointOnBWorld+depth*sepAxis2; + newContact.m_worldNormalOnB = sepAxis2; + newContact.m_worldNormalOnB.w = (b3Scalar)1; } - //this is used for SAT - //hostHasSepAxis[i] = 1; - //hostSepAxis[i] = sepAxis2; - - //add contact point - - //int contactIndex = nGlobalContactsOut; - b3Contact4& newContact = hostContacts.at(nGlobalContactsOut); - nGlobalContactsOut++; - newContact.m_batchIdx = 0;//i; - newContact.m_bodyAPtrAndSignBit = (hostBodyBuf.at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA; - newContact.m_bodyBPtrAndSignBit = (hostBodyBuf.at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB; - - newContact.m_frictionCoeffCmp = 45874; - newContact.m_restituitionCoeffCmp = 0; - - - static float maxDepth = 0.f; - - if (depth > maxDepth) + else { - maxDepth = depth; - printf("MPR maxdepth = %f\n",maxDepth ); - + printf("rejected\n"); } - - - resultPointOnBWorld.w = -depth; - newContact.m_worldPosB[0] = resultPointOnBWorld; - //b3Vector3 resultPointOnAWorld = resultPointOnBWorld+depth*sepAxis2; - newContact.m_worldNormalOnB = sepAxis2; - newContact.m_worldNormalOnB.w = (b3Scalar)1; - } else - { - printf("rejected\n"); } - - } - } else + else { - - - - //int contactIndex = computeContactConvexConvex2( i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts); - b3AlignedObjectArray<b3Contact4> oldHostContacts; + //int contactIndex = computeContactConvexConvex2( 
i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf, hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts); + b3AlignedObjectArray<b3Contact4> oldHostContacts; int result; - result = computeContactConvexConvex2( //hostPairs, - pairIndex, - bodyIndexA, bodyIndexB, - collidableIndexA, collidableIndexB, - hostBodyBuf, - hostCollidables, - hostConvexShapeData, - hostVertices, - hostUniqueEdges, - hostIndices, - hostFaces, - hostContacts, - nGlobalContactsOut, - maxContactCapacity, - oldHostContacts - //hostHasSepAxis, - //hostSepAxis - - ); - }//mpr - }//hostHasSepAxis[i] = 1; - - } else + result = computeContactConvexConvex2( //hostPairs, + pairIndex, + bodyIndexA, bodyIndexB, + collidableIndexA, collidableIndexB, + hostBodyBuf, + hostCollidables, + hostConvexShapeData, + hostVertices, + hostUniqueEdges, + hostIndices, + hostFaces, + hostContacts, + nGlobalContactsOut, + maxContactCapacity, + oldHostContacts + //hostHasSepAxis, + //hostSepAxis + + ); + } //mpr + } //hostHasSepAxis[i] = 1; + } + else { - b3Vector3 c0local = hostConvexShapeData[shapeIndexA].m_localCenter; b3Vector3 c0 = b3TransformPoint(c0local, posA, ornA); b3Vector3 c1local = hostConvexShapeData[shapeIndexB].m_localCenter; - b3Vector3 c1 = b3TransformPoint(c1local,posB,ornB); + b3Vector3 c1 = b3TransformPoint(c1local, posB, ornB); b3Vector3 DeltaC2 = c0 - c1; - + b3Vector3 sepAxis; - + bool hasSepAxisA = b3FindSeparatingAxis(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &sepAxis, &dmin); - + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin); + if (hasSepAxisA) { bool hasSepAxisB = 
b3FindSeparatingAxis(convexShapeB, convexShapeA, posB, ornB, posA, ornA, DeltaC2, @@ -3447,11 +3236,11 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* &sepAxis, &dmin); if (hasSepAxisB) { - bool hasEdgeEdge =b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), - &sepAxis, &dmin,true); - + bool hasEdgeEdge = b3FindSeparatingAxisEdgeEdge(convexShapeA, convexShapeB, posA, ornA, posB, ornB, DeltaC2, + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &hostVertices.at(0), &hostUniqueEdges.at(0), &hostFaces.at(0), &hostIndices.at(0), + &sepAxis, &dmin, true); + if (hasEdgeEdge) { hostHasSepAxis[i] = 1; @@ -3460,21 +3249,21 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* } } } - } - - if (useGjkContacts)//nGlobalContactsOut>0) + } + + if (useGjkContacts) //nGlobalContactsOut>0) { //printf("nGlobalContactsOut=%d\n",nGlobalContactsOut); nContacts = nGlobalContactsOut; contactOut->copyFromHost(hostContacts); - - m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); + + m_totalContactsOut.copyFromHostPointer(&nContacts, 1, 0, true); } - - m_hasSeparatingNormals.copyFromHost(hostHasSepAxis); - m_sepNormals.copyFromHost(hostSepAxis); - - /* + + m_hasSeparatingNormals.copyFromHost(hostHasSepAxis); + m_sepNormals.copyFromHost(hostSepAxis); + + /* //double-check results from GPU (comment-out the 'else' so both paths are executed b3AlignedObjectArray<int> checkHasSepAxis; m_hasSeparatingNormals.copyToHost(checkHasSepAxis); @@ -3491,352 +3280,314 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* //m_hasSeparatingNormals.copyFromHost(hostHasSepAxis); // m_sepNormals.copyFromHost(hostSepAxis); */ - } - - - numCompoundPairs = 
m_numCompoundPairsOut.at(0); - bool useGpuFindCompoundPairs=true; - if (useGpuFindCompoundPairs) - { - B3_PROFILE("findCompoundPairsKernel"); - b3BufferInfoCL bInfo[] = - { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( clAabbsLocalSpace.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL()), - b3BufferInfoCL( m_numCompoundPairsOut.getBufferCL()), - b3BufferInfoCL(subTreesGPU->getBufferCL()), - b3BufferInfoCL(treeNodesGPU->getBufferCL()), - b3BufferInfoCL(bvhInfo->getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findCompoundPairsKernel,"m_findCompoundPairsKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nPairs ); - launcher.setConst( compoundPairCapacity); - - int num = nPairs; - launcher.launch1D( num); - clFinish(m_queue); - - numCompoundPairs = m_numCompoundPairsOut.at(0); - //printf("numCompoundPairs =%d\n",numCompoundPairs ); - if (numCompoundPairs) - { - //printf("numCompoundPairs=%d\n",numCompoundPairs); - } - - - } else - { - - - b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; - treeNodesGPU->copyToHost(treeNodesCPU); - - b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; - subTreesGPU->copyToHost(subTreesCPU); + } - b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; - bvhInfo->copyToHost(bvhInfoCPU); + numCompoundPairs = m_numCompoundPairsOut.at(0); + bool useGpuFindCompoundPairs = true; + if (useGpuFindCompoundPairs) + { + B3_PROFILE("findCompoundPairsKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL(pairs->getBufferCL(), true), + 
b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(clAabbsLocalSpace.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(m_gpuCompoundPairs.getBufferCL()), + b3BufferInfoCL(m_numCompoundPairsOut.getBufferCL()), + b3BufferInfoCL(subTreesGPU->getBufferCL()), + b3BufferInfoCL(treeNodesGPU->getBufferCL()), + b3BufferInfoCL(bvhInfo->getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findCompoundPairsKernel, "m_findCompoundPairsKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nPairs); + launcher.setConst(compoundPairCapacity); - b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; - clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); + int num = nPairs; + launcher.launch1D(num); + clFinish(m_queue); - b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace; - clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace); + numCompoundPairs = m_numCompoundPairsOut.at(0); + //printf("numCompoundPairs =%d\n",numCompoundPairs ); + if (numCompoundPairs) + { + //printf("numCompoundPairs=%d\n",numCompoundPairs); + } + } + else + { + b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; + treeNodesGPU->copyToHost(treeNodesCPU); - b3AlignedObjectArray<b3Int4> hostPairs; - pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; + subTreesGPU->copyToHost(subTreesCPU); - b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; - bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; + bvhInfo->copyToHost(bvhInfoCPU); + b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; + clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); - b3AlignedObjectArray<b3Int4> 
cpuCompoundPairsOut; - cpuCompoundPairsOut.resize(compoundPairCapacity); + b3AlignedObjectArray<b3Aabb> hostAabbsLocalSpace; + clAabbsLocalSpace.copyToHost(hostAabbsLocalSpace); - b3AlignedObjectArray<b3Collidable> hostCollidables; - gpuCollidables.copyToHost(hostCollidables); + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); - b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; - gpuChildShapes.copyToHost(cpuChildShapes); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); - b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; - convexData.copyToHost(hostConvexData); + b3AlignedObjectArray<b3Int4> cpuCompoundPairsOut; + cpuCompoundPairsOut.resize(compoundPairCapacity); - b3AlignedObjectArray<b3Vector3> hostVertices; - gpuVertices.copyToHost(hostVertices); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; + gpuChildShapes.copyToHost(cpuChildShapes); + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; + convexData.copyToHost(hostConvexData); + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); - for (int pairIndex=0;pairIndex<nPairs;pairIndex++) - { - int bodyIndexA = hostPairs[pairIndex].x; - int bodyIndexB = hostPairs[pairIndex].y; - int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; - int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; + for (int pairIndex = 0; pairIndex < nPairs; pairIndex++) + { + int bodyIndexA = hostPairs[pairIndex].x; + int bodyIndexB = hostPairs[pairIndex].y; + int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; + int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; if (cpuChildShapes.size()) { - findCompoundPairsKernel( - pairIndex, - bodyIndexA, - bodyIndexB, - collidableIndexA, - collidableIndexB, - &hostBodyBuf[0], - &hostCollidables[0], - &hostConvexData[0], - 
hostVertices, - hostAabbsWorldSpace, - hostAabbsLocalSpace, - &cpuChildShapes[0], - &cpuCompoundPairsOut[0], - &numCompoundPairs, - compoundPairCapacity, - treeNodesCPU, - subTreesCPU, - bvhInfoCPU - ); + findCompoundPairsKernel( + pairIndex, + bodyIndexA, + bodyIndexB, + collidableIndexA, + collidableIndexB, + &hostBodyBuf[0], + &hostCollidables[0], + &hostConvexData[0], + hostVertices, + hostAabbsWorldSpace, + hostAabbsLocalSpace, + &cpuChildShapes[0], + &cpuCompoundPairsOut[0], + &numCompoundPairs, + compoundPairCapacity, + treeNodesCPU, + subTreesCPU, + bvhInfoCPU); } - } - + } - m_numCompoundPairsOut.copyFromHostPointer(&numCompoundPairs,1,0,true); + m_numCompoundPairsOut.copyFromHostPointer(&numCompoundPairs, 1, 0, true); if (numCompoundPairs) { b3CompoundOverlappingPair* ptr = (b3CompoundOverlappingPair*)&cpuCompoundPairsOut[0]; - m_gpuCompoundPairs.copyFromHostPointer(ptr,numCompoundPairs,0,true); + m_gpuCompoundPairs.copyFromHostPointer(ptr, numCompoundPairs, 0, true); } //cpuCompoundPairsOut - - } + } if (numCompoundPairs) { - printf("numCompoundPairs=%d\n",numCompoundPairs); + printf("numCompoundPairs=%d\n", numCompoundPairs); } - if (numCompoundPairs > compoundPairCapacity) - { - b3Error("Exceeded compound pair capacity (%d/%d)\n", numCompoundPairs, compoundPairCapacity); - numCompoundPairs = compoundPairCapacity; - } - - - - m_gpuCompoundPairs.resize(numCompoundPairs); - m_gpuHasCompoundSepNormals.resize(numCompoundPairs); - m_gpuCompoundSepNormals.resize(numCompoundPairs); - - - if (numCompoundPairs) - { - B3_PROFILE("processCompoundPairsPrimitivesKernel"); - b3BufferInfoCL bInfo[] = - { - b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - 
b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_processCompoundPairsPrimitivesKernel,"m_processCompoundPairsPrimitivesKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numCompoundPairs ); - launcher.setConst(maxContactCapacity); - - int num = numCompoundPairs; - launcher.launch1D( num); - clFinish(m_queue); - nContacts = m_totalContactsOut.at(0); - //printf("nContacts (after processCompoundPairsPrimitivesKernel) = %d\n",nContacts); - if (nContacts>maxContactCapacity) - { - - b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); - nContacts = maxContactCapacity; - } - } - - - if (numCompoundPairs) - { - B3_PROFILE("processCompoundPairsKernel"); - b3BufferInfoCL bInfo[] = - { - b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL()), - b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_processCompoundPairsKernel,"m_processCompoundPairsKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( numCompoundPairs ); - - int num = numCompoundPairs; - launcher.launch1D( num); - clFinish(m_queue); - - } - - - //printf("numConcave = 
%d\n",numConcave); - - - -// printf("hostNormals.size()=%d\n",hostNormals.size()); + if (numCompoundPairs > compoundPairCapacity) + { + b3Error("Exceeded compound pair capacity (%d/%d)\n", numCompoundPairs, compoundPairCapacity); + numCompoundPairs = compoundPairCapacity; + } + + m_gpuCompoundPairs.resize(numCompoundPairs); + m_gpuHasCompoundSepNormals.resize(numCompoundPairs); + m_gpuCompoundSepNormals.resize(numCompoundPairs); + + if (numCompoundPairs) + { + B3_PROFILE("processCompoundPairsPrimitivesKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL(m_gpuCompoundPairs.getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_processCompoundPairsPrimitivesKernel, "m_processCompoundPairsPrimitivesKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numCompoundPairs); + launcher.setConst(maxContactCapacity); + + int num = numCompoundPairs; + launcher.launch1D(num); + clFinish(m_queue); + nContacts = m_totalContactsOut.at(0); + //printf("nContacts (after processCompoundPairsPrimitivesKernel) = %d\n",nContacts); + if (nContacts > maxContactCapacity) + { + b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); + nContacts = maxContactCapacity; + } + } + + if (numCompoundPairs) + { + B3_PROFILE("processCompoundPairsKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL(m_gpuCompoundPairs.getBufferCL(), true), + 
b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(m_gpuCompoundSepNormals.getBufferCL()), + b3BufferInfoCL(m_gpuHasCompoundSepNormals.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_processCompoundPairsKernel, "m_processCompoundPairsKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(numCompoundPairs); + + int num = numCompoundPairs; + launcher.launch1D(num); + clFinish(m_queue); + } + + //printf("numConcave = %d\n",numConcave); + + // printf("hostNormals.size()=%d\n",hostNormals.size()); //int numPairs = pairCount.at(0); - - - } int vertexFaceCapacity = 64; - - { //now perform the tree query on GPU - - - - + if (treeNodesGPU->size() && treeNodesGPU->size()) { if (bvhTraversalKernelGPU) { - B3_PROFILE("m_bvhTraversalKernel"); - - + numConcavePairs = m_numConcavePairsOut.at(0); - - b3LauncherCL launcher(m_queue, m_bvhTraversalKernel,"m_bvhTraversalKernel"); - launcher.setBuffer( pairs->getBufferCL()); - launcher.setBuffer( bodyBuf->getBufferCL()); - launcher.setBuffer( gpuCollidables.getBufferCL()); - launcher.setBuffer( clAabbsWorldSpace.getBufferCL()); - launcher.setBuffer( triangleConvexPairsOut.getBufferCL()); - launcher.setBuffer( m_numConcavePairsOut.getBufferCL()); - launcher.setBuffer( subTreesGPU->getBufferCL()); - launcher.setBuffer( treeNodesGPU->getBufferCL()); - launcher.setBuffer( bvhInfo->getBufferCL()); - - launcher.setConst( nPairs ); - launcher.setConst( maxTriConvexPairCapacity); + + b3LauncherCL launcher(m_queue, m_bvhTraversalKernel, "m_bvhTraversalKernel"); + 
launcher.setBuffer(pairs->getBufferCL()); + launcher.setBuffer(bodyBuf->getBufferCL()); + launcher.setBuffer(gpuCollidables.getBufferCL()); + launcher.setBuffer(clAabbsWorldSpace.getBufferCL()); + launcher.setBuffer(triangleConvexPairsOut.getBufferCL()); + launcher.setBuffer(m_numConcavePairsOut.getBufferCL()); + launcher.setBuffer(subTreesGPU->getBufferCL()); + launcher.setBuffer(treeNodesGPU->getBufferCL()); + launcher.setBuffer(bvhInfo->getBufferCL()); + + launcher.setConst(nPairs); + launcher.setConst(maxTriConvexPairCapacity); int num = nPairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); numConcavePairs = m_numConcavePairsOut.at(0); - } else + } + else { - b3AlignedObjectArray<b3Int4> hostPairs; - pairs->copyToHost(hostPairs); - b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; - bodyBuf->copyToHost(hostBodyBuf); - b3AlignedObjectArray<b3Collidable> hostCollidables; - gpuCollidables.copyToHost(hostCollidables); - b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; - clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; + clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); - //int maxTriConvexPairCapacity, - b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; - triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity); + //int maxTriConvexPairCapacity, + b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; + triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity); - //int numTriConvexPairsOutHost=0; - numConcavePairs = 0; - //m_numConcavePairsOut + //int numTriConvexPairsOutHost=0; + numConcavePairs = 0; + //m_numConcavePairsOut - b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; - 
treeNodesGPU->copyToHost(treeNodesCPU); - b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; - subTreesGPU->copyToHost(subTreesCPU); - b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; - bvhInfo->copyToHost(bvhInfoCPU); - //compute it... + b3AlignedObjectArray<b3QuantizedBvhNode> treeNodesCPU; + treeNodesGPU->copyToHost(treeNodesCPU); + b3AlignedObjectArray<b3BvhSubtreeInfo> subTreesCPU; + subTreesGPU->copyToHost(subTreesCPU); + b3AlignedObjectArray<b3BvhInfo> bvhInfoCPU; + bvhInfo->copyToHost(bvhInfoCPU); + //compute it... - volatile int hostNumConcavePairsOut=0; + volatile int hostNumConcavePairsOut = 0; - // - for (int i=0;i<nPairs;i++) - { - b3BvhTraversal( &hostPairs.at(0), - &hostBodyBuf.at(0), - &hostCollidables.at(0), - &hostAabbsWorldSpace.at(0), - &triangleConvexPairsOutHost.at(0), - &hostNumConcavePairsOut, - &subTreesCPU.at(0), - &treeNodesCPU.at(0), - &bvhInfoCPU.at(0), - nPairs, - maxTriConvexPairCapacity, - i); - } - numConcavePairs = hostNumConcavePairsOut; + // + for (int i = 0; i < nPairs; i++) + { + b3BvhTraversal(&hostPairs.at(0), + &hostBodyBuf.at(0), + &hostCollidables.at(0), + &hostAabbsWorldSpace.at(0), + &triangleConvexPairsOutHost.at(0), + &hostNumConcavePairsOut, + &subTreesCPU.at(0), + &treeNodesCPU.at(0), + &bvhInfoCPU.at(0), + nPairs, + maxTriConvexPairCapacity, + i); + } + numConcavePairs = hostNumConcavePairsOut; - if (hostNumConcavePairsOut) - { - triangleConvexPairsOutHost.resize(hostNumConcavePairsOut); - triangleConvexPairsOut.copyFromHost(triangleConvexPairsOutHost); - } - // + if (hostNumConcavePairsOut) + { + triangleConvexPairsOutHost.resize(hostNumConcavePairsOut); + triangleConvexPairsOut.copyFromHost(triangleConvexPairsOutHost); + } + // - m_numConcavePairsOut.resize(0); - m_numConcavePairsOut.push_back(numConcavePairs); + m_numConcavePairsOut.resize(0); + m_numConcavePairsOut.push_back(numConcavePairs); } - //printf("numConcavePairs=%d (max = %d\n",numConcavePairs,maxTriConvexPairCapacity); - + //printf("numConcavePairs=%d 
(max = %d\n",numConcavePairs,maxTriConvexPairCapacity); + if (numConcavePairs > maxTriConvexPairCapacity) { static int exceeded_maxTriConvexPairCapacity_count = 0; b3Error("Exceeded the maxTriConvexPairCapacity (found %d but max is %d, it happened %d times)\n", - numConcavePairs,maxTriConvexPairCapacity,exceeded_maxTriConvexPairCapacity_count++); + numConcavePairs, maxTriConvexPairCapacity, exceeded_maxTriConvexPairCapacity_count++); numConcavePairs = maxTriConvexPairCapacity; } triangleConvexPairsOut.resize(numConcavePairs); - + if (numConcavePairs) { - - - - clippingFacesOutGPU.resize(numConcavePairs); worldNormalsAGPU.resize(numConcavePairs); - worldVertsA1GPU.resize(vertexFaceCapacity*(numConcavePairs)); - worldVertsB1GPU.resize(vertexFaceCapacity*(numConcavePairs)); - + worldVertsA1GPU.resize(vertexFaceCapacity * (numConcavePairs)); + worldVertsB1GPU.resize(vertexFaceCapacity * (numConcavePairs)); if (findConcaveSeparatingAxisKernelGPU) { - /* m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU); clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); @@ -3846,236 +3597,213 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* */ //now perform a SAT test for each triangle-convex element (stored in triangleConvexPairsOut) - if (splitSearchSepAxisConcave) - { - //printf("numConcavePairs = %d\n",numConcavePairs); - m_dmins.resize(numConcavePairs); - { - B3_PROFILE("findConcaveSeparatingAxisVertexFaceKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( 
clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), - b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), - b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), - b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), - b3BufferInfoCL(m_dmins.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisVertexFaceKernel,"m_findConcaveSeparatingAxisVertexFaceKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(vertexFaceCapacity); - launcher.setConst( numConcavePairs ); - - int num = numConcavePairs; - launcher.launch1D( num); - clFinish(m_queue); - - - } -// numConcavePairs = 0; - if (1) - { - B3_PROFILE("findConcaveSeparatingAxisEdgeEdgeKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), - b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), - b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), - b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), - b3BufferInfoCL(m_dmins.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisEdgeEdgeKernel,"m_findConcaveSeparatingAxisEdgeEdgeKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - 
launcher.setConst(vertexFaceCapacity); - launcher.setConst( numConcavePairs ); - - int num = numConcavePairs; - launcher.launch1D( num); - clFinish(m_queue); - } - - - // numConcavePairs = 0; - - - - - - - } else - { - B3_PROFILE("findConcaveSeparatingAxisKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), - b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), - b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), - b3BufferInfoCL(worldVertsB1GPU.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisKernel,"m_findConcaveSeparatingAxisKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(vertexFaceCapacity); - launcher.setConst( numConcavePairs ); - - int num = numConcavePairs; - launcher.launch1D( num); - clFinish(m_queue); - } - - - } else - { - - b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; - b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; - b3AlignedObjectArray<b3Vector3> worldNormalsACPU; - b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; - b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU; + if (splitSearchSepAxisConcave) + { + //printf("numConcavePairs = %d\n",numConcavePairs); + m_dmins.resize(numConcavePairs); + { + B3_PROFILE("findConcaveSeparatingAxisVertexFaceKernel"); + b3BufferInfoCL bInfo[] = { + 
b3BufferInfoCL(triangleConvexPairsOut.getBufferCL()), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL(m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL(m_dmins.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisVertexFaceKernel, "m_findConcaveSeparatingAxisVertexFaceKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(vertexFaceCapacity); + launcher.setConst(numConcavePairs); - b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; - triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost); - //triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity); - b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; - bodyBuf->copyToHost(hostBodyBuf); - b3AlignedObjectArray<b3Collidable> hostCollidables; - gpuCollidables.copyToHost(hostCollidables); - b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; - clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); + int num = numConcavePairs; + launcher.launch1D(num); + clFinish(m_queue); + } + // numConcavePairs = 0; + if (1) + { + B3_PROFILE("findConcaveSeparatingAxisEdgeEdgeKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(triangleConvexPairsOut.getBufferCL()), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), 
true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL(m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL(m_dmins.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisEdgeEdgeKernel, "m_findConcaveSeparatingAxisEdgeEdgeKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(vertexFaceCapacity); + launcher.setConst(numConcavePairs); - b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; - convexData.copyToHost(hostConvexData); + int num = numConcavePairs; + launcher.launch1D(num); + clFinish(m_queue); + } - b3AlignedObjectArray<b3Vector3> hostVertices; - gpuVertices.copyToHost(hostVertices); + // numConcavePairs = 0; + } + else + { + B3_PROFILE("findConcaveSeparatingAxisKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(triangleConvexPairsOut.getBufferCL()), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(m_concaveSepNormals.getBufferCL()), + 
b3BufferInfoCL(m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findConcaveSeparatingAxisKernel, "m_findConcaveSeparatingAxisKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(vertexFaceCapacity); + launcher.setConst(numConcavePairs); - b3AlignedObjectArray<b3Vector3> hostUniqueEdges; - gpuUniqueEdges.copyToHost(hostUniqueEdges); - b3AlignedObjectArray<b3GpuFace> hostFaces; - gpuFaces.copyToHost(hostFaces); - b3AlignedObjectArray<int> hostIndices; - gpuIndices.copyToHost(hostIndices); - b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; - gpuChildShapes.copyToHost(cpuChildShapes); + int num = numConcavePairs; + launcher.launch1D(num); + clFinish(m_queue); + } + } + else + { + b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; + b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; + b3AlignedObjectArray<b3Vector3> worldNormalsACPU; + b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; + b3AlignedObjectArray<int> concaveHasSeparatingNormalsCPU; + b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; + triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost); + //triangleConvexPairsOutHost.resize(maxTriConvexPairCapacity); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); + b3AlignedObjectArray<b3Aabb> hostAabbsWorldSpace; + clAabbsWorldSpace.copyToHost(hostAabbsWorldSpace); - - b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost; - m_concaveSepNormals.copyToHost(concaveSepNormalsHost); - concaveHasSeparatingNormalsCPU.resize(concaveSepNormalsHost.size()); + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; + 
convexData.copyToHost(hostConvexData); - b3GpuChildShape* childShapePointerCPU = 0; - if (cpuChildShapes.size()) - childShapePointerCPU = &cpuChildShapes.at(0); + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); - clippingFacesOutCPU.resize(clippingFacesOutGPU.size()); - worldVertsA1CPU.resize(worldVertsA1GPU.size()); - worldNormalsACPU.resize(worldNormalsAGPU.size()); - worldVertsB1CPU.resize(worldVertsB1GPU.size()); + b3AlignedObjectArray<b3Vector3> hostUniqueEdges; + gpuUniqueEdges.copyToHost(hostUniqueEdges); + b3AlignedObjectArray<b3GpuFace> hostFaces; + gpuFaces.copyToHost(hostFaces); + b3AlignedObjectArray<int> hostIndices; + gpuIndices.copyToHost(hostIndices); + b3AlignedObjectArray<b3GpuChildShape> cpuChildShapes; + gpuChildShapes.copyToHost(cpuChildShapes); - for (int i=0;i<numConcavePairs;i++) - { - b3FindConcaveSeparatingAxisKernel(&triangleConvexPairsOutHost.at(0), - &hostBodyBuf.at(0), - &hostCollidables.at(0), - &hostConvexData.at(0), &hostVertices.at(0),&hostUniqueEdges.at(0), - &hostFaces.at(0),&hostIndices.at(0),childShapePointerCPU, - &hostAabbsWorldSpace.at(0), - &concaveSepNormalsHost.at(0), - &clippingFacesOutCPU.at(0), - &worldVertsA1CPU.at(0), - &worldNormalsACPU.at(0), - &worldVertsB1CPU.at(0), - &concaveHasSeparatingNormalsCPU.at(0), - vertexFaceCapacity, - numConcavePairs,i); - }; + b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost; + m_concaveSepNormals.copyToHost(concaveSepNormalsHost); + concaveHasSeparatingNormalsCPU.resize(concaveSepNormalsHost.size()); - m_concaveSepNormals.copyFromHost(concaveSepNormalsHost); - m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU); - clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); - worldVertsA1GPU.copyFromHost(worldVertsA1CPU); - worldNormalsAGPU.copyFromHost(worldNormalsACPU); - worldVertsB1GPU.copyFromHost(worldVertsB1CPU); + b3GpuChildShape* childShapePointerCPU = 0; + if (cpuChildShapes.size()) + childShapePointerCPU = 
&cpuChildShapes.at(0); + clippingFacesOutCPU.resize(clippingFacesOutGPU.size()); + worldVertsA1CPU.resize(worldVertsA1GPU.size()); + worldNormalsACPU.resize(worldNormalsAGPU.size()); + worldVertsB1CPU.resize(worldVertsB1GPU.size()); + for (int i = 0; i < numConcavePairs; i++) + { + b3FindConcaveSeparatingAxisKernel(&triangleConvexPairsOutHost.at(0), + &hostBodyBuf.at(0), + &hostCollidables.at(0), + &hostConvexData.at(0), &hostVertices.at(0), &hostUniqueEdges.at(0), + &hostFaces.at(0), &hostIndices.at(0), childShapePointerCPU, + &hostAabbsWorldSpace.at(0), + &concaveSepNormalsHost.at(0), + &clippingFacesOutCPU.at(0), + &worldVertsA1CPU.at(0), + &worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + &concaveHasSeparatingNormalsCPU.at(0), + vertexFaceCapacity, + numConcavePairs, i); + }; + m_concaveSepNormals.copyFromHost(concaveSepNormalsHost); + m_concaveHasSeparatingNormals.copyFromHost(concaveHasSeparatingNormalsCPU); + clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); + worldVertsA1GPU.copyFromHost(worldVertsA1CPU); + worldNormalsAGPU.copyFromHost(worldNormalsACPU); + worldVertsB1GPU.copyFromHost(worldVertsB1CPU); } -// b3AlignedObjectArray<b3Vector3> cpuCompoundSepNormals; -// m_concaveSepNormals.copyToHost(cpuCompoundSepNormals); -// b3AlignedObjectArray<b3Int4> cpuConcavePairs; -// triangleConvexPairsOut.copyToHost(cpuConcavePairs); - - + // b3AlignedObjectArray<b3Vector3> cpuCompoundSepNormals; + // m_concaveSepNormals.copyToHost(cpuCompoundSepNormals); + // b3AlignedObjectArray<b3Int4> cpuConcavePairs; + // triangleConvexPairsOut.copyToHost(cpuConcavePairs); } } - - } if (numConcavePairs) { - if (numConcavePairs) + if (numConcavePairs) { B3_PROFILE("findConcaveSphereContactsKernel"); - nContacts = m_totalContactsOut.at(0); -// printf("nContacts1 = %d\n",nContacts); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( triangleConvexPairsOut.getBufferCL() ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - 
b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_findConcaveSphereContactsKernel,"m_findConcaveSphereContactsKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - - launcher.setConst( numConcavePairs ); + nContacts = m_totalContactsOut.at(0); + // printf("nContacts1 = %d\n",nContacts); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(triangleConvexPairsOut.getBufferCL()), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(clAabbsWorldSpace.getBufferCL(), true), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findConcaveSphereContactsKernel, "m_findConcaveSphereContactsKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + + launcher.setConst(numConcavePairs); launcher.setConst(maxContactCapacity); int num = numConcavePairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); nContacts = m_totalContactsOut.at(0); //printf("nContacts (after findConcaveSphereContactsKernel) = %d\n",nContacts); @@ -4088,11 +3816,8 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* nContacts = maxContactCapacity; } } - } - - #ifdef __APPLE__ bool contactClippingOnGpu = true; #else @@ -4101,9 
+3826,8 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* if (contactClippingOnGpu) { - m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); -// printf("nContacts3 = %d\n",nContacts); - + m_totalContactsOut.copyFromHostPointer(&nContacts, 1, 0, true); + // printf("nContacts3 = %d\n",nContacts); //B3_PROFILE("clipHullHullKernel"); @@ -4122,15 +3846,12 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* if (breakupConcaveConvexKernel) { - - worldVertsB2GPU.resize(vertexFaceCapacity*numConcavePairs); - + worldVertsB2GPU.resize(vertexFaceCapacity * numConcavePairs); //clipFacesAndFindContacts if (clipConcaveFacesAndFindContactsCPU) { - b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; b3AlignedObjectArray<b3Vector3> worldNormalsACPU; @@ -4141,120 +3862,108 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* worldNormalsAGPU.copyToHost(worldNormalsACPU); worldVertsB1GPU.copyToHost(worldVertsB1CPU); - - - b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU; + b3AlignedObjectArray<int> concaveHasSeparatingNormalsCPU; m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU); b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost; m_concaveSepNormals.copyToHost(concaveSepNormalsHost); - b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; + b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; worldVertsB2CPU.resize(worldVertsB2GPU.size()); - - for (int i=0;i<numConcavePairs;i++) + for (int i = 0; i < numConcavePairs; i++) { - - clipFacesAndFindContactsKernel( &concaveSepNormalsHost.at(0), - &concaveHasSeparatingNormalsCPU.at(0), - &clippingFacesOutCPU.at(0), - &worldVertsA1CPU.at(0), - &worldNormalsACPU.at(0), - &worldVertsB1CPU.at(0), - &worldVertsB2CPU.at(0), - vertexFaceCapacity, - i); + clipFacesAndFindContactsKernel(&concaveSepNormalsHost.at(0), + &concaveHasSeparatingNormalsCPU.at(0), + 
&clippingFacesOutCPU.at(0), + &worldVertsA1CPU.at(0), + &worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + &worldVertsB2CPU.at(0), + vertexFaceCapacity, + i); } clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); worldVertsB2GPU.copyFromHost(worldVertsB2CPU); - - - } else + } + else { - if (1) { - - - B3_PROFILE("clipFacesAndFindContacts"); //nContacts = m_totalContactsOut.at(0); //int h = m_hasSeparatingNormals.at(0); //int4 p = clippingFacesOutGPU.at(0); b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), - b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), - b3BufferInfoCL( worldNormalsAGPU.getBufferCL()), - b3BufferInfoCL( worldVertsB1GPU.getBufferCL()), - b3BufferInfoCL( worldVertsB2GPU.getBufferCL()) - }; - b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + b3BufferInfoCL(m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL(m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL(worldVertsB2GPU.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts, "m_clipFacesAndFindContacts"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(vertexFaceCapacity); - launcher.setConst( numConcavePairs ); + launcher.setConst(numConcavePairs); int debugMode = 0; - launcher.setConst( debugMode); + launcher.setConst(debugMode); int num = numConcavePairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); //int bla = m_totalContactsOut.at(0); } } //contactReduction { - int newContactCapacity=nContacts+numConcavePairs; + int newContactCapacity 
= nContacts + numConcavePairs; contactOut->reserve(newContactCapacity); if (reduceConcaveContactsOnGPU) { -// printf("newReservation = %d\n",newReservation); + // printf("newReservation = %d\n",newReservation); { B3_PROFILE("newContactReductionKernel"); b3BufferInfoCL bInfo[] = - { - b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), - b3BufferInfoCL( m_concaveHasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsB2GPU.getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + { + b3BufferInfoCL(triangleConvexPairsOut.getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL(m_concaveHasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB2GPU.getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_newContactReductionKernel, "m_newContactReductionKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(vertexFaceCapacity); launcher.setConst(newContactCapacity); - launcher.setConst( numConcavePairs ); + launcher.setConst(numConcavePairs); int num = numConcavePairs; - launcher.launch1D( num); + launcher.launch1D(num); } nContacts = m_totalContactsOut.at(0); contactOut->resize(nContacts); //printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts); - }else + } + else { - volatile int nGlobalContactsOut = nContacts; b3AlignedObjectArray<b3Int4> triangleConvexPairsOutHost; 
triangleConvexPairsOut.copyToHost(triangleConvexPairsOutHost); b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; bodyBuf->copyToHost(hostBodyBuf); - b3AlignedObjectArray<int>concaveHasSeparatingNormalsCPU; + b3AlignedObjectArray<int> concaveHasSeparatingNormalsCPU; m_concaveHasSeparatingNormals.copyToHost(concaveHasSeparatingNormalsCPU); b3AlignedObjectArray<b3Vector3> concaveSepNormalsHost; m_concaveSepNormals.copyToHost(concaveSepNormalsHost); - b3AlignedObjectArray<b3Contact4> hostContacts; if (nContacts) { @@ -4268,67 +3977,59 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* clippingFacesOutGPU.copyToHost(clippingFacesOutCPU); worldVertsB2GPU.copyToHost(worldVertsB2CPU); - - - for (int i=0;i<numConcavePairs;i++) + for (int i = 0; i < numConcavePairs; i++) { - b3NewContactReductionKernel( &triangleConvexPairsOutHost.at(0), - &hostBodyBuf.at(0), - &concaveSepNormalsHost.at(0), - &concaveHasSeparatingNormalsCPU.at(0), - &hostContacts.at(0), - &clippingFacesOutCPU.at(0), - &worldVertsB2CPU.at(0), - &nGlobalContactsOut, - vertexFaceCapacity, - newContactCapacity, - numConcavePairs, - i - ); - + b3NewContactReductionKernel(&triangleConvexPairsOutHost.at(0), + &hostBodyBuf.at(0), + &concaveSepNormalsHost.at(0), + &concaveHasSeparatingNormalsCPU.at(0), + &hostContacts.at(0), + &clippingFacesOutCPU.at(0), + &worldVertsB2CPU.at(0), + &nGlobalContactsOut, + vertexFaceCapacity, + newContactCapacity, + numConcavePairs, + i); } - nContacts = nGlobalContactsOut; - m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); -// nContacts = m_totalContactsOut.at(0); + m_totalContactsOut.copyFromHostPointer(&nContacts, 1, 0, true); + // nContacts = m_totalContactsOut.at(0); //contactOut->resize(nContacts); hostContacts.resize(nContacts); //printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts); contactOut->copyFromHost(hostContacts); } - } //re-use? 
- - - } else + } + else { B3_PROFILE("clipHullHullConcaveConvexKernel"); nContacts = m_totalContactsOut.at(0); int newContactCapacity = contactOut->capacity(); //printf("contactOut5 = %d\n",nContacts); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( m_concaveSepNormals.getBufferCL()), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - b3LauncherCL launcher(m_queue, m_clipHullHullConcaveConvexKernel,"m_clipHullHullConcaveConvexKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(triangleConvexPairsOut.getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(m_concaveSepNormals.getBufferCL()), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_clipHullHullConcaveConvexKernel, "m_clipHullHullConcaveConvexKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(newContactCapacity); - launcher.setConst( numConcavePairs ); + launcher.setConst(numConcavePairs); int num = 
numConcavePairs; - launcher.launch1D( num); + launcher.launch1D(num); clFinish(m_queue); nContacts = m_totalContactsOut.at(0); contactOut->resize(nContacts); @@ -4337,12 +4038,10 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* contactOut->copyToHost(cpuContacts); } // printf("nContacts after = %d\n", nContacts); - }//numConcavePairs - - + } //numConcavePairs //convex-convex contact clipping - + bool breakupKernel = false; #ifdef __APPLE__ @@ -4350,166 +4049,149 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* #endif #ifdef CHECK_ON_HOST - bool computeConvexConvex = false; + bool computeConvexConvex = false; #else - bool computeConvexConvex = true; -#endif//CHECK_ON_HOST + bool computeConvexConvex = true; +#endif //CHECK_ON_HOST if (computeConvexConvex) { B3_PROFILE("clipHullHullKernel"); - if (breakupKernel) - { - - - - - worldVertsB1GPU.resize(vertexFaceCapacity*nPairs); - clippingFacesOutGPU.resize(nPairs); - worldNormalsAGPU.resize(nPairs); - worldVertsA1GPU.resize(vertexFaceCapacity*nPairs); - worldVertsB2GPU.resize(vertexFaceCapacity*nPairs); - - if (findConvexClippingFacesGPU) - { - B3_PROFILE("findClippingFacesKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), - b3BufferInfoCL( worldNormalsAGPU.getBufferCL()), - b3BufferInfoCL( worldVertsB1GPU.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, 
m_findClippingFacesKernel,"m_findClippingFacesKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( vertexFaceCapacity); - launcher.setConst( nPairs ); - int num = nPairs; - launcher.launch1D( num); - clFinish(m_queue); - - } else + if (breakupKernel) { - - float minDist = -1e30f; - float maxDist = 0.02f; + worldVertsB1GPU.resize(vertexFaceCapacity * nPairs); + clippingFacesOutGPU.resize(nPairs); + worldNormalsAGPU.resize(nPairs); + worldVertsA1GPU.resize(vertexFaceCapacity * nPairs); + worldVertsB2GPU.resize(vertexFaceCapacity * nPairs); - b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; - convexData.copyToHost(hostConvexData); - b3AlignedObjectArray<b3Collidable> hostCollidables; - gpuCollidables.copyToHost(hostCollidables); + if (findConvexClippingFacesGPU) + { + B3_PROFILE("findClippingFacesKernel"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_findClippingFacesKernel, "m_findClippingFacesKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(vertexFaceCapacity); + launcher.setConst(nPairs); + int num = nPairs; + launcher.launch1D(num); + clFinish(m_queue); + } + else + { + float minDist = -1e30f; + float maxDist = 0.02f; - 
b3AlignedObjectArray<int> hostHasSepNormals; - m_hasSeparatingNormals.copyToHost(hostHasSepNormals); - b3AlignedObjectArray<b3Vector3> cpuSepNormals; - m_sepNormals.copyToHost(cpuSepNormals); + b3AlignedObjectArray<b3ConvexPolyhedronData> hostConvexData; + convexData.copyToHost(hostConvexData); + b3AlignedObjectArray<b3Collidable> hostCollidables; + gpuCollidables.copyToHost(hostCollidables); - b3AlignedObjectArray<b3Int4> hostPairs; - pairs->copyToHost(hostPairs); - b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; - bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<int> hostHasSepNormals; + m_hasSeparatingNormals.copyToHost(hostHasSepNormals); + b3AlignedObjectArray<b3Vector3> cpuSepNormals; + m_sepNormals.copyToHost(cpuSepNormals); + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); - //worldVertsB1GPU.resize(vertexFaceCapacity*nPairs); - b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; - worldVertsB1GPU.copyToHost(worldVertsB1CPU); + //worldVertsB1GPU.resize(vertexFaceCapacity*nPairs); + b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; + worldVertsB1GPU.copyToHost(worldVertsB1CPU); - b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; - clippingFacesOutGPU.copyToHost(clippingFacesOutCPU); + b3AlignedObjectArray<b3Int4> clippingFacesOutCPU; + clippingFacesOutGPU.copyToHost(clippingFacesOutCPU); - b3AlignedObjectArray<b3Vector3> worldNormalsACPU; - worldNormalsACPU.resize(nPairs); + b3AlignedObjectArray<b3Vector3> worldNormalsACPU; + worldNormalsACPU.resize(nPairs); - b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; - worldVertsA1CPU.resize(worldVertsA1GPU.size()); - - - b3AlignedObjectArray<b3Vector3> hostVertices; - gpuVertices.copyToHost(hostVertices); - b3AlignedObjectArray<b3GpuFace> hostFaces; - gpuFaces.copyToHost(hostFaces); - b3AlignedObjectArray<int> hostIndices; - gpuIndices.copyToHost(hostIndices); - - - for (int i=0;i<nPairs;i++) - { + 
b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; + worldVertsA1CPU.resize(worldVertsA1GPU.size()); - int bodyIndexA = hostPairs[i].x; - int bodyIndexB = hostPairs[i].y; - - int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; - int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; - - int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex; - int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex; - + b3AlignedObjectArray<b3Vector3> hostVertices; + gpuVertices.copyToHost(hostVertices); + b3AlignedObjectArray<b3GpuFace> hostFaces; + gpuFaces.copyToHost(hostFaces); + b3AlignedObjectArray<int> hostIndices; + gpuIndices.copyToHost(hostIndices); - if (hostHasSepNormals[i]) + for (int i = 0; i < nPairs; i++) { - b3FindClippingFaces(cpuSepNormals[i], - &hostConvexData[shapeIndexA], - &hostConvexData[shapeIndexB], - hostBodyBuf[bodyIndexA].m_pos,hostBodyBuf[bodyIndexA].m_quat, - hostBodyBuf[bodyIndexB].m_pos,hostBodyBuf[bodyIndexB].m_quat, - &worldVertsA1CPU.at(0),&worldNormalsACPU.at(0), - &worldVertsB1CPU.at(0), - vertexFaceCapacity,minDist,maxDist, - &hostVertices.at(0),&hostFaces.at(0), - &hostIndices.at(0), - &hostVertices.at(0),&hostFaces.at(0), - &hostIndices.at(0),&clippingFacesOutCPU.at(0),i); - } - } - - clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); - worldVertsA1GPU.copyFromHost(worldVertsA1CPU); - worldNormalsAGPU.copyFromHost(worldNormalsACPU); - worldVertsB1GPU.copyFromHost(worldVertsB1CPU); - - } + int bodyIndexA = hostPairs[i].x; + int bodyIndexB = hostPairs[i].y; + int collidableIndexA = hostBodyBuf[bodyIndexA].m_collidableIdx; + int collidableIndexB = hostBodyBuf[bodyIndexB].m_collidableIdx; + int shapeIndexA = hostCollidables[collidableIndexA].m_shapeIndex; + int shapeIndexB = hostCollidables[collidableIndexB].m_shapeIndex; + if (hostHasSepNormals[i]) + { + b3FindClippingFaces(cpuSepNormals[i], + &hostConvexData[shapeIndexA], + &hostConvexData[shapeIndexB], + hostBodyBuf[bodyIndexA].m_pos, 
hostBodyBuf[bodyIndexA].m_quat, + hostBodyBuf[bodyIndexB].m_pos, hostBodyBuf[bodyIndexB].m_quat, + &worldVertsA1CPU.at(0), &worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + vertexFaceCapacity, minDist, maxDist, + &hostVertices.at(0), &hostFaces.at(0), + &hostIndices.at(0), + &hostVertices.at(0), &hostFaces.at(0), + &hostIndices.at(0), &clippingFacesOutCPU.at(0), i); + } + } + clippingFacesOutGPU.copyFromHost(clippingFacesOutCPU); + worldVertsA1GPU.copyFromHost(worldVertsA1CPU); + worldNormalsAGPU.copyFromHost(worldNormalsACPU); + worldVertsB1GPU.copyFromHost(worldVertsB1CPU); + } - ///clip face B against face A, reduce contacts and append them to a global contact array - if (1) - { - if (clipConvexFacesAndFindContactsCPU) + ///clip face B against face A, reduce contacts and append them to a global contact array + if (1) { + if (clipConvexFacesAndFindContactsCPU) + { + //b3AlignedObjectArray<b3Int4> hostPairs; + //pairs->copyToHost(hostPairs); - //b3AlignedObjectArray<b3Int4> hostPairs; - //pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3Vector3> hostSepNormals; + m_sepNormals.copyToHost(hostSepNormals); + b3AlignedObjectArray<int> hostHasSepAxis; + m_hasSeparatingNormals.copyToHost(hostHasSepAxis); - b3AlignedObjectArray<b3Vector3> hostSepNormals; - m_sepNormals.copyToHost(hostSepNormals); - b3AlignedObjectArray<int> hostHasSepAxis; - m_hasSeparatingNormals.copyToHost(hostHasSepAxis); + b3AlignedObjectArray<b3Int4> hostClippingFaces; + clippingFacesOutGPU.copyToHost(hostClippingFaces); + b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; + worldVertsB2CPU.resize(vertexFaceCapacity * nPairs); - b3AlignedObjectArray<b3Int4> hostClippingFaces; - clippingFacesOutGPU.copyToHost(hostClippingFaces); - b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; - worldVertsB2CPU.resize(vertexFaceCapacity*nPairs); - - b3AlignedObjectArray<b3Vector3>worldVertsA1CPU; - worldVertsA1GPU.copyToHost(worldVertsA1CPU); - b3AlignedObjectArray<b3Vector3> worldNormalsACPU; - 
worldNormalsAGPU.copyToHost(worldNormalsACPU); + b3AlignedObjectArray<b3Vector3> worldVertsA1CPU; + worldVertsA1GPU.copyToHost(worldVertsA1CPU); + b3AlignedObjectArray<b3Vector3> worldNormalsACPU; + worldNormalsAGPU.copyToHost(worldNormalsACPU); - b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; - worldVertsB1GPU.copyToHost(worldVertsB1CPU); + b3AlignedObjectArray<b3Vector3> worldVertsB1CPU; + worldVertsB1GPU.copyToHost(worldVertsB1CPU); - /* + /* __global const b3Float4* separatingNormals, __global const int* hasSeparatingAxis, __global b3Int4* clippingFacesOut, @@ -4520,214 +4202,207 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* int vertexFaceCapacity, int pairIndex */ - for (int i=0;i<nPairs;i++) - { - clipFacesAndFindContactsKernel( - &hostSepNormals.at(0), - &hostHasSepAxis.at(0), - &hostClippingFaces.at(0), - &worldVertsA1CPU.at(0), - &worldNormalsACPU.at(0), - &worldVertsB1CPU.at(0), - &worldVertsB2CPU.at(0), - - vertexFaceCapacity, - i); - } - - clippingFacesOutGPU.copyFromHost(hostClippingFaces); - worldVertsB2GPU.copyFromHost(worldVertsB2CPU); + for (int i = 0; i < nPairs; i++) + { + clipFacesAndFindContactsKernel( + &hostSepNormals.at(0), + &hostHasSepAxis.at(0), + &hostClippingFaces.at(0), + &worldVertsA1CPU.at(0), + &worldNormalsACPU.at(0), + &worldVertsB1CPU.at(0), + &worldVertsB2CPU.at(0), - } else - { - B3_PROFILE("clipFacesAndFindContacts"); - //nContacts = m_totalContactsOut.at(0); - //int h = m_hasSeparatingNormals.at(0); - //int4 p = clippingFacesOutGPU.at(0); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsA1GPU.getBufferCL()), - b3BufferInfoCL( worldNormalsAGPU.getBufferCL()), - b3BufferInfoCL( worldVertsB1GPU.getBufferCL()), - b3BufferInfoCL( worldVertsB2GPU.getBufferCL()) - }; + vertexFaceCapacity, + i); + } - b3LauncherCL launcher(m_queue, 
m_clipFacesAndFindContacts,"m_clipFacesAndFindContacts"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(vertexFaceCapacity); + clippingFacesOutGPU.copyFromHost(hostClippingFaces); + worldVertsB2GPU.copyFromHost(worldVertsB2CPU); + } + else + { + B3_PROFILE("clipFacesAndFindContacts"); + //nContacts = m_totalContactsOut.at(0); + //int h = m_hasSeparatingNormals.at(0); + //int4 p = clippingFacesOutGPU.at(0); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsA1GPU.getBufferCL()), + b3BufferInfoCL(worldNormalsAGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB1GPU.getBufferCL()), + b3BufferInfoCL(worldVertsB2GPU.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_clipFacesAndFindContacts, "m_clipFacesAndFindContacts"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(vertexFaceCapacity); - launcher.setConst( nPairs ); - int debugMode = 0; - launcher.setConst( debugMode); - int num = nPairs; - launcher.launch1D( num); - clFinish(m_queue); - } + launcher.setConst(nPairs); + int debugMode = 0; + launcher.setConst(debugMode); + int num = nPairs; + launcher.launch1D(num); + clFinish(m_queue); + } - { - nContacts = m_totalContactsOut.at(0); - //printf("nContacts = %d\n",nContacts); + { + nContacts = m_totalContactsOut.at(0); + //printf("nContacts = %d\n",nContacts); - int newContactCapacity = nContacts+nPairs; - contactOut->reserve(newContactCapacity); + int newContactCapacity = nContacts + nPairs; + contactOut->reserve(newContactCapacity); - if (reduceConvexContactsOnGPU) - { + if (reduceConvexContactsOnGPU) { - B3_PROFILE("newContactReductionKernel"); - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - 
b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( clippingFacesOutGPU.getBufferCL()), - b3BufferInfoCL( worldVertsB2GPU.getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - - b3LauncherCL launcher(m_queue, m_newContactReductionKernel,"m_newContactReductionKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(vertexFaceCapacity); - launcher.setConst(newContactCapacity); - launcher.setConst( nPairs ); - int num = nPairs; - - launcher.launch1D( num); + B3_PROFILE("newContactReductionKernel"); + b3BufferInfoCL bInfo[] = + { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(clippingFacesOutGPU.getBufferCL()), + b3BufferInfoCL(worldVertsB2GPU.getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_newContactReductionKernel, "m_newContactReductionKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(vertexFaceCapacity); + launcher.setConst(newContactCapacity); + launcher.setConst(nPairs); + int num = nPairs; + + launcher.launch1D(num); + } + nContacts = m_totalContactsOut.at(0); + contactOut->resize(nContacts); } - nContacts = m_totalContactsOut.at(0); - contactOut->resize(nContacts); - } else - { - - volatile int nGlobalContactsOut = nContacts; - b3AlignedObjectArray<b3Int4> hostPairs; - pairs->copyToHost(hostPairs); - b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; - bodyBuf->copyToHost(hostBodyBuf); - b3AlignedObjectArray<b3Vector3> hostSepNormals; - m_sepNormals.copyToHost(hostSepNormals); - b3AlignedObjectArray<int> hostHasSepAxis; - m_hasSeparatingNormals.copyToHost(hostHasSepAxis); - b3AlignedObjectArray<b3Contact4> 
hostContactsOut; - contactOut->copyToHost(hostContactsOut); - hostContactsOut.resize(newContactCapacity); - - b3AlignedObjectArray<b3Int4> hostClippingFaces; - clippingFacesOutGPU.copyToHost(hostClippingFaces); - b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; - worldVertsB2GPU.copyToHost(worldVertsB2CPU); - - for (int i=0;i<nPairs;i++) + else { - b3NewContactReductionKernel(&hostPairs.at(0), - &hostBodyBuf.at(0), - &hostSepNormals.at(0), - &hostHasSepAxis.at(0), - &hostContactsOut.at(0), - &hostClippingFaces.at(0), - &worldVertsB2CPU.at(0), - &nGlobalContactsOut, - vertexFaceCapacity, - newContactCapacity, - nPairs, - i); + volatile int nGlobalContactsOut = nContacts; + b3AlignedObjectArray<b3Int4> hostPairs; + pairs->copyToHost(hostPairs); + b3AlignedObjectArray<b3RigidBodyData> hostBodyBuf; + bodyBuf->copyToHost(hostBodyBuf); + b3AlignedObjectArray<b3Vector3> hostSepNormals; + m_sepNormals.copyToHost(hostSepNormals); + b3AlignedObjectArray<int> hostHasSepAxis; + m_hasSeparatingNormals.copyToHost(hostHasSepAxis); + b3AlignedObjectArray<b3Contact4> hostContactsOut; + contactOut->copyToHost(hostContactsOut); + hostContactsOut.resize(newContactCapacity); + + b3AlignedObjectArray<b3Int4> hostClippingFaces; + clippingFacesOutGPU.copyToHost(hostClippingFaces); + b3AlignedObjectArray<b3Vector3> worldVertsB2CPU; + worldVertsB2GPU.copyToHost(worldVertsB2CPU); + + for (int i = 0; i < nPairs; i++) + { + b3NewContactReductionKernel(&hostPairs.at(0), + &hostBodyBuf.at(0), + &hostSepNormals.at(0), + &hostHasSepAxis.at(0), + &hostContactsOut.at(0), + &hostClippingFaces.at(0), + &worldVertsB2CPU.at(0), + &nGlobalContactsOut, + vertexFaceCapacity, + newContactCapacity, + nPairs, + i); + } + + nContacts = nGlobalContactsOut; + m_totalContactsOut.copyFromHostPointer(&nContacts, 1, 0, true); + hostContactsOut.resize(nContacts); + //printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts); + contactOut->copyFromHost(hostContactsOut); } + // b3Contact4 pt = 
contactOut->at(0); + // printf("nContacts = %d\n",nContacts); + } + } + } + else //breakupKernel + { + if (nPairs) + { + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(pairs->getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(m_sepNormals.getBufferCL()), + b3BufferInfoCL(m_hasSeparatingNormals.getBufferCL()), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_clipHullHullKernel, "m_clipHullHullKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nPairs); + launcher.setConst(maxContactCapacity); - nContacts = nGlobalContactsOut; - m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true); - hostContactsOut.resize(nContacts); - //printf("contactOut4 (after newContactReductionKernel) = %d\n",nContacts); - contactOut->copyFromHost(hostContactsOut); + int num = nPairs; + launcher.launch1D(num); + clFinish(m_queue); + + nContacts = m_totalContactsOut.at(0); + if (nContacts >= maxContactCapacity) + { + b3Error("Exceeded contact capacity (%d/%d)\n", nContacts, maxContactCapacity); + nContacts = maxContactCapacity; } - // b3Contact4 pt = contactOut->at(0); - // printf("nContacts = %d\n",nContacts); + contactOut->resize(nContacts); } } - } - else//breakupKernel - { - if (nPairs) + int nCompoundsPairs = m_gpuCompoundPairs.size(); + + if (nCompoundsPairs) { b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( pairs->getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( 
gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( m_sepNormals.getBufferCL()), - b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - b3LauncherCL launcher(m_queue, m_clipHullHullKernel,"m_clipHullHullKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nPairs ); + b3BufferInfoCL(m_gpuCompoundPairs.getBufferCL(), true), + b3BufferInfoCL(bodyBuf->getBufferCL(), true), + b3BufferInfoCL(gpuCollidables.getBufferCL(), true), + b3BufferInfoCL(convexData.getBufferCL(), true), + b3BufferInfoCL(gpuVertices.getBufferCL(), true), + b3BufferInfoCL(gpuUniqueEdges.getBufferCL(), true), + b3BufferInfoCL(gpuFaces.getBufferCL(), true), + b3BufferInfoCL(gpuIndices.getBufferCL(), true), + b3BufferInfoCL(gpuChildShapes.getBufferCL(), true), + b3BufferInfoCL(m_gpuCompoundSepNormals.getBufferCL(), true), + b3BufferInfoCL(m_gpuHasCompoundSepNormals.getBufferCL(), true), + b3BufferInfoCL(contactOut->getBufferCL()), + b3BufferInfoCL(m_totalContactsOut.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_clipCompoundsHullHullKernel, "m_clipCompoundsHullHullKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nCompoundsPairs); launcher.setConst(maxContactCapacity); - int num = nPairs; - launcher.launch1D( num); + int num = nCompoundsPairs; + launcher.launch1D(num); clFinish(m_queue); nContacts = m_totalContactsOut.at(0); - if (nContacts >= maxContactCapacity) + if (nContacts > maxContactCapacity) { - b3Error("Exceeded contact capacity (%d/%d)\n",nContacts,maxContactCapacity); + b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); nContacts = maxContactCapacity; } contactOut->resize(nContacts); - } - } - - - int 
nCompoundsPairs = m_gpuCompoundPairs.size(); - - if (nCompoundsPairs) - { - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_gpuCompoundPairs.getBufferCL(), true ), - b3BufferInfoCL( bodyBuf->getBufferCL(),true), - b3BufferInfoCL( gpuCollidables.getBufferCL(),true), - b3BufferInfoCL( convexData.getBufferCL(),true), - b3BufferInfoCL( gpuVertices.getBufferCL(),true), - b3BufferInfoCL( gpuUniqueEdges.getBufferCL(),true), - b3BufferInfoCL( gpuFaces.getBufferCL(),true), - b3BufferInfoCL( gpuIndices.getBufferCL(),true), - b3BufferInfoCL( gpuChildShapes.getBufferCL(),true), - b3BufferInfoCL( m_gpuCompoundSepNormals.getBufferCL(),true), - b3BufferInfoCL( m_gpuHasCompoundSepNormals.getBufferCL(),true), - b3BufferInfoCL( contactOut->getBufferCL()), - b3BufferInfoCL( m_totalContactsOut.getBufferCL()) - }; - b3LauncherCL launcher(m_queue, m_clipCompoundsHullHullKernel,"m_clipCompoundsHullHullKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nCompoundsPairs ); - launcher.setConst(maxContactCapacity); - - int num = nCompoundsPairs; - launcher.launch1D( num); - clFinish(m_queue); - - nContacts = m_totalContactsOut.at(0); - if (nContacts>maxContactCapacity) - { - - b3Error("Error: contacts exceeds capacity (%d/%d)\n", nContacts, maxContactCapacity); - nContacts = maxContactCapacity; - } - contactOut->resize(nContacts); - }//if nCompoundsPairs + } //if nCompoundsPairs } - }//contactClippingOnGpu + } //contactClippingOnGpu //printf("nContacts end = %d\n",nContacts); - + //printf("frameCount = %d\n",frameCount++); } diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h index e24c1579c6..53e8c4ed4d 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h @@ -17,102 +17,90 @@ //#include 
"../../dynamics/basic_demo/Stubs/ChNarrowPhase.h" - - - struct GpuSatCollision { - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; - cl_kernel m_findSeparatingAxisKernel; - cl_kernel m_mprPenetrationKernel; - cl_kernel m_findSeparatingAxisUnitSphereKernel; - + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + cl_kernel m_findSeparatingAxisKernel; + cl_kernel m_mprPenetrationKernel; + cl_kernel m_findSeparatingAxisUnitSphereKernel; cl_kernel m_findSeparatingAxisVertexFaceKernel; cl_kernel m_findSeparatingAxisEdgeEdgeKernel; - - cl_kernel m_findConcaveSeparatingAxisKernel; - cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel; - cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel; - - - - - cl_kernel m_findCompoundPairsKernel; - cl_kernel m_processCompoundPairsKernel; - - cl_kernel m_clipHullHullKernel; - cl_kernel m_clipCompoundsHullHullKernel; - - cl_kernel m_clipFacesAndFindContacts; - cl_kernel m_findClippingFacesKernel; - - cl_kernel m_clipHullHullConcaveConvexKernel; -// cl_kernel m_extractManifoldAndAddContactKernel; - cl_kernel m_newContactReductionKernel; - - cl_kernel m_bvhTraversalKernel; - cl_kernel m_primitiveContactsKernel; - cl_kernel m_findConcaveSphereContactsKernel; - - cl_kernel m_processCompoundPairsPrimitivesKernel; - + + cl_kernel m_findConcaveSeparatingAxisKernel; + cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel; + cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel; + + cl_kernel m_findCompoundPairsKernel; + cl_kernel m_processCompoundPairsKernel; + + cl_kernel m_clipHullHullKernel; + cl_kernel m_clipCompoundsHullHullKernel; + + cl_kernel m_clipFacesAndFindContacts; + cl_kernel m_findClippingFacesKernel; + + cl_kernel m_clipHullHullConcaveConvexKernel; + // cl_kernel m_extractManifoldAndAddContactKernel; + cl_kernel m_newContactReductionKernel; + + cl_kernel m_bvhTraversalKernel; + cl_kernel m_primitiveContactsKernel; + cl_kernel m_findConcaveSphereContactsKernel; + + cl_kernel 
m_processCompoundPairsPrimitivesKernel; + b3OpenCLArray<b3Vector3> m_unitSphereDirections; - b3OpenCLArray<int> m_totalContactsOut; + b3OpenCLArray<int> m_totalContactsOut; b3OpenCLArray<b3Vector3> m_sepNormals; b3OpenCLArray<float> m_dmins; - b3OpenCLArray<int> m_hasSeparatingNormals; + b3OpenCLArray<int> m_hasSeparatingNormals; b3OpenCLArray<b3Vector3> m_concaveSepNormals; - b3OpenCLArray<int> m_concaveHasSeparatingNormals; - b3OpenCLArray<int> m_numConcavePairsOut; + b3OpenCLArray<int> m_concaveHasSeparatingNormals; + b3OpenCLArray<int> m_numConcavePairsOut; b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs; b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals; - b3OpenCLArray<int> m_gpuHasCompoundSepNormals; - b3OpenCLArray<int> m_numCompoundPairsOut; - + b3OpenCLArray<int> m_gpuHasCompoundSepNormals; + b3OpenCLArray<int> m_numCompoundPairsOut; - GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue q ); + GpuSatCollision(cl_context ctx, cl_device_id device, cl_command_queue q); virtual ~GpuSatCollision(); - - - void computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs, - const b3OpenCLArray<b3RigidBodyData>* bodyBuf, - b3OpenCLArray<b3Contact4>* contactOut, int& nContacts, - const b3OpenCLArray<b3Contact4>* oldContacts, - int maxContactCapacity, - int compoundPairCapacity, - const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData, - const b3OpenCLArray<b3Vector3>& vertices, - const b3OpenCLArray<b3Vector3>& uniqueEdges, - const b3OpenCLArray<b3GpuFace>& faces, - const b3OpenCLArray<int>& indices, - const b3OpenCLArray<b3Collidable>& gpuCollidables, - const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes, - - const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace, - const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace, - - b3OpenCLArray<b3Vector3>& worldVertsB1GPU, - b3OpenCLArray<b3Int4>& clippingFacesOutGPU, - b3OpenCLArray<b3Vector3>& worldNormalsAGPU, - b3OpenCLArray<b3Vector3>& worldVertsA1GPU, - b3OpenCLArray<b3Vector3>& 
worldVertsB2GPU, - b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData, - b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU, - b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU, - b3OpenCLArray<b3BvhInfo>* bvhInfo, - int numObjects, - int maxTriConvexPairCapacity, - b3OpenCLArray<b3Int4>& triangleConvexPairs, - int& numTriConvexPairsOut - ); - + void computeConvexConvexContactsGPUSAT(b3OpenCLArray<b3Int4>* pairs, int nPairs, + const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + b3OpenCLArray<b3Contact4>* contactOut, int& nContacts, + const b3OpenCLArray<b3Contact4>* oldContacts, + int maxContactCapacity, + int compoundPairCapacity, + const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData, + const b3OpenCLArray<b3Vector3>& vertices, + const b3OpenCLArray<b3Vector3>& uniqueEdges, + const b3OpenCLArray<b3GpuFace>& faces, + const b3OpenCLArray<int>& indices, + const b3OpenCLArray<b3Collidable>& gpuCollidables, + const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes, + + const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace, + const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace, + + b3OpenCLArray<b3Vector3>& worldVertsB1GPU, + b3OpenCLArray<b3Int4>& clippingFacesOutGPU, + b3OpenCLArray<b3Vector3>& worldNormalsAGPU, + b3OpenCLArray<b3Vector3>& worldVertsA1GPU, + b3OpenCLArray<b3Vector3>& worldVertsB2GPU, + b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData, + b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU, + b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU, + b3OpenCLArray<b3BvhInfo>* bvhInfo, + int numObjects, + int maxTriConvexPairCapacity, + b3OpenCLArray<b3Int4>& triangleConvexPairs, + int& numTriConvexPairsOut); }; -#endif //_CONVEX_HULL_CONTACT_H +#endif //_CONVEX_HULL_CONTACT_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h index 337100fb1a..c4cf700076 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h +++ 
b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3ConvexPolyhedronCL.h @@ -4,6 +4,4 @@ #include "Bullet3Common/b3Transform.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" - - -#endif //CONVEX_POLYHEDRON_CL +#endif //CONVEX_POLYHEDRON_CL diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp index d636f983c6..974b246f03 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.cpp @@ -29,902 +29,951 @@ GJK-EPA collision solver by Nathanael Presson, 2008 namespace gjkepa2_impl2 { +// Config - // Config +/* GJK */ +#define GJK_MAX_ITERATIONS 128 +#define GJK_ACCURACY ((b3Scalar)0.0001) +#define GJK_MIN_DISTANCE ((b3Scalar)0.0001) +#define GJK_DUPLICATED_EPS ((b3Scalar)0.0001) +#define GJK_SIMPLEX2_EPS ((b3Scalar)0.0) +#define GJK_SIMPLEX3_EPS ((b3Scalar)0.0) +#define GJK_SIMPLEX4_EPS ((b3Scalar)0.0) - /* GJK */ -#define GJK_MAX_ITERATIONS 128 -#define GJK_ACCURACY ((b3Scalar)0.0001) -#define GJK_MIN_DISTANCE ((b3Scalar)0.0001) -#define GJK_DUPLICATED_EPS ((b3Scalar)0.0001) -#define GJK_SIMPLEX2_EPS ((b3Scalar)0.0) -#define GJK_SIMPLEX3_EPS ((b3Scalar)0.0) -#define GJK_SIMPLEX4_EPS ((b3Scalar)0.0) +/* EPA */ +#define EPA_MAX_VERTICES 64 +#define EPA_MAX_FACES (EPA_MAX_VERTICES * 2) +#define EPA_MAX_ITERATIONS 255 +#define EPA_ACCURACY ((b3Scalar)0.0001) +#define EPA_FALLBACK (10 * EPA_ACCURACY) +#define EPA_PLANE_EPS ((b3Scalar)0.00001) +#define EPA_INSIDE_EPS ((b3Scalar)0.01) - /* EPA */ -#define EPA_MAX_VERTICES 64 -#define EPA_MAX_FACES (EPA_MAX_VERTICES*2) -#define EPA_MAX_ITERATIONS 255 -#define EPA_ACCURACY ((b3Scalar)0.0001) -#define EPA_FALLBACK (10*EPA_ACCURACY) -#define EPA_PLANE_EPS ((b3Scalar)0.00001) -#define EPA_INSIDE_EPS ((b3Scalar)0.01) +// Shorthands +// MinkowskiDiff +struct b3MinkowskiDiff +{ + const b3ConvexPolyhedronData* 
m_shapes[2]; - // Shorthands - - - // MinkowskiDiff - struct b3MinkowskiDiff - { - - - const b3ConvexPolyhedronData* m_shapes[2]; - - - b3Matrix3x3 m_toshape1; - b3Transform m_toshape0; + b3Matrix3x3 m_toshape1; + b3Transform m_toshape0; - bool m_enableMargin; - + bool m_enableMargin; - void EnableMargin(bool enable) - { - m_enableMargin = enable; - } - inline b3Vector3 Support0(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA) const + void EnableMargin(bool enable) + { + m_enableMargin = enable; + } + inline b3Vector3 Support0(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA) const + { + if (m_enableMargin) { - if (m_enableMargin) - { - return localGetSupportVertexWithMargin(d,m_shapes[0],verticesA,0.f); - } else - { - return localGetSupportVertexWithoutMargin(d,m_shapes[0],verticesA); - } + return localGetSupportVertexWithMargin(d, m_shapes[0], verticesA, 0.f); } - inline b3Vector3 Support1(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesB) const + else { - if (m_enableMargin) - { - return m_toshape0*(localGetSupportVertexWithMargin(m_toshape1*d,m_shapes[1],verticesB,0.f)); - } else - { - return m_toshape0*(localGetSupportVertexWithoutMargin(m_toshape1*d,m_shapes[1],verticesB)); - } + return localGetSupportVertexWithoutMargin(d, m_shapes[0], verticesA); } - - inline b3Vector3 Support(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB) const + } + inline b3Vector3 Support1(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesB) const + { + if (m_enableMargin) { - return(Support0(d,verticesA)-Support1(-d,verticesB)); + return m_toshape0 * (localGetSupportVertexWithMargin(m_toshape1 * d, m_shapes[1], verticesB, 0.f)); } - b3Vector3 Support(const b3Vector3& d,unsigned int index,const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB) const + else { - if(index) - 
return(Support1(d,verticesA)); - else - return(Support0(d,verticesB)); + return m_toshape0 * (localGetSupportVertexWithoutMargin(m_toshape1 * d, m_shapes[1], verticesB)); } - }; + } - typedef b3MinkowskiDiff tShape; + inline b3Vector3 Support(const b3Vector3& d, const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB) const + { + return (Support0(d, verticesA) - Support1(-d, verticesB)); + } + b3Vector3 Support(const b3Vector3& d, unsigned int index, const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB) const + { + if (index) + return (Support1(d, verticesA)); + else + return (Support0(d, verticesB)); + } +}; +typedef b3MinkowskiDiff tShape; - // GJK - struct b3GJK +// GJK +struct b3GJK +{ + /* Types */ + struct sSV { - /* Types */ - struct sSV - { - b3Vector3 d,w; - }; - struct sSimplex + b3Vector3 d, w; + }; + struct sSimplex + { + sSV* c[4]; + b3Scalar p[4]; + unsigned int rank; + }; + struct eStatus + { + enum _ { - sSV* c[4]; - b3Scalar p[4]; - unsigned int rank; - }; - struct eStatus { enum _ { Valid, Inside, - Failed };}; - /* Fields */ - tShape m_shape; - const b3AlignedObjectArray<b3Vector3>& m_verticesA; - const b3AlignedObjectArray<b3Vector3>& m_verticesB; - b3Vector3 m_ray; - b3Scalar m_distance; - sSimplex m_simplices[2]; - sSV m_store[4]; - sSV* m_free[4]; - unsigned int m_nfree; - unsigned int m_current; - sSimplex* m_simplex; - eStatus::_ m_status; - /* Methods */ - b3GJK(const b3AlignedObjectArray<b3Vector3>& verticesA,const b3AlignedObjectArray<b3Vector3>& verticesB) - :m_verticesA(verticesA),m_verticesB(verticesB) - { - Initialize(); + Failed + }; + }; + /* Fields */ + tShape m_shape; + const b3AlignedObjectArray<b3Vector3>& m_verticesA; + const b3AlignedObjectArray<b3Vector3>& m_verticesB; + b3Vector3 m_ray; + b3Scalar m_distance; + sSimplex m_simplices[2]; + sSV m_store[4]; + sSV* m_free[4]; + unsigned int m_nfree; + unsigned int m_current; + sSimplex* 
m_simplex; + eStatus::_ m_status; + /* Methods */ + b3GJK(const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB) + : m_verticesA(verticesA), m_verticesB(verticesB) + { + Initialize(); + } + void Initialize() + { + m_ray = b3MakeVector3(0, 0, 0); + m_nfree = 0; + m_status = eStatus::Failed; + m_current = 0; + m_distance = 0; + } + eStatus::_ Evaluate(const tShape& shapearg, const b3Vector3& guess) + { + unsigned int iterations = 0; + b3Scalar sqdist = 0; + b3Scalar alpha = 0; + b3Vector3 lastw[4]; + unsigned int clastw = 0; + /* Initialize solver */ + m_free[0] = &m_store[0]; + m_free[1] = &m_store[1]; + m_free[2] = &m_store[2]; + m_free[3] = &m_store[3]; + m_nfree = 4; + m_current = 0; + m_status = eStatus::Valid; + m_shape = shapearg; + m_distance = 0; + /* Initialize simplex */ + m_simplices[0].rank = 0; + m_ray = guess; + const b3Scalar sqrl = m_ray.length2(); + appendvertice(m_simplices[0], sqrl > 0 ? -m_ray : b3MakeVector3(1, 0, 0)); + m_simplices[0].p[0] = 1; + m_ray = m_simplices[0].c[0]->w; + sqdist = sqrl; + lastw[0] = + lastw[1] = + lastw[2] = + lastw[3] = m_ray; + /* Loop */ + do + { + const unsigned int next = 1 - m_current; + sSimplex& cs = m_simplices[m_current]; + sSimplex& ns = m_simplices[next]; + /* Check zero */ + const b3Scalar rl = m_ray.length(); + if (rl < GJK_MIN_DISTANCE) + { /* Touching or inside */ + m_status = eStatus::Inside; + break; } - void Initialize() + /* Append new vertice in -'v' direction */ + appendvertice(cs, -m_ray); + const b3Vector3& w = cs.c[cs.rank - 1]->w; + bool found = false; + for (unsigned int i = 0; i < 4; ++i) { - m_ray = b3MakeVector3(0,0,0); - m_nfree = 0; - m_status = eStatus::Failed; - m_current = 0; - m_distance = 0; + if ((w - lastw[i]).length2() < GJK_DUPLICATED_EPS) + { + found = true; + break; + } + } + if (found) + { /* Return old simplex */ + removevertice(m_simplices[m_current]); + break; } - eStatus::_ Evaluate(const tShape& shapearg,const b3Vector3& guess) 
+ else + { /* Update lastw */ + lastw[clastw = (clastw + 1) & 3] = w; + } + /* Check for termination */ + const b3Scalar omega = b3Dot(m_ray, w) / rl; + alpha = b3Max(omega, alpha); + if (((rl - alpha) - (GJK_ACCURACY * rl)) <= 0) + { /* Return old simplex */ + removevertice(m_simplices[m_current]); + break; + } + /* Reduce simplex */ + b3Scalar weights[4]; + unsigned int mask = 0; + switch (cs.rank) { - unsigned int iterations=0; - b3Scalar sqdist=0; - b3Scalar alpha=0; - b3Vector3 lastw[4]; - unsigned int clastw=0; - /* Initialize solver */ - m_free[0] = &m_store[0]; - m_free[1] = &m_store[1]; - m_free[2] = &m_store[2]; - m_free[3] = &m_store[3]; - m_nfree = 4; - m_current = 0; - m_status = eStatus::Valid; - m_shape = shapearg; - m_distance = 0; - /* Initialize simplex */ - m_simplices[0].rank = 0; - m_ray = guess; - const b3Scalar sqrl= m_ray.length2(); - appendvertice(m_simplices[0],sqrl>0?-m_ray:b3MakeVector3(1,0,0)); - m_simplices[0].p[0] = 1; - m_ray = m_simplices[0].c[0]->w; - sqdist = sqrl; - lastw[0] = - lastw[1] = - lastw[2] = - lastw[3] = m_ray; - /* Loop */ - do { - const unsigned int next=1-m_current; - sSimplex& cs=m_simplices[m_current]; - sSimplex& ns=m_simplices[next]; - /* Check zero */ - const b3Scalar rl=m_ray.length(); - if(rl<GJK_MIN_DISTANCE) - {/* Touching or inside */ - m_status=eStatus::Inside; - break; - } - /* Append new vertice in -'v' direction */ - appendvertice(cs,-m_ray); - const b3Vector3& w=cs.c[cs.rank-1]->w; - bool found=false; - for(unsigned int i=0;i<4;++i) + case 2: + sqdist = projectorigin(cs.c[0]->w, + cs.c[1]->w, + weights, mask); + break; + case 3: + sqdist = projectorigin(cs.c[0]->w, + cs.c[1]->w, + cs.c[2]->w, + weights, mask); + break; + case 4: + sqdist = projectorigin(cs.c[0]->w, + cs.c[1]->w, + cs.c[2]->w, + cs.c[3]->w, + weights, mask); + break; + } + if (sqdist >= 0) + { /* Valid */ + ns.rank = 0; + m_ray = b3MakeVector3(0, 0, 0); + m_current = next; + for (unsigned int i = 0, ni = cs.rank; i < ni; ++i) + { + if 
(mask & (1 << i)) { - if((w-lastw[i]).length2()<GJK_DUPLICATED_EPS) - { found=true;break; } - } - if(found) - {/* Return old simplex */ - removevertice(m_simplices[m_current]); - break; + ns.c[ns.rank] = cs.c[i]; + ns.p[ns.rank++] = weights[i]; + m_ray += cs.c[i]->w * weights[i]; } else - {/* Update lastw */ - lastw[clastw=(clastw+1)&3]=w; - } - /* Check for termination */ - const b3Scalar omega=b3Dot(m_ray,w)/rl; - alpha=b3Max(omega,alpha); - if(((rl-alpha)-(GJK_ACCURACY*rl))<=0) - {/* Return old simplex */ - removevertice(m_simplices[m_current]); - break; - } - /* Reduce simplex */ - b3Scalar weights[4]; - unsigned int mask=0; - switch(cs.rank) { - case 2: sqdist=projectorigin( cs.c[0]->w, - cs.c[1]->w, - weights,mask);break; - case 3: sqdist=projectorigin( cs.c[0]->w, - cs.c[1]->w, - cs.c[2]->w, - weights,mask);break; - case 4: sqdist=projectorigin( cs.c[0]->w, - cs.c[1]->w, - cs.c[2]->w, - cs.c[3]->w, - weights,mask);break; - } - if(sqdist>=0) - {/* Valid */ - ns.rank = 0; - m_ray = b3MakeVector3(0,0,0); - m_current = next; - for(unsigned int i=0,ni=cs.rank;i<ni;++i) - { - if(mask&(1<<i)) - { - ns.c[ns.rank] = cs.c[i]; - ns.p[ns.rank++] = weights[i]; - m_ray += cs.c[i]->w*weights[i]; - } - else - { - m_free[m_nfree++] = cs.c[i]; - } - } - if(mask==15) m_status=eStatus::Inside; + m_free[m_nfree++] = cs.c[i]; } - else - {/* Return old simplex */ - removevertice(m_simplices[m_current]); - break; - } - m_status=((++iterations)<GJK_MAX_ITERATIONS)?m_status:eStatus::Failed; - } while(m_status==eStatus::Valid); - m_simplex=&m_simplices[m_current]; - switch(m_status) + } + if (mask == 15) m_status = eStatus::Inside; + } + else + { /* Return old simplex */ + removevertice(m_simplices[m_current]); + break; + } + m_status = ((++iterations) < GJK_MAX_ITERATIONS) ? 
m_status : eStatus::Failed; + } while (m_status == eStatus::Valid); + m_simplex = &m_simplices[m_current]; + switch (m_status) + { + case eStatus::Valid: + m_distance = m_ray.length(); + break; + case eStatus::Inside: + m_distance = 0; + break; + default: + { + } + } + return (m_status); + } + bool EncloseOrigin() + { + switch (m_simplex->rank) + { + case 1: + { + for (unsigned int i = 0; i < 3; ++i) { - case eStatus::Valid: m_distance=m_ray.length();break; - case eStatus::Inside: m_distance=0;break; - default: - { - } - } - return(m_status); + b3Vector3 axis = b3MakeVector3(0, 0, 0); + axis[i] = 1; + appendvertice(*m_simplex, axis); + if (EncloseOrigin()) return (true); + removevertice(*m_simplex); + appendvertice(*m_simplex, -axis); + if (EncloseOrigin()) return (true); + removevertice(*m_simplex); + } } - bool EncloseOrigin() + break; + case 2: { - switch(m_simplex->rank) + const b3Vector3 d = m_simplex->c[1]->w - m_simplex->c[0]->w; + for (unsigned int i = 0; i < 3; ++i) { - case 1: + b3Vector3 axis = b3MakeVector3(0, 0, 0); + axis[i] = 1; + const b3Vector3 p = b3Cross(d, axis); + if (p.length2() > 0) { - for(unsigned int i=0;i<3;++i) - { - b3Vector3 axis=b3MakeVector3(0,0,0); - axis[i]=1; - appendvertice(*m_simplex, axis); - if(EncloseOrigin()) return(true); - removevertice(*m_simplex); - appendvertice(*m_simplex,-axis); - if(EncloseOrigin()) return(true); - removevertice(*m_simplex); - } + appendvertice(*m_simplex, p); + if (EncloseOrigin()) return (true); + removevertice(*m_simplex); + appendvertice(*m_simplex, -p); + if (EncloseOrigin()) return (true); + removevertice(*m_simplex); } - break; - case 2: - { - const b3Vector3 d=m_simplex->c[1]->w-m_simplex->c[0]->w; - for(unsigned int i=0;i<3;++i) - { - b3Vector3 axis=b3MakeVector3(0,0,0); - axis[i]=1; - const b3Vector3 p=b3Cross(d,axis); - if(p.length2()>0) - { - appendvertice(*m_simplex, p); - if(EncloseOrigin()) return(true); - removevertice(*m_simplex); - appendvertice(*m_simplex,-p); - if(EncloseOrigin()) 
return(true); - removevertice(*m_simplex); - } - } - } - break; - case 3: - { - const b3Vector3 n=b3Cross(m_simplex->c[1]->w-m_simplex->c[0]->w, - m_simplex->c[2]->w-m_simplex->c[0]->w); - if(n.length2()>0) - { - appendvertice(*m_simplex,n); - if(EncloseOrigin()) return(true); - removevertice(*m_simplex); - appendvertice(*m_simplex,-n); - if(EncloseOrigin()) return(true); - removevertice(*m_simplex); - } - } - break; - case 4: - { - if(b3Fabs(det( m_simplex->c[0]->w-m_simplex->c[3]->w, - m_simplex->c[1]->w-m_simplex->c[3]->w, - m_simplex->c[2]->w-m_simplex->c[3]->w))>0) - return(true); - } - break; } - return(false); } - /* Internals */ - void getsupport(const b3Vector3& d,sSV& sv) const + break; + case 3: { - sv.d = d/d.length(); - sv.w = m_shape.Support(sv.d,m_verticesA,m_verticesB); + const b3Vector3 n = b3Cross(m_simplex->c[1]->w - m_simplex->c[0]->w, + m_simplex->c[2]->w - m_simplex->c[0]->w); + if (n.length2() > 0) + { + appendvertice(*m_simplex, n); + if (EncloseOrigin()) return (true); + removevertice(*m_simplex); + appendvertice(*m_simplex, -n); + if (EncloseOrigin()) return (true); + removevertice(*m_simplex); + } } - void removevertice(sSimplex& simplex) + break; + case 4: { - m_free[m_nfree++]=simplex.c[--simplex.rank]; + if (b3Fabs(det(m_simplex->c[0]->w - m_simplex->c[3]->w, + m_simplex->c[1]->w - m_simplex->c[3]->w, + m_simplex->c[2]->w - m_simplex->c[3]->w)) > 0) + return (true); } - void appendvertice(sSimplex& simplex,const b3Vector3& v) + break; + } + return (false); + } + /* Internals */ + void getsupport(const b3Vector3& d, sSV& sv) const + { + sv.d = d / d.length(); + sv.w = m_shape.Support(sv.d, m_verticesA, m_verticesB); + } + void removevertice(sSimplex& simplex) + { + m_free[m_nfree++] = simplex.c[--simplex.rank]; + } + void appendvertice(sSimplex& simplex, const b3Vector3& v) + { + simplex.p[simplex.rank] = 0; + simplex.c[simplex.rank] = m_free[--m_nfree]; + getsupport(v, *simplex.c[simplex.rank++]); + } + static b3Scalar det(const 
b3Vector3& a, const b3Vector3& b, const b3Vector3& c) + { + return (a.y * b.z * c.x + a.z * b.x * c.y - + a.x * b.z * c.y - a.y * b.x * c.z + + a.x * b.y * c.z - a.z * b.y * c.x); + } + static b3Scalar projectorigin(const b3Vector3& a, + const b3Vector3& b, + b3Scalar* w, unsigned int& m) + { + const b3Vector3 d = b - a; + const b3Scalar l = d.length2(); + if (l > GJK_SIMPLEX2_EPS) + { + const b3Scalar t(l > 0 ? -b3Dot(a, d) / l : 0); + if (t >= 1) { - simplex.p[simplex.rank]=0; - simplex.c[simplex.rank]=m_free[--m_nfree]; - getsupport(v,*simplex.c[simplex.rank++]); + w[0] = 0; + w[1] = 1; + m = 2; + return (b.length2()); } - static b3Scalar det(const b3Vector3& a,const b3Vector3& b,const b3Vector3& c) + else if (t <= 0) { - return( a.y*b.z*c.x+a.z*b.x*c.y- - a.x*b.z*c.y-a.y*b.x*c.z+ - a.x*b.y*c.z-a.z*b.y*c.x); + w[0] = 1; + w[1] = 0; + m = 1; + return (a.length2()); } - static b3Scalar projectorigin( const b3Vector3& a, - const b3Vector3& b, - b3Scalar* w,unsigned int& m) + else { - const b3Vector3 d=b-a; - const b3Scalar l=d.length2(); - if(l>GJK_SIMPLEX2_EPS) - { - const b3Scalar t(l>0?-b3Dot(a,d)/l:0); - if(t>=1) { w[0]=0;w[1]=1;m=2;return(b.length2()); } - else if(t<=0) { w[0]=1;w[1]=0;m=1;return(a.length2()); } - else { w[0]=1-(w[1]=t);m=3;return((a+d*t).length2()); } - } - return(-1); + w[0] = 1 - (w[1] = t); + m = 3; + return ((a + d * t).length2()); } - static b3Scalar projectorigin( const b3Vector3& a, - const b3Vector3& b, - const b3Vector3& c, - b3Scalar* w,unsigned int& m) + } + return (-1); + } + static b3Scalar projectorigin(const b3Vector3& a, + const b3Vector3& b, + const b3Vector3& c, + b3Scalar* w, unsigned int& m) + { + static const unsigned int imd3[] = {1, 2, 0}; + const b3Vector3* vt[] = {&a, &b, &c}; + const b3Vector3 dl[] = {a - b, b - c, c - a}; + const b3Vector3 n = b3Cross(dl[0], dl[1]); + const b3Scalar l = n.length2(); + if (l > GJK_SIMPLEX3_EPS) + { + b3Scalar mindist = -1; + b3Scalar subw[2] = {0.f, 0.f}; + unsigned int subm(0); + 
for (unsigned int i = 0; i < 3; ++i) { - static const unsigned int imd3[]={1,2,0}; - const b3Vector3* vt[]={&a,&b,&c}; - const b3Vector3 dl[]={a-b,b-c,c-a}; - const b3Vector3 n=b3Cross(dl[0],dl[1]); - const b3Scalar l=n.length2(); - if(l>GJK_SIMPLEX3_EPS) + if (b3Dot(*vt[i], b3Cross(dl[i], n)) > 0) { - b3Scalar mindist=-1; - b3Scalar subw[2]={0.f,0.f}; - unsigned int subm(0); - for(unsigned int i=0;i<3;++i) - { - if(b3Dot(*vt[i],b3Cross(dl[i],n))>0) - { - const unsigned int j=imd3[i]; - const b3Scalar subd(projectorigin(*vt[i],*vt[j],subw,subm)); - if((mindist<0)||(subd<mindist)) - { - mindist = subd; - m = static_cast<unsigned int>(((subm&1)?1<<i:0)+((subm&2)?1<<j:0)); - w[i] = subw[0]; - w[j] = subw[1]; - w[imd3[j]] = 0; - } - } - } - if(mindist<0) + const unsigned int j = imd3[i]; + const b3Scalar subd(projectorigin(*vt[i], *vt[j], subw, subm)); + if ((mindist < 0) || (subd < mindist)) { - const b3Scalar d=b3Dot(a,n); - const b3Scalar s=b3Sqrt(l); - const b3Vector3 p=n*(d/l); - mindist = p.length2(); - m = 7; - w[0] = (b3Cross(dl[1],b-p)).length()/s; - w[1] = (b3Cross(dl[2],c-p)).length()/s; - w[2] = 1-(w[0]+w[1]); + mindist = subd; + m = static_cast<unsigned int>(((subm & 1) ? 1 << i : 0) + ((subm & 2) ? 
1 << j : 0)); + w[i] = subw[0]; + w[j] = subw[1]; + w[imd3[j]] = 0; } - return(mindist); } - return(-1); } - static b3Scalar projectorigin( const b3Vector3& a, - const b3Vector3& b, - const b3Vector3& c, - const b3Vector3& d, - b3Scalar* w,unsigned int& m) + if (mindist < 0) + { + const b3Scalar d = b3Dot(a, n); + const b3Scalar s = b3Sqrt(l); + const b3Vector3 p = n * (d / l); + mindist = p.length2(); + m = 7; + w[0] = (b3Cross(dl[1], b - p)).length() / s; + w[1] = (b3Cross(dl[2], c - p)).length() / s; + w[2] = 1 - (w[0] + w[1]); + } + return (mindist); + } + return (-1); + } + static b3Scalar projectorigin(const b3Vector3& a, + const b3Vector3& b, + const b3Vector3& c, + const b3Vector3& d, + b3Scalar* w, unsigned int& m) + { + static const unsigned int imd3[] = {1, 2, 0}; + const b3Vector3* vt[] = {&a, &b, &c, &d}; + const b3Vector3 dl[] = {a - d, b - d, c - d}; + const b3Scalar vl = det(dl[0], dl[1], dl[2]); + const bool ng = (vl * b3Dot(a, b3Cross(b - c, a - b))) <= 0; + if (ng && (b3Fabs(vl) > GJK_SIMPLEX4_EPS)) + { + b3Scalar mindist = -1; + b3Scalar subw[3] = {0.f, 0.f, 0.f}; + unsigned int subm(0); + for (unsigned int i = 0; i < 3; ++i) { - static const unsigned int imd3[]={1,2,0}; - const b3Vector3* vt[]={&a,&b,&c,&d}; - const b3Vector3 dl[]={a-d,b-d,c-d}; - const b3Scalar vl=det(dl[0],dl[1],dl[2]); - const bool ng=(vl*b3Dot(a,b3Cross(b-c,a-b)))<=0; - if(ng&&(b3Fabs(vl)>GJK_SIMPLEX4_EPS)) + const unsigned int j = imd3[i]; + const b3Scalar s = vl * b3Dot(d, b3Cross(dl[i], dl[j])); + if (s > 0) { - b3Scalar mindist=-1; - b3Scalar subw[3]={0.f,0.f,0.f}; - unsigned int subm(0); - for(unsigned int i=0;i<3;++i) + const b3Scalar subd = projectorigin(*vt[i], *vt[j], d, subw, subm); + if ((mindist < 0) || (subd < mindist)) { - const unsigned int j=imd3[i]; - const b3Scalar s=vl*b3Dot(d,b3Cross(dl[i],dl[j])); - if(s>0) - { - const b3Scalar subd=projectorigin(*vt[i],*vt[j],d,subw,subm); - if((mindist<0)||(subd<mindist)) - { - mindist = subd; - m = 
static_cast<unsigned int>((subm&1?1<<i:0)+ - (subm&2?1<<j:0)+ - (subm&4?8:0)); - w[i] = subw[0]; - w[j] = subw[1]; - w[imd3[j]] = 0; - w[3] = subw[2]; - } - } + mindist = subd; + m = static_cast<unsigned int>((subm & 1 ? 1 << i : 0) + + (subm & 2 ? 1 << j : 0) + + (subm & 4 ? 8 : 0)); + w[i] = subw[0]; + w[j] = subw[1]; + w[imd3[j]] = 0; + w[3] = subw[2]; } - if(mindist<0) - { - mindist = 0; - m = 15; - w[0] = det(c,b,d)/vl; - w[1] = det(a,c,d)/vl; - w[2] = det(b,a,d)/vl; - w[3] = 1-(w[0]+w[1]+w[2]); - } - return(mindist); } - return(-1); } - }; + if (mindist < 0) + { + mindist = 0; + m = 15; + w[0] = det(c, b, d) / vl; + w[1] = det(a, c, d) / vl; + w[2] = det(b, a, d) / vl; + w[3] = 1 - (w[0] + w[1] + w[2]); + } + return (mindist); + } + return (-1); + } +}; - // EPA - struct b3EPA +// EPA +struct b3EPA +{ + /* Types */ + typedef b3GJK::sSV sSV; + struct sFace { - /* Types */ - typedef b3GJK::sSV sSV; - struct sFace - { - b3Vector3 n; - b3Scalar d; - sSV* c[3]; - sFace* f[3]; - sFace* l[2]; - unsigned char e[3]; - unsigned char pass; - }; - struct sList - { - sFace* root; - unsigned int count; - sList() : root(0),count(0) {} - }; - struct sHorizon + b3Vector3 n; + b3Scalar d; + sSV* c[3]; + sFace* f[3]; + sFace* l[2]; + unsigned char e[3]; + unsigned char pass; + }; + struct sList + { + sFace* root; + unsigned int count; + sList() : root(0), count(0) {} + }; + struct sHorizon + { + sFace* cf; + sFace* ff; + unsigned int nf; + sHorizon() : cf(0), ff(0), nf(0) {} + }; + struct eStatus + { + enum _ { - sFace* cf; - sFace* ff; - unsigned int nf; - sHorizon() : cf(0),ff(0),nf(0) {} - }; - struct eStatus { enum _ { Valid, Touching, Degenerated, NonConvex, - InvalidHull, + InvalidHull, OutOfFaces, OutOfVertices, AccuraryReached, FallBack, - Failed };}; - /* Fields */ - eStatus::_ m_status; - b3GJK::sSimplex m_result; - b3Vector3 m_normal; - b3Scalar m_depth; - sSV m_sv_store[EPA_MAX_VERTICES]; - sFace m_fc_store[EPA_MAX_FACES]; - unsigned int m_nextsv; - sList m_hull; - 
sList m_stock; - /* Methods */ - b3EPA() - { - Initialize(); - } + Failed + }; + }; + /* Fields */ + eStatus::_ m_status; + b3GJK::sSimplex m_result; + b3Vector3 m_normal; + b3Scalar m_depth; + sSV m_sv_store[EPA_MAX_VERTICES]; + sFace m_fc_store[EPA_MAX_FACES]; + unsigned int m_nextsv; + sList m_hull; + sList m_stock; + /* Methods */ + b3EPA() + { + Initialize(); + } + static inline void bind(sFace* fa, unsigned int ea, sFace* fb, unsigned int eb) + { + fa->e[ea] = (unsigned char)eb; + fa->f[ea] = fb; + fb->e[eb] = (unsigned char)ea; + fb->f[eb] = fa; + } + static inline void append(sList& list, sFace* face) + { + face->l[0] = 0; + face->l[1] = list.root; + if (list.root) list.root->l[0] = face; + list.root = face; + ++list.count; + } + static inline void remove(sList& list, sFace* face) + { + if (face->l[1]) face->l[1]->l[0] = face->l[0]; + if (face->l[0]) face->l[0]->l[1] = face->l[1]; + if (face == list.root) list.root = face->l[1]; + --list.count; + } - static inline void bind(sFace* fa,unsigned int ea,sFace* fb,unsigned int eb) - { - fa->e[ea]=(unsigned char)eb;fa->f[ea]=fb; - fb->e[eb]=(unsigned char)ea;fb->f[eb]=fa; - } - static inline void append(sList& list,sFace* face) + void Initialize() + { + m_status = eStatus::Failed; + m_normal = b3MakeVector3(0, 0, 0); + m_depth = 0; + m_nextsv = 0; + for (unsigned int i = 0; i < EPA_MAX_FACES; ++i) + { + append(m_stock, &m_fc_store[EPA_MAX_FACES - i - 1]); + } + } + eStatus::_ Evaluate(b3GJK& gjk, const b3Vector3& guess) + { + b3GJK::sSimplex& simplex = *gjk.m_simplex; + if ((simplex.rank > 1) && gjk.EncloseOrigin()) + { + /* Clean up */ + while (m_hull.root) { - face->l[0] = 0; - face->l[1] = list.root; - if(list.root) list.root->l[0]=face; - list.root = face; - ++list.count; + sFace* f = m_hull.root; + remove(m_hull, f); + append(m_stock, f); } - static inline void remove(sList& list,sFace* face) + m_status = eStatus::Valid; + m_nextsv = 0; + /* Orient simplex */ + if (gjk.det(simplex.c[0]->w - simplex.c[3]->w, 
+ simplex.c[1]->w - simplex.c[3]->w, + simplex.c[2]->w - simplex.c[3]->w) < 0) { - if(face->l[1]) face->l[1]->l[0]=face->l[0]; - if(face->l[0]) face->l[0]->l[1]=face->l[1]; - if(face==list.root) list.root=face->l[1]; - --list.count; + b3Swap(simplex.c[0], simplex.c[1]); + b3Swap(simplex.p[0], simplex.p[1]); } - - - void Initialize() + /* Build initial hull */ + sFace* tetra[] = {newface(simplex.c[0], simplex.c[1], simplex.c[2], true), + newface(simplex.c[1], simplex.c[0], simplex.c[3], true), + newface(simplex.c[2], simplex.c[1], simplex.c[3], true), + newface(simplex.c[0], simplex.c[2], simplex.c[3], true)}; + if (m_hull.count == 4) { - m_status = eStatus::Failed; - m_normal = b3MakeVector3(0,0,0); - m_depth = 0; - m_nextsv = 0; - for(unsigned int i=0;i<EPA_MAX_FACES;++i) + sFace* best = findbest(); + sFace outer = *best; + unsigned int pass = 0; + unsigned int iterations = 0; + bind(tetra[0], 0, tetra[1], 0); + bind(tetra[0], 1, tetra[2], 0); + bind(tetra[0], 2, tetra[3], 0); + bind(tetra[1], 1, tetra[3], 2); + bind(tetra[1], 2, tetra[2], 1); + bind(tetra[2], 2, tetra[3], 1); + m_status = eStatus::Valid; + for (; iterations < EPA_MAX_ITERATIONS; ++iterations) { - append(m_stock,&m_fc_store[EPA_MAX_FACES-i-1]); - } - } - eStatus::_ Evaluate(b3GJK& gjk,const b3Vector3& guess) - { - b3GJK::sSimplex& simplex=*gjk.m_simplex; - if((simplex.rank>1)&&gjk.EncloseOrigin()) - { - - /* Clean up */ - while(m_hull.root) + if (m_nextsv < EPA_MAX_VERTICES) { - sFace* f = m_hull.root; - remove(m_hull,f); - append(m_stock,f); - } - m_status = eStatus::Valid; - m_nextsv = 0; - /* Orient simplex */ - if(gjk.det( simplex.c[0]->w-simplex.c[3]->w, - simplex.c[1]->w-simplex.c[3]->w, - simplex.c[2]->w-simplex.c[3]->w)<0) - { - b3Swap(simplex.c[0],simplex.c[1]); - b3Swap(simplex.p[0],simplex.p[1]); - } - /* Build initial hull */ - sFace* tetra[]={newface(simplex.c[0],simplex.c[1],simplex.c[2],true), - newface(simplex.c[1],simplex.c[0],simplex.c[3],true), - 
newface(simplex.c[2],simplex.c[1],simplex.c[3],true), - newface(simplex.c[0],simplex.c[2],simplex.c[3],true)}; - if(m_hull.count==4) - { - sFace* best=findbest(); - sFace outer=*best; - unsigned int pass=0; - unsigned int iterations=0; - bind(tetra[0],0,tetra[1],0); - bind(tetra[0],1,tetra[2],0); - bind(tetra[0],2,tetra[3],0); - bind(tetra[1],1,tetra[3],2); - bind(tetra[1],2,tetra[2],1); - bind(tetra[2],2,tetra[3],1); - m_status=eStatus::Valid; - for(;iterations<EPA_MAX_ITERATIONS;++iterations) + sHorizon horizon; + sSV* w = &m_sv_store[m_nextsv++]; + bool valid = true; + best->pass = (unsigned char)(++pass); + gjk.getsupport(best->n, *w); + const b3Scalar wdist = b3Dot(best->n, w->w) - best->d; + if (wdist > EPA_ACCURACY) { - if(m_nextsv<EPA_MAX_VERTICES) - { - sHorizon horizon; - sSV* w=&m_sv_store[m_nextsv++]; - bool valid=true; - best->pass = (unsigned char)(++pass); - gjk.getsupport(best->n,*w); - const b3Scalar wdist=b3Dot(best->n,w->w)-best->d; - if(wdist>EPA_ACCURACY) - { - for(unsigned int j=0;(j<3)&&valid;++j) - { - valid&=expand( pass,w, - best->f[j],best->e[j], - horizon); - } - if(valid&&(horizon.nf>=3)) - { - bind(horizon.cf,1,horizon.ff,2); - remove(m_hull,best); - append(m_stock,best); - best=findbest(); - outer=*best; - } else { - m_status=eStatus::Failed; - //m_status=eStatus::InvalidHull; - break; } - } else { m_status=eStatus::AccuraryReached;break; } - } else { m_status=eStatus::OutOfVertices;break; } + for (unsigned int j = 0; (j < 3) && valid; ++j) + { + valid &= expand(pass, w, + best->f[j], best->e[j], + horizon); + } + if (valid && (horizon.nf >= 3)) + { + bind(horizon.cf, 1, horizon.ff, 2); + remove(m_hull, best); + append(m_stock, best); + best = findbest(); + outer = *best; + } + else + { + m_status = eStatus::Failed; + //m_status=eStatus::InvalidHull; + break; + } + } + else + { + m_status = eStatus::AccuraryReached; + break; } - const b3Vector3 projection=outer.n*outer.d; - m_normal = outer.n; - m_depth = outer.d; - m_result.rank = 3; 
- m_result.c[0] = outer.c[0]; - m_result.c[1] = outer.c[1]; - m_result.c[2] = outer.c[2]; - m_result.p[0] = b3Cross( outer.c[1]->w-projection, - outer.c[2]->w-projection).length(); - m_result.p[1] = b3Cross( outer.c[2]->w-projection, - outer.c[0]->w-projection).length(); - m_result.p[2] = b3Cross( outer.c[0]->w-projection, - outer.c[1]->w-projection).length(); - const b3Scalar sum=m_result.p[0]+m_result.p[1]+m_result.p[2]; - m_result.p[0] /= sum; - m_result.p[1] /= sum; - m_result.p[2] /= sum; - return(m_status); - } - } - /* Fallback */ - m_status = eStatus::FallBack; - m_normal = -guess; - const b3Scalar nl=m_normal.length(); - if(nl>0) - m_normal = m_normal/nl; - else - m_normal = b3MakeVector3(1,0,0); - m_depth = 0; - m_result.rank=1; - m_result.c[0]=simplex.c[0]; - m_result.p[0]=1; - return(m_status); - } - bool getedgedist(sFace* face, sSV* a, sSV* b, b3Scalar& dist) - { - const b3Vector3 ba = b->w - a->w; - const b3Vector3 n_ab = b3Cross(ba, face->n); // Outward facing edge normal direction, on triangle plane - const b3Scalar a_dot_nab = b3Dot(a->w, n_ab); // Only care about the sign to determine inside/outside, so not normalization required - - if(a_dot_nab < 0) - { - // Outside of edge a->b - - const b3Scalar ba_l2 = ba.length2(); - const b3Scalar a_dot_ba = b3Dot(a->w, ba); - const b3Scalar b_dot_ba = b3Dot(b->w, ba); - - if(a_dot_ba > 0) - { - // Pick distance vertex a - dist = a->w.length(); - } - else if(b_dot_ba < 0) - { - // Pick distance vertex b - dist = b->w.length(); } else { - // Pick distance to edge a->b - const b3Scalar a_dot_b = b3Dot(a->w, b->w); - dist = b3Sqrt(b3Max((a->w.length2() * b->w.length2() - a_dot_b * a_dot_b) / ba_l2, (b3Scalar)0)); + m_status = eStatus::OutOfVertices; + break; } - - return true; } + const b3Vector3 projection = outer.n * outer.d; + m_normal = outer.n; + m_depth = outer.d; + m_result.rank = 3; + m_result.c[0] = outer.c[0]; + m_result.c[1] = outer.c[1]; + m_result.c[2] = outer.c[2]; + m_result.p[0] = 
b3Cross(outer.c[1]->w - projection, + outer.c[2]->w - projection) + .length(); + m_result.p[1] = b3Cross(outer.c[2]->w - projection, + outer.c[0]->w - projection) + .length(); + m_result.p[2] = b3Cross(outer.c[0]->w - projection, + outer.c[1]->w - projection) + .length(); + const b3Scalar sum = m_result.p[0] + m_result.p[1] + m_result.p[2]; + m_result.p[0] /= sum; + m_result.p[1] /= sum; + m_result.p[2] /= sum; + return (m_status); + } + } + /* Fallback */ + m_status = eStatus::FallBack; + m_normal = -guess; + const b3Scalar nl = m_normal.length(); + if (nl > 0) + m_normal = m_normal / nl; + else + m_normal = b3MakeVector3(1, 0, 0); + m_depth = 0; + m_result.rank = 1; + m_result.c[0] = simplex.c[0]; + m_result.p[0] = 1; + return (m_status); + } + bool getedgedist(sFace* face, sSV* a, sSV* b, b3Scalar& dist) + { + const b3Vector3 ba = b->w - a->w; + const b3Vector3 n_ab = b3Cross(ba, face->n); // Outward facing edge normal direction, on triangle plane + const b3Scalar a_dot_nab = b3Dot(a->w, n_ab); // Only care about the sign to determine inside/outside, so not normalization required + + if (a_dot_nab < 0) + { + // Outside of edge a->b + + const b3Scalar ba_l2 = ba.length2(); + const b3Scalar a_dot_ba = b3Dot(a->w, ba); + const b3Scalar b_dot_ba = b3Dot(b->w, ba); - return false; + if (a_dot_ba > 0) + { + // Pick distance vertex a + dist = a->w.length(); } - sFace* newface(sSV* a,sSV* b,sSV* c,bool forced) + else if (b_dot_ba < 0) { - if(m_stock.root) - { - sFace* face=m_stock.root; - remove(m_stock,face); - append(m_hull,face); - face->pass = 0; - face->c[0] = a; - face->c[1] = b; - face->c[2] = c; - face->n = b3Cross(b->w-a->w,c->w-a->w); - const b3Scalar l=face->n.length(); - const bool v=l>EPA_ACCURACY; - - if(v) - { - if(!(getedgedist(face, a, b, face->d) || - getedgedist(face, b, c, face->d) || - getedgedist(face, c, a, face->d))) - { - // Origin projects to the interior of the triangle - // Use distance to triangle plane - face->d = b3Dot(a->w, face->n) / l; 
- } + // Pick distance vertex b + dist = b->w.length(); + } + else + { + // Pick distance to edge a->b + const b3Scalar a_dot_b = b3Dot(a->w, b->w); + dist = b3Sqrt(b3Max((a->w.length2() * b->w.length2() - a_dot_b * a_dot_b) / ba_l2, (b3Scalar)0)); + } - face->n /= l; - if(forced || (face->d >= -EPA_PLANE_EPS)) - { - return face; - } - else - m_status=eStatus::NonConvex; - } - else - m_status=eStatus::Degenerated; + return true; + } - remove(m_hull, face); - append(m_stock, face); - return 0; + return false; + } + sFace* newface(sSV* a, sSV* b, sSV* c, bool forced) + { + if (m_stock.root) + { + sFace* face = m_stock.root; + remove(m_stock, face); + append(m_hull, face); + face->pass = 0; + face->c[0] = a; + face->c[1] = b; + face->c[2] = c; + face->n = b3Cross(b->w - a->w, c->w - a->w); + const b3Scalar l = face->n.length(); + const bool v = l > EPA_ACCURACY; + if (v) + { + if (!(getedgedist(face, a, b, face->d) || + getedgedist(face, b, c, face->d) || + getedgedist(face, c, a, face->d))) + { + // Origin projects to the interior of the triangle + // Use distance to triangle plane + face->d = b3Dot(a->w, face->n) / l; } - m_status = m_stock.root ? eStatus::OutOfVertices : eStatus::OutOfFaces; - return 0; + + face->n /= l; + if (forced || (face->d >= -EPA_PLANE_EPS)) + { + return face; + } + else + m_status = eStatus::NonConvex; } - sFace* findbest() + else + m_status = eStatus::Degenerated; + + remove(m_hull, face); + append(m_stock, face); + return 0; + } + m_status = m_stock.root ? 
eStatus::OutOfVertices : eStatus::OutOfFaces; + return 0; + } + sFace* findbest() + { + sFace* minf = m_hull.root; + b3Scalar mind = minf->d * minf->d; + for (sFace* f = minf->l[1]; f; f = f->l[1]) + { + const b3Scalar sqd = f->d * f->d; + if (sqd < mind) { - sFace* minf=m_hull.root; - b3Scalar mind=minf->d*minf->d; - for(sFace* f=minf->l[1];f;f=f->l[1]) + minf = f; + mind = sqd; + } + } + return (minf); + } + bool expand(unsigned int pass, sSV* w, sFace* f, unsigned int e, sHorizon& horizon) + { + static const unsigned int i1m3[] = {1, 2, 0}; + static const unsigned int i2m3[] = {2, 0, 1}; + if (f->pass != pass) + { + const unsigned int e1 = i1m3[e]; + if ((b3Dot(f->n, w->w) - f->d) < -EPA_PLANE_EPS) + { + sFace* nf = newface(f->c[e1], f->c[e], w, false); + if (nf) { - const b3Scalar sqd=f->d*f->d; - if(sqd<mind) - { - minf=f; - mind=sqd; - } + bind(nf, 0, f, e); + if (horizon.cf) + bind(horizon.cf, 1, nf, 2); + else + horizon.ff = nf; + horizon.cf = nf; + ++horizon.nf; + return (true); } - return(minf); } - bool expand(unsigned int pass,sSV* w,sFace* f,unsigned int e,sHorizon& horizon) + else { - static const unsigned int i1m3[]={1,2,0}; - static const unsigned int i2m3[]={2,0,1}; - if(f->pass!=pass) + const unsigned int e2 = i2m3[e]; + f->pass = (unsigned char)pass; + if (expand(pass, w, f->f[e1], f->e[e1], horizon) && + expand(pass, w, f->f[e2], f->e[e2], horizon)) { - const unsigned int e1=i1m3[e]; - if((b3Dot(f->n,w->w)-f->d)<-EPA_PLANE_EPS) - { - sFace* nf=newface(f->c[e1],f->c[e],w,false); - if(nf) - { - bind(nf,0,f,e); - if(horizon.cf) bind(horizon.cf,1,nf,2); else horizon.ff=nf; - horizon.cf=nf; - ++horizon.nf; - return(true); - } - } - else - { - const unsigned int e2=i2m3[e]; - f->pass = (unsigned char)pass; - if( expand(pass,w,f->f[e1],f->e[e1],horizon)&& - expand(pass,w,f->f[e2],f->e[e2],horizon)) - { - remove(m_hull,f); - append(m_stock,f); - return(true); - } - } + remove(m_hull, f); + append(m_stock, f); + return (true); } - return(false); } - - }; 
- - // - static void Initialize(const b3Transform& transA, const b3Transform& transB, - const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, - const b3AlignedObjectArray<b3Vector3>& verticesA, - const b3AlignedObjectArray<b3Vector3>& verticesB, - b3GjkEpaSolver2::sResults& results, - tShape& shape, - bool withmargins) - { - /* Results */ - results.witnesses[0] = - results.witnesses[1] = b3MakeVector3(0,0,0); - results.status = b3GjkEpaSolver2::sResults::Separated; - /* Shape */ - shape.m_shapes[0] = hullA; - shape.m_shapes[1] = hullB; - shape.m_toshape1 = transB.getBasis().transposeTimes(transA.getBasis()); - shape.m_toshape0 = transA.inverseTimes(transB); - shape.EnableMargin(withmargins); + } + return (false); } +}; +// +static void Initialize(const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + b3GjkEpaSolver2::sResults& results, + tShape& shape, + bool withmargins) +{ + /* Results */ + results.witnesses[0] = + results.witnesses[1] = b3MakeVector3(0, 0, 0); + results.status = b3GjkEpaSolver2::sResults::Separated; + /* Shape */ + shape.m_shapes[0] = hullA; + shape.m_shapes[1] = hullB; + shape.m_toshape1 = transB.getBasis().transposeTimes(transA.getBasis()); + shape.m_toshape0 = transA.inverseTimes(transB); + shape.EnableMargin(withmargins); } +} // namespace gjkepa2_impl2 + // // Api // -using namespace gjkepa2_impl2; +using namespace gjkepa2_impl2; // -int b3GjkEpaSolver2::StackSizeRequirement() +int b3GjkEpaSolver2::StackSizeRequirement() { - return(sizeof(b3GJK)+sizeof(b3EPA)); + return (sizeof(b3GJK) + sizeof(b3EPA)); } // -bool b3GjkEpaSolver2::Distance( const b3Transform& transA, const b3Transform& transB, - const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, - const b3AlignedObjectArray<b3Vector3>& verticesA, - const 
b3AlignedObjectArray<b3Vector3>& verticesB, - const b3Vector3& guess, - sResults& results) +bool b3GjkEpaSolver2::Distance(const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3Vector3& guess, + sResults& results) { - tShape shape; - Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,shape,false); - b3GJK gjk(verticesA,verticesB); - b3GJK::eStatus::_ gjk_status=gjk.Evaluate(shape,guess); - if(gjk_status==b3GJK::eStatus::Valid) + tShape shape; + Initialize(transA, transB, hullA, hullB, verticesA, verticesB, results, shape, false); + b3GJK gjk(verticesA, verticesB); + b3GJK::eStatus::_ gjk_status = gjk.Evaluate(shape, guess); + if (gjk_status == b3GJK::eStatus::Valid) { - b3Vector3 w0=b3MakeVector3(0,0,0); - b3Vector3 w1=b3MakeVector3(0,0,0); - for(unsigned int i=0;i<gjk.m_simplex->rank;++i) + b3Vector3 w0 = b3MakeVector3(0, 0, 0); + b3Vector3 w1 = b3MakeVector3(0, 0, 0); + for (unsigned int i = 0; i < gjk.m_simplex->rank; ++i) { - const b3Scalar p=gjk.m_simplex->p[i]; - w0+=shape.Support( gjk.m_simplex->c[i]->d,0,verticesA,verticesB)*p; - w1+=shape.Support(-gjk.m_simplex->c[i]->d,1,verticesA,verticesB)*p; + const b3Scalar p = gjk.m_simplex->p[i]; + w0 += shape.Support(gjk.m_simplex->c[i]->d, 0, verticesA, verticesB) * p; + w1 += shape.Support(-gjk.m_simplex->c[i]->d, 1, verticesA, verticesB) * p; } - results.witnesses[0] = transA*w0; - results.witnesses[1] = transA*w1; - results.normal = w0-w1; - results.distance = results.normal.length(); - results.normal /= results.distance>GJK_MIN_DISTANCE?results.distance:1; - return(true); + results.witnesses[0] = transA * w0; + results.witnesses[1] = transA * w1; + results.normal = w0 - w1; + results.distance = results.normal.length(); + results.normal /= results.distance > GJK_MIN_DISTANCE ? 
results.distance : 1; + return (true); } else { - results.status = gjk_status==b3GJK::eStatus::Inside? - sResults::Penetrating : - sResults::GJK_Failed ; - return(false); + results.status = gjk_status == b3GJK::eStatus::Inside ? sResults::Penetrating : sResults::GJK_Failed; + return (false); } } // -bool b3GjkEpaSolver2::Penetration( const b3Transform& transA, const b3Transform& transB, - const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, - const b3AlignedObjectArray<b3Vector3>& verticesA, - const b3AlignedObjectArray<b3Vector3>& verticesB, - const b3Vector3& guess, - sResults& results, - bool usemargins) +bool b3GjkEpaSolver2::Penetration(const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3Vector3& guess, + sResults& results, + bool usemargins) { - - tShape shape; - Initialize(transA,transB,hullA,hullB,verticesA,verticesB,results,shape,usemargins); - b3GJK gjk(verticesA,verticesB); - b3GJK::eStatus::_ gjk_status=gjk.Evaluate(shape,guess); - switch(gjk_status) + tShape shape; + Initialize(transA, transB, hullA, hullB, verticesA, verticesB, results, shape, usemargins); + b3GJK gjk(verticesA, verticesB); + b3GJK::eStatus::_ gjk_status = gjk.Evaluate(shape, guess); + switch (gjk_status) { - case b3GJK::eStatus::Inside: + case b3GJK::eStatus::Inside: { - b3EPA epa; - b3EPA::eStatus::_ epa_status=epa.Evaluate(gjk,-guess); - if(epa_status!=b3EPA::eStatus::Failed) + b3EPA epa; + b3EPA::eStatus::_ epa_status = epa.Evaluate(gjk, -guess); + if (epa_status != b3EPA::eStatus::Failed) { - b3Vector3 w0=b3MakeVector3(0,0,0); - for(unsigned int i=0;i<epa.m_result.rank;++i) + b3Vector3 w0 = b3MakeVector3(0, 0, 0); + for (unsigned int i = 0; i < epa.m_result.rank; ++i) { - w0+=shape.Support(epa.m_result.c[i]->d,0,verticesA,verticesB)*epa.m_result.p[i]; + w0 += 
shape.Support(epa.m_result.c[i]->d, 0, verticesA, verticesB) * epa.m_result.p[i]; } - results.status = sResults::Penetrating; - results.witnesses[0] = transA*w0; - results.witnesses[1] = transA*(w0-epa.m_normal*epa.m_depth); - results.normal = -epa.m_normal; - results.distance = -epa.m_depth; - return(true); - } else results.status=sResults::EPA_Failed; + results.status = sResults::Penetrating; + results.witnesses[0] = transA * w0; + results.witnesses[1] = transA * (w0 - epa.m_normal * epa.m_depth); + results.normal = -epa.m_normal; + results.distance = -epa.m_depth; + return (true); + } + else + results.status = sResults::EPA_Failed; } break; - case b3GJK::eStatus::Failed: - results.status=sResults::GJK_Failed; - break; + case b3GJK::eStatus::Failed: + results.status = sResults::GJK_Failed; + break; default: - { - } + { + } } - return(false); + return (false); } - #if 0 // b3Scalar b3GjkEpaSolver2::SignedDistance(const b3Vector3& position, @@ -994,8 +1043,7 @@ bool b3GjkEpaSolver2::SignedDistance(const btConvexShape* shape0, } #endif - -/* Symbols cleanup */ +/* Symbols cleanup */ #undef GJK_MAX_ITERATIONS #undef GJK_ACCURACY diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h index 976238a04c..7db32c6309 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3GjkEpa.h @@ -29,40 +29,39 @@ GJK-EPA collision solver by Nathanael Presson, 2008 #include "Bullet3Common/b3Transform.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" - ///btGjkEpaSolver contributed under zlib by Nathanael Presson -struct b3GjkEpaSolver2 +struct b3GjkEpaSolver2 { -struct sResults + struct sResults { - enum eStatus + enum eStatus { - Separated, /* Shapes doesnt penetrate */ - Penetrating, /* Shapes are penetrating */ - GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 
'touching' */ - EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */ - } status; - b3Vector3 witnesses[2]; - b3Vector3 normal; - b3Scalar distance; + Separated, /* Shapes doesnt penetrate */ + Penetrating, /* Shapes are penetrating */ + GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */ + EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */ + } status; + b3Vector3 witnesses[2]; + b3Vector3 normal; + b3Scalar distance; }; -static int StackSizeRequirement(); + static int StackSizeRequirement(); -static bool Distance( const b3Transform& transA, const b3Transform& transB, - const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, - const b3AlignedObjectArray<b3Vector3>& verticesA, - const b3AlignedObjectArray<b3Vector3>& verticesB, - const b3Vector3& guess, - sResults& results); + static bool Distance(const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + const b3AlignedObjectArray<b3Vector3>& verticesA, + const b3AlignedObjectArray<b3Vector3>& verticesB, + const b3Vector3& guess, + sResults& results); -static bool Penetration( const b3Transform& transA, const b3Transform& transB, - const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, + static bool Penetration(const b3Transform& transA, const b3Transform& transB, + const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB, const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB, const b3Vector3& guess, sResults& results, - bool usemargins=true); + bool usemargins = true); #if 0 static b3Scalar SignedDistance( const b3Vector3& position, b3Scalar margin, @@ -74,9 +73,7 @@ static bool SignedDistance( const btConvexShape* shape0,const btTransform& wtrs const btConvexShape* shape1,const btTransform& wtrs1, const b3Vector3& guess, sResults& results); -#endif - 
+#endif }; -#endif //B3_GJK_EPA2_H - +#endif //B3_GJK_EPA2_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp index e9e51d5a36..6f2c5251a0 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp @@ -13,50 +13,45 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #include "b3OptimizedBvh.h" #include "b3StridingMeshInterface.h" #include "Bullet3Geometry/b3AabbUtil.h" - b3OptimizedBvh::b3OptimizedBvh() -{ +{ } b3OptimizedBvh::~b3OptimizedBvh() { } - void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax) { m_useQuantization = useQuantizedAabbCompression; - // NodeArray triangleNodes; - struct NodeTriangleCallback : public b3InternalTriangleIndexCallback + struct NodeTriangleCallback : public b3InternalTriangleIndexCallback { - - NodeArray& m_triangleNodes; + NodeArray& m_triangleNodes; NodeTriangleCallback& operator=(NodeTriangleCallback& other) { m_triangleNodes.copyFromArray(other.m_triangleNodes); return *this; } - - NodeTriangleCallback(NodeArray& triangleNodes) - :m_triangleNodes(triangleNodes) + + NodeTriangleCallback(NodeArray& triangleNodes) + : m_triangleNodes(triangleNodes) { } - virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) + virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) { b3OptimizedBvhNode node; - b3Vector3 aabbMin,aabbMax; - aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); - aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + b3Vector3 aabbMin, aabbMax; + 
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT)); aabbMin.setMin(triangle[0]); aabbMax.setMax(triangle[0]); aabbMin.setMin(triangle[1]); @@ -69,17 +64,17 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized node.m_aabbMaxOrg = aabbMax; node.m_escapeIndex = -1; - + //for child nodes node.m_subPart = partId; node.m_triangleIndex = triangleIndex; m_triangleNodes.push_back(node); } }; - struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback + struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback { - QuantizedNodeArray& m_triangleNodes; - const b3QuantizedBvh* m_optimizedTree; // for quantization + QuantizedNodeArray& m_triangleNodes; + const b3QuantizedBvh* m_optimizedTree; // for quantization QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other) { @@ -88,23 +83,23 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized return *this; } - QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes,const b3QuantizedBvh* tree) - :m_triangleNodes(triangleNodes),m_optimizedTree(tree) + QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes, const b3QuantizedBvh* tree) + : m_triangleNodes(triangleNodes), m_optimizedTree(tree) { } - virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) + virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) { // The partId and triangle index must fit in the same (positive) integer - b3Assert(partId < (1<<MAX_NUM_PARTS_IN_BITS)); - b3Assert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS))); + b3Assert(partId < (1 << MAX_NUM_PARTS_IN_BITS)); + b3Assert(triangleIndex < (1 << (31 - MAX_NUM_PARTS_IN_BITS))); //negative indices are reserved for escapeIndex - b3Assert(triangleIndex>=0); + 
b3Assert(triangleIndex >= 0); b3QuantizedBvhNode node; - b3Vector3 aabbMin,aabbMax; - aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); - aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + b3Vector3 aabbMin, aabbMax; + aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT)); aabbMin.setMin(triangle[0]); aabbMax.setMax(triangle[0]); aabbMin.setMin(triangle[1]); @@ -131,59 +126,52 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION); } - m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0); - m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1); + m_optimizedTree->quantize(&node.m_quantizedAabbMin[0], aabbMin, 0); + m_optimizedTree->quantize(&node.m_quantizedAabbMax[0], aabbMax, 1); - node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex; + node.m_escapeIndexOrTriangleIndex = (partId << (31 - MAX_NUM_PARTS_IN_BITS)) | triangleIndex; m_triangleNodes.push_back(node); } }; - - int numLeafNodes = 0; - if (m_useQuantization) { - //initialize quantization values - setQuantizationValues(bvhAabbMin,bvhAabbMax); + setQuantizationValues(bvhAabbMin, bvhAabbMax); - QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes,this); + QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes, this); - - triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax); + triangles->InternalProcessAllTriangles(&callback, m_bvhAabbMin, m_bvhAabbMax); //now we have an array of leafnodes in m_leafNodes numLeafNodes = m_quantizedLeafNodes.size(); - - m_quantizedContiguousNodes.resize(2*numLeafNodes); - - - } else + m_quantizedContiguousNodes.resize(2 * numLeafNodes); + } + else { - NodeTriangleCallback 
callback(m_leafNodes); + NodeTriangleCallback callback(m_leafNodes); - b3Vector3 aabbMin=b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); - b3Vector3 aabbMax=b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + b3Vector3 aabbMin = b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT)); + b3Vector3 aabbMax = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT)); - triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax); + triangles->InternalProcessAllTriangles(&callback, aabbMin, aabbMax); //now we have an array of leafnodes in m_leafNodes numLeafNodes = m_leafNodes.size(); - m_contiguousNodes.resize(2*numLeafNodes); + m_contiguousNodes.resize(2 * numLeafNodes); } m_curNodeIndex = 0; - buildTree(0,numLeafNodes); + buildTree(0, numLeafNodes); ///if the entire tree is small then subtree size, we need to create a header info for the tree - if(m_useQuantization && !m_SubtreeHeaders.size()) + if (m_useQuantization && !m_SubtreeHeaders.size()) { b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]); @@ -199,37 +187,29 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized m_leafNodes.clear(); } - - - -void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax) +void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax) { if (m_useQuantization) { + setQuantizationValues(aabbMin, aabbMax); - setQuantizationValues(aabbMin,aabbMax); - - updateBvhNodes(meshInterface,0,m_curNodeIndex,0); + updateBvhNodes(meshInterface, 0, m_curNodeIndex, 0); ///now update all subtree headers int i; - for (i=0;i<m_SubtreeHeaders.size();i++) + for (i = 0; i < m_SubtreeHeaders.size(); i++) { b3BvhSubtreeInfo& subtree = 
m_SubtreeHeaders[i]; subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]); } - - } else + } + else { - } } - - - -void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax) +void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax) { //incrementally initialize quantization values b3Assert(m_useQuantization); @@ -244,147 +224,135 @@ void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b ///we should update all quantization values, using updateBvhNodes(meshInterface); ///but we only update chunks that overlap the given aabb - - unsigned short quantizedQueryAabbMin[3]; - unsigned short quantizedQueryAabbMax[3]; - quantize(&quantizedQueryAabbMin[0],aabbMin,0); - quantize(&quantizedQueryAabbMax[0],aabbMax,1); + unsigned short quantizedQueryAabbMin[3]; + unsigned short quantizedQueryAabbMax[3]; + + quantize(&quantizedQueryAabbMin[0], aabbMin, 0); + quantize(&quantizedQueryAabbMax[0], aabbMax, 1); int i; - for (i=0;i<this->m_SubtreeHeaders.size();i++) + for (i = 0; i < this->m_SubtreeHeaders.size(); i++) { b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i]; //PCK: unsigned instead of bool - unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax); + unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, subtree.m_quantizedAabbMin, subtree.m_quantizedAabbMax); if (overlap != 0) { - updateBvhNodes(meshInterface,subtree.m_rootNodeIndex,subtree.m_rootNodeIndex+subtree.m_subtreeSize,i); + updateBvhNodes(meshInterface, subtree.m_rootNodeIndex, subtree.m_rootNodeIndex + subtree.m_subtreeSize, i); subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]); } } - } -void 
b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index) +void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface, int firstNode, int endNode, int index) { (void)index; b3Assert(m_useQuantization); - int curNodeSubPart=-1; + int curNodeSubPart = -1; //get access info to trianglemesh data - const unsigned char *vertexbase = 0; - int numverts = 0; - PHY_ScalarType type = PHY_INTEGER; - int stride = 0; - const unsigned char *indexbase = 0; - int indexstride = 0; - int numfaces = 0; - PHY_ScalarType indicestype = PHY_INTEGER; - - b3Vector3 triangleVerts[3]; - b3Vector3 aabbMin,aabbMax; - const b3Vector3& meshScaling = meshInterface->getScaling(); - - int i; - for (i=endNode-1;i>=firstNode;i--) + const unsigned char* vertexbase = 0; + int numverts = 0; + PHY_ScalarType type = PHY_INTEGER; + int stride = 0; + const unsigned char* indexbase = 0; + int indexstride = 0; + int numfaces = 0; + PHY_ScalarType indicestype = PHY_INTEGER; + + b3Vector3 triangleVerts[3]; + b3Vector3 aabbMin, aabbMax; + const b3Vector3& meshScaling = meshInterface->getScaling(); + + int i; + for (i = endNode - 1; i >= firstNode; i--) + { + b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i]; + if (curNode.isLeafNode()) { + //recalc aabb from triangle data + int nodeSubPart = curNode.getPartId(); + int nodeTriangleIndex = curNode.getTriangleIndex(); + if (nodeSubPart != curNodeSubPart) + { + if (curNodeSubPart >= 0) + meshInterface->unLockReadOnlyVertexBase(curNodeSubPart); + meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numfaces, indicestype, nodeSubPart); + curNodeSubPart = nodeSubPart; + b3Assert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT); + } + //triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts, - b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i]; - if (curNode.isLeafNode()) + unsigned int* gfxbase = (unsigned 
int*)(indexbase + nodeTriangleIndex * indexstride); + + for (int j = 2; j >= 0; j--) { - //recalc aabb from triangle data - int nodeSubPart = curNode.getPartId(); - int nodeTriangleIndex = curNode.getTriangleIndex(); - if (nodeSubPart != curNodeSubPart) + int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j]; + if (type == PHY_FLOAT) { - if (curNodeSubPart >= 0) - meshInterface->unLockReadOnlyVertexBase(curNodeSubPart); - meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase,numverts, type,stride,&indexbase,indexstride,numfaces,indicestype,nodeSubPart); - - curNodeSubPart = nodeSubPart; - b3Assert(indicestype==PHY_INTEGER||indicestype==PHY_SHORT); + float* graphicsbase = (float*)(vertexbase + graphicsindex * stride); + triangleVerts[j] = b3MakeVector3( + graphicsbase[0] * meshScaling.getX(), + graphicsbase[1] * meshScaling.getY(), + graphicsbase[2] * meshScaling.getZ()); } - //triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts, - - unsigned int* gfxbase = (unsigned int*)(indexbase+nodeTriangleIndex*indexstride); - - - for (int j=2;j>=0;j--) + else { - - int graphicsindex = indicestype==PHY_SHORT?((unsigned short*)gfxbase)[j]:gfxbase[j]; - if (type == PHY_FLOAT) - { - float* graphicsbase = (float*)(vertexbase+graphicsindex*stride); - triangleVerts[j] = b3MakeVector3( - graphicsbase[0]*meshScaling.getX(), - graphicsbase[1]*meshScaling.getY(), - graphicsbase[2]*meshScaling.getZ()); - } - else - { - double* graphicsbase = (double*)(vertexbase+graphicsindex*stride); - triangleVerts[j] = b3MakeVector3( b3Scalar(graphicsbase[0]*meshScaling.getX()), b3Scalar(graphicsbase[1]*meshScaling.getY()), b3Scalar(graphicsbase[2]*meshScaling.getZ())); - } + double* graphicsbase = (double*)(vertexbase + graphicsindex * stride); + triangleVerts[j] = b3MakeVector3(b3Scalar(graphicsbase[0] * meshScaling.getX()), b3Scalar(graphicsbase[1] * meshScaling.getY()), b3Scalar(graphicsbase[2] * meshScaling.getZ())); } + } + 
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT)); + aabbMin.setMin(triangleVerts[0]); + aabbMax.setMax(triangleVerts[0]); + aabbMin.setMin(triangleVerts[1]); + aabbMax.setMax(triangleVerts[1]); + aabbMin.setMin(triangleVerts[2]); + aabbMax.setMax(triangleVerts[2]); + + quantize(&curNode.m_quantizedAabbMin[0], aabbMin, 0); + quantize(&curNode.m_quantizedAabbMax[0], aabbMax, 1); + } + else + { + //combine aabb from both children - - aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); - aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); - aabbMin.setMin(triangleVerts[0]); - aabbMax.setMax(triangleVerts[0]); - aabbMin.setMin(triangleVerts[1]); - aabbMax.setMax(triangleVerts[1]); - aabbMin.setMin(triangleVerts[2]); - aabbMax.setMax(triangleVerts[2]); - - quantize(&curNode.m_quantizedAabbMin[0],aabbMin,0); - quantize(&curNode.m_quantizedAabbMax[0],aabbMax,1); - - } else - { - //combine aabb from both children + b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i + 1]; - b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i+1]; - - b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i+2] : - &m_quantizedContiguousNodes[i+1+leftChildNode->getEscapeIndex()]; - + b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? 
&m_quantizedContiguousNodes[i + 2] : &m_quantizedContiguousNodes[i + 1 + leftChildNode->getEscapeIndex()]; + { + for (int i = 0; i < 3; i++) { - for (int i=0;i<3;i++) - { - curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i]; - if (curNode.m_quantizedAabbMin[i]>rightChildNode->m_quantizedAabbMin[i]) - curNode.m_quantizedAabbMin[i]=rightChildNode->m_quantizedAabbMin[i]; - - curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i]; - if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i]) - curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i]; - } + curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i]; + if (curNode.m_quantizedAabbMin[i] > rightChildNode->m_quantizedAabbMin[i]) + curNode.m_quantizedAabbMin[i] = rightChildNode->m_quantizedAabbMin[i]; + + curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i]; + if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i]) + curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i]; } } - } + } - if (curNodeSubPart >= 0) - meshInterface->unLockReadOnlyVertexBase(curNodeSubPart); - - + if (curNodeSubPart >= 0) + meshInterface->unLockReadOnlyVertexBase(curNodeSubPart); } ///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place' -b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian) +b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian) { - b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer,i_dataBufferSize,i_swapEndian); - + b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer, i_dataBufferSize, i_swapEndian); + //we don't add additional data so just do a static upcast return static_cast<b3OptimizedBvh*>(bvh); } diff --git 
a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h index 0272ef83bf..1286552939 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.h @@ -22,44 +22,35 @@ subject to the following restrictions: class b3StridingMeshInterface; - ///The b3OptimizedBvh extends the b3QuantizedBvh to create AABB tree for triangle meshes, through the b3StridingMeshInterface. -B3_ATTRIBUTE_ALIGNED16(class) b3OptimizedBvh : public b3QuantizedBvh +B3_ATTRIBUTE_ALIGNED16(class) +b3OptimizedBvh : public b3QuantizedBvh { - public: B3_DECLARE_ALIGNED_ALLOCATOR(); protected: - public: - b3OptimizedBvh(); virtual ~b3OptimizedBvh(); - void build(b3StridingMeshInterface* triangles,bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax); + void build(b3StridingMeshInterface * triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax); - void refit(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin,const b3Vector3& aabbMax); + void refit(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax); - void refitPartial(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin, const b3Vector3& aabbMax); + void refitPartial(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax); - void updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index); + void updateBvhNodes(b3StridingMeshInterface * meshInterface, int firstNode, int endNode, int index); /// Data buffer MUST be 16 byte aligned - virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const + virtual bool serializeInPlace(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const { - return 
b3QuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian); - + return b3QuantizedBvh::serialize(o_alignedDataBuffer, i_dataBufferSize, i_swapEndian); } ///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place' - static b3OptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian); - - + static b3OptimizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian); }; - -#endif //B3_OPTIMIZED_BVH_H - - +#endif //B3_OPTIMIZED_BVH_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp index 52027e1118..9a448495f3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.cpp @@ -17,46 +17,40 @@ subject to the following restrictions: #include "Bullet3Geometry/b3AabbUtil.h" - #define RAYAABB2 -b3QuantizedBvh::b3QuantizedBvh() : - m_bulletVersion(B3_BULLET_VERSION), - m_useQuantization(false), - m_traversalMode(TRAVERSAL_STACKLESS_CACHE_FRIENDLY) - //m_traversalMode(TRAVERSAL_STACKLESS) - //m_traversalMode(TRAVERSAL_RECURSIVE) - ,m_subtreeHeaderCount(0) //PCK: add this line +b3QuantizedBvh::b3QuantizedBvh() : m_bulletVersion(B3_BULLET_VERSION), + m_useQuantization(false), + m_traversalMode(TRAVERSAL_STACKLESS_CACHE_FRIENDLY) + //m_traversalMode(TRAVERSAL_STACKLESS) + //m_traversalMode(TRAVERSAL_RECURSIVE) + , + m_subtreeHeaderCount(0) //PCK: add this line { - m_bvhAabbMin.setValue(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY); - m_bvhAabbMax.setValue(B3_INFINITY,B3_INFINITY,B3_INFINITY); + m_bvhAabbMin.setValue(-B3_INFINITY, -B3_INFINITY, -B3_INFINITY); + m_bvhAabbMax.setValue(B3_INFINITY, B3_INFINITY, B3_INFINITY); } - - - - void b3QuantizedBvh::buildInternal() { ///assumes that caller filled in the m_quantizedLeafNodes m_useQuantization = 
true; int numLeafNodes = 0; - + if (m_useQuantization) { //now we have an array of leafnodes in m_leafNodes numLeafNodes = m_quantizedLeafNodes.size(); - m_quantizedContiguousNodes.resize(2*numLeafNodes); - + m_quantizedContiguousNodes.resize(2 * numLeafNodes); } m_curNodeIndex = 0; - buildTree(0,numLeafNodes); + buildTree(0, numLeafNodes); ///if the entire tree is small then subtree size, we need to create a header info for the tree - if(m_useQuantization && !m_SubtreeHeaders.size()) + if (m_useQuantization && !m_SubtreeHeaders.size()) { b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]); @@ -72,35 +66,27 @@ void b3QuantizedBvh::buildInternal() m_leafNodes.clear(); } - - ///just for debugging, to visualize the individual patches/subtrees #ifdef DEBUG_PATCH_COLORS -b3Vector3 color[4]= -{ - b3Vector3(1,0,0), - b3Vector3(0,1,0), - b3Vector3(0,0,1), - b3Vector3(0,1,1) -}; -#endif //DEBUG_PATCH_COLORS - - +b3Vector3 color[4] = + { + b3Vector3(1, 0, 0), + b3Vector3(0, 1, 0), + b3Vector3(0, 0, 1), + b3Vector3(0, 1, 1)}; +#endif //DEBUG_PATCH_COLORS -void b3QuantizedBvh::setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin) +void b3QuantizedBvh::setQuantizationValues(const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax, b3Scalar quantizationMargin) { //enlarge the AABB to avoid division by zero when initializing the quantization values - b3Vector3 clampValue =b3MakeVector3(quantizationMargin,quantizationMargin,quantizationMargin); + b3Vector3 clampValue = b3MakeVector3(quantizationMargin, quantizationMargin, quantizationMargin); m_bvhAabbMin = bvhAabbMin - clampValue; m_bvhAabbMax = bvhAabbMax + clampValue; b3Vector3 aabbSize = m_bvhAabbMax - m_bvhAabbMin; - m_bvhQuantization = b3MakeVector3(b3Scalar(65533.0),b3Scalar(65533.0),b3Scalar(65533.0)) / aabbSize; + m_bvhQuantization = b3MakeVector3(b3Scalar(65533.0), b3Scalar(65533.0), b3Scalar(65533.0)) / 
aabbSize; m_useQuantization = true; } - - - b3QuantizedBvh::~b3QuantizedBvh() { } @@ -108,104 +94,100 @@ b3QuantizedBvh::~b3QuantizedBvh() #ifdef DEBUG_TREE_BUILDING int gStackDepth = 0; int gMaxStackDepth = 0; -#endif //DEBUG_TREE_BUILDING +#endif //DEBUG_TREE_BUILDING -void b3QuantizedBvh::buildTree (int startIndex,int endIndex) +void b3QuantizedBvh::buildTree(int startIndex, int endIndex) { #ifdef DEBUG_TREE_BUILDING gStackDepth++; if (gStackDepth > gMaxStackDepth) gMaxStackDepth = gStackDepth; -#endif //DEBUG_TREE_BUILDING - +#endif //DEBUG_TREE_BUILDING int splitAxis, splitIndex, i; - int numIndices =endIndex-startIndex; + int numIndices = endIndex - startIndex; int curIndex = m_curNodeIndex; - b3Assert(numIndices>0); + b3Assert(numIndices > 0); - if (numIndices==1) + if (numIndices == 1) { #ifdef DEBUG_TREE_BUILDING gStackDepth--; -#endif //DEBUG_TREE_BUILDING - - assignInternalNodeFromLeafNode(m_curNodeIndex,startIndex); +#endif //DEBUG_TREE_BUILDING + + assignInternalNodeFromLeafNode(m_curNodeIndex, startIndex); m_curNodeIndex++; - return; + return; } //calculate Best Splitting Axis and where to split it. Sort the incoming 'leafNodes' array within range 'startIndex/endIndex'. - - splitAxis = calcSplittingAxis(startIndex,endIndex); - splitIndex = sortAndCalcSplittingIndex(startIndex,endIndex,splitAxis); + splitAxis = calcSplittingAxis(startIndex, endIndex); + + splitIndex = sortAndCalcSplittingIndex(startIndex, endIndex, splitAxis); int internalNodeIndex = m_curNodeIndex; - + //set the min aabb to 'inf' or a max value, and set the max aabb to a -inf/minimum value. 
//the aabb will be expanded during buildTree/mergeInternalNodeAabb with actual node values - setInternalNodeAabbMin(m_curNodeIndex,m_bvhAabbMax);//can't use b3Vector3(B3_INFINITY,B3_INFINITY,B3_INFINITY)) because of quantization - setInternalNodeAabbMax(m_curNodeIndex,m_bvhAabbMin);//can't use b3Vector3(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY)) because of quantization - - - for (i=startIndex;i<endIndex;i++) + setInternalNodeAabbMin(m_curNodeIndex, m_bvhAabbMax); //can't use b3Vector3(B3_INFINITY,B3_INFINITY,B3_INFINITY)) because of quantization + setInternalNodeAabbMax(m_curNodeIndex, m_bvhAabbMin); //can't use b3Vector3(-B3_INFINITY,-B3_INFINITY,-B3_INFINITY)) because of quantization + + for (i = startIndex; i < endIndex; i++) { - mergeInternalNodeAabb(m_curNodeIndex,getAabbMin(i),getAabbMax(i)); + mergeInternalNodeAabb(m_curNodeIndex, getAabbMin(i), getAabbMax(i)); } m_curNodeIndex++; - //internalNode->m_escapeIndex; - + int leftChildNodexIndex = m_curNodeIndex; //build left child tree - buildTree(startIndex,splitIndex); + buildTree(startIndex, splitIndex); int rightChildNodexIndex = m_curNodeIndex; //build right child tree - buildTree(splitIndex,endIndex); + buildTree(splitIndex, endIndex); #ifdef DEBUG_TREE_BUILDING gStackDepth--; -#endif //DEBUG_TREE_BUILDING +#endif //DEBUG_TREE_BUILDING int escapeIndex = m_curNodeIndex - curIndex; if (m_useQuantization) { //escapeIndex is the number of nodes of this subtree - const int sizeQuantizedNode =sizeof(b3QuantizedBvhNode); + const int sizeQuantizedNode = sizeof(b3QuantizedBvhNode); const int treeSizeInBytes = escapeIndex * sizeQuantizedNode; if (treeSizeInBytes > MAX_SUBTREE_SIZE_IN_BYTES) { - updateSubtreeHeaders(leftChildNodexIndex,rightChildNodexIndex); + updateSubtreeHeaders(leftChildNodexIndex, rightChildNodexIndex); } - } else + } + else { - } - setInternalNodeEscapeIndex(internalNodeIndex,escapeIndex); - + setInternalNodeEscapeIndex(internalNodeIndex, escapeIndex); } -void 
b3QuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex) +void b3QuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex, int rightChildNodexIndex) { b3Assert(m_useQuantization); b3QuantizedBvhNode& leftChildNode = m_quantizedContiguousNodes[leftChildNodexIndex]; int leftSubTreeSize = leftChildNode.isLeafNode() ? 1 : leftChildNode.getEscapeIndex(); - int leftSubTreeSizeInBytes = leftSubTreeSize * static_cast<int>(sizeof(b3QuantizedBvhNode)); - + int leftSubTreeSizeInBytes = leftSubTreeSize * static_cast<int>(sizeof(b3QuantizedBvhNode)); + b3QuantizedBvhNode& rightChildNode = m_quantizedContiguousNodes[rightChildNodexIndex]; int rightSubTreeSize = rightChildNode.isLeafNode() ? 1 : rightChildNode.getEscapeIndex(); - int rightSubTreeSizeInBytes = rightSubTreeSize * static_cast<int>(sizeof(b3QuantizedBvhNode)); + int rightSubTreeSizeInBytes = rightSubTreeSize * static_cast<int>(sizeof(b3QuantizedBvhNode)); - if(leftSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES) + if (leftSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES) { b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); subtree.setAabbFromQuantizeNode(leftChildNode); @@ -213,7 +195,7 @@ void b3QuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChild subtree.m_subtreeSize = leftSubTreeSize; } - if(rightSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES) + if (rightSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES) { b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand(); subtree.setAabbFromQuantizeNode(rightChildNode); @@ -225,32 +207,31 @@ void b3QuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChild m_subtreeHeaderCount = m_SubtreeHeaders.size(); } - -int b3QuantizedBvh::sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis) +int b3QuantizedBvh::sortAndCalcSplittingIndex(int startIndex, int endIndex, int splitAxis) { int i; - int splitIndex =startIndex; + int splitIndex = startIndex; int numIndices = endIndex - startIndex; b3Scalar 
splitValue; - b3Vector3 means=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); - for (i=startIndex;i<endIndex;i++) + b3Vector3 means = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); + for (i = startIndex; i < endIndex; i++) { - b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); - means+=center; + b3Vector3 center = b3Scalar(0.5) * (getAabbMax(i) + getAabbMin(i)); + means += center; } - means *= (b3Scalar(1.)/(b3Scalar)numIndices); - + means *= (b3Scalar(1.) / (b3Scalar)numIndices); + splitValue = means[splitAxis]; - + //sort leafNodes so all values larger then splitValue comes first, and smaller values start from 'splitIndex'. - for (i=startIndex;i<endIndex;i++) + for (i = startIndex; i < endIndex; i++) { - b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); + b3Vector3 center = b3Scalar(0.5) * (getAabbMax(i) + getAabbMin(i)); if (center[splitAxis] > splitValue) { //swap - swapLeafNodes(i,splitIndex); + swapLeafNodes(i, splitIndex); splitIndex++; } } @@ -260,56 +241,53 @@ int b3QuantizedBvh::sortAndCalcSplittingIndex(int startIndex,int endIndex,int sp //unbalanced1 is unsafe: it can cause stack overflows //bool unbalanced1 = ((splitIndex==startIndex) || (splitIndex == (endIndex-1))); - //unbalanced2 should work too: always use center (perfect balanced trees) + //unbalanced2 should work too: always use center (perfect balanced trees) //bool unbalanced2 = true; //this should be safe too: - int rangeBalancedIndices = numIndices/3; - bool unbalanced = ((splitIndex<=(startIndex+rangeBalancedIndices)) || (splitIndex >=(endIndex-1-rangeBalancedIndices))); - + int rangeBalancedIndices = numIndices / 3; + bool unbalanced = ((splitIndex <= (startIndex + rangeBalancedIndices)) || (splitIndex >= (endIndex - 1 - rangeBalancedIndices))); + if (unbalanced) { - splitIndex = startIndex+ (numIndices>>1); + splitIndex = startIndex + (numIndices >> 1); } - bool unbal = (splitIndex==startIndex) || (splitIndex == (endIndex)); + bool unbal = 
(splitIndex == startIndex) || (splitIndex == (endIndex)); (void)unbal; b3Assert(!unbal); return splitIndex; } - -int b3QuantizedBvh::calcSplittingAxis(int startIndex,int endIndex) +int b3QuantizedBvh::calcSplittingAxis(int startIndex, int endIndex) { int i; - b3Vector3 means=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); - b3Vector3 variance=b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); - int numIndices = endIndex-startIndex; + b3Vector3 means = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); + b3Vector3 variance = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); + int numIndices = endIndex - startIndex; - for (i=startIndex;i<endIndex;i++) + for (i = startIndex; i < endIndex; i++) { - b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); - means+=center; + b3Vector3 center = b3Scalar(0.5) * (getAabbMax(i) + getAabbMin(i)); + means += center; } - means *= (b3Scalar(1.)/(b3Scalar)numIndices); - - for (i=startIndex;i<endIndex;i++) + means *= (b3Scalar(1.) / (b3Scalar)numIndices); + + for (i = startIndex; i < endIndex; i++) { - b3Vector3 center = b3Scalar(0.5)*(getAabbMax(i)+getAabbMin(i)); - b3Vector3 diff2 = center-means; + b3Vector3 center = b3Scalar(0.5) * (getAabbMax(i) + getAabbMin(i)); + b3Vector3 diff2 = center - means; diff2 = diff2 * diff2; variance += diff2; } - variance *= (b3Scalar(1.)/ ((b3Scalar)numIndices-1) ); - + variance *= (b3Scalar(1.) 
/ ((b3Scalar)numIndices - 1)); + return variance.maxAxis(); } - - -void b3QuantizedBvh::reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +void b3QuantizedBvh::reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const { //either choose recursive traversal (walkTree) or stackless (walkStacklessTree) @@ -318,38 +296,37 @@ void b3QuantizedBvh::reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallb ///quantize query AABB unsigned short int quantizedQueryAabbMin[3]; unsigned short int quantizedQueryAabbMax[3]; - quantizeWithClamp(quantizedQueryAabbMin,aabbMin,0); - quantizeWithClamp(quantizedQueryAabbMax,aabbMax,1); + quantizeWithClamp(quantizedQueryAabbMin, aabbMin, 0); + quantizeWithClamp(quantizedQueryAabbMax, aabbMax, 1); switch (m_traversalMode) { - case TRAVERSAL_STACKLESS: - walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,0,m_curNodeIndex); - break; - case TRAVERSAL_STACKLESS_CACHE_FRIENDLY: - walkStacklessQuantizedTreeCacheFriendly(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); - break; - case TRAVERSAL_RECURSIVE: + case TRAVERSAL_STACKLESS: + walkStacklessQuantizedTree(nodeCallback, quantizedQueryAabbMin, quantizedQueryAabbMax, 0, m_curNodeIndex); + break; + case TRAVERSAL_STACKLESS_CACHE_FRIENDLY: + walkStacklessQuantizedTreeCacheFriendly(nodeCallback, quantizedQueryAabbMin, quantizedQueryAabbMax); + break; + case TRAVERSAL_RECURSIVE: { const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[0]; - walkRecursiveQuantizedTreeAgainstQueryAabb(rootNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); + walkRecursiveQuantizedTreeAgainstQueryAabb(rootNode, nodeCallback, quantizedQueryAabbMin, quantizedQueryAabbMax); } break; - default: - //unsupported - b3Assert(0); + default: + //unsupported + b3Assert(0); } - } else + } + else { - 
walkStacklessTree(nodeCallback,aabbMin,aabbMax); + walkStacklessTree(nodeCallback, aabbMin, aabbMax); } } - static int b3s_maxIterations = 0; - -void b3QuantizedBvh::walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +void b3QuantizedBvh::walkStacklessTree(b3NodeOverlapCallback* nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const { b3Assert(!m_useQuantization); @@ -363,24 +340,25 @@ void b3QuantizedBvh::walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const while (curIndex < m_curNodeIndex) { //catch bugs in tree data - b3Assert (walkIterations < m_curNodeIndex); + b3Assert(walkIterations < m_curNodeIndex); walkIterations++; - aabbOverlap = b3TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg); + aabbOverlap = b3TestAabbAgainstAabb2(aabbMin, aabbMax, rootNode->m_aabbMinOrg, rootNode->m_aabbMaxOrg); isLeafNode = rootNode->m_escapeIndex == -1; - + //PCK: unsigned instead of bool if (isLeafNode && (aabbOverlap != 0)) { - nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex); - } - + nodeCallback->processNode(rootNode->m_subPart, rootNode->m_triangleIndex); + } + //PCK: unsigned instead of bool if ((aabbOverlap != 0) || isLeafNode) { rootNode++; curIndex++; - } else + } + else { escapeIndex = rootNode->m_escapeIndex; rootNode += escapeIndex; @@ -389,7 +367,6 @@ void b3QuantizedBvh::walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const } if (b3s_maxIterations < walkIterations) b3s_maxIterations = walkIterations; - } /* @@ -413,39 +390,38 @@ void b3QuantizedBvh::walkTree(b3OptimizedBvhNode* rootNode,b3NodeOverlapCallback } */ -void b3QuantizedBvh::walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const +void b3QuantizedBvh::walkRecursiveQuantizedTreeAgainstQueryAabb(const 
b3QuantizedBvhNode* currentNode, b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const { b3Assert(m_useQuantization); - + bool isLeafNode; //PCK: unsigned instead of bool unsigned aabbOverlap; //PCK: unsigned instead of bool - aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,currentNode->m_quantizedAabbMin,currentNode->m_quantizedAabbMax); + aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, currentNode->m_quantizedAabbMin, currentNode->m_quantizedAabbMax); isLeafNode = currentNode->isLeafNode(); - + //PCK: unsigned instead of bool if (aabbOverlap != 0) { if (isLeafNode) { - nodeCallback->processNode(currentNode->getPartId(),currentNode->getTriangleIndex()); - } else + nodeCallback->processNode(currentNode->getPartId(), currentNode->getTriangleIndex()); + } + else { //process left and right children - const b3QuantizedBvhNode* leftChildNode = currentNode+1; - walkRecursiveQuantizedTreeAgainstQueryAabb(leftChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); + const b3QuantizedBvhNode* leftChildNode = currentNode + 1; + walkRecursiveQuantizedTreeAgainstQueryAabb(leftChildNode, nodeCallback, quantizedQueryAabbMin, quantizedQueryAabbMax); - const b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? leftChildNode+1:leftChildNode+leftChildNode->getEscapeIndex(); - walkRecursiveQuantizedTreeAgainstQueryAabb(rightChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax); + const b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? 
leftChildNode + 1 : leftChildNode + leftChildNode->getEscapeIndex(); + walkRecursiveQuantizedTreeAgainstQueryAabb(rightChildNode, nodeCallback, quantizedQueryAabbMin, quantizedQueryAabbMax); } - } + } } - - -void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const +void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const { b3Assert(!m_useQuantization); @@ -454,11 +430,11 @@ void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCall int walkIterations = 0; bool isLeafNode; //PCK: unsigned instead of bool - unsigned aabbOverlap=0; - unsigned rayBoxOverlap=0; + unsigned aabbOverlap = 0; + unsigned rayBoxOverlap = 0; b3Scalar lambda_max = 1.0; - - /* Quick pruning by quantized box */ + + /* Quick pruning by quantized box */ b3Vector3 rayAabbMin = raySource; b3Vector3 rayAabbMax = raySource; rayAabbMin.setMin(rayTarget); @@ -469,15 +445,15 @@ void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCall rayAabbMax += aabbMax; #ifdef RAYAABB2 - b3Vector3 rayDir = (rayTarget-raySource); - rayDir.normalize (); - lambda_max = rayDir.dot(rayTarget-raySource); + b3Vector3 rayDir = (rayTarget - raySource); + rayDir.normalize(); + lambda_max = rayDir.dot(rayTarget - raySource); ///what about division by zero? --> just set rayDirection[i] to 1.0 b3Vector3 rayDirectionInverse; rayDirectionInverse[0] = rayDir[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[0]; rayDirectionInverse[1] = rayDir[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[1]; rayDirectionInverse[2] = rayDir[2] == b3Scalar(0.0) ? 
b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDir[2]; - unsigned int sign[3] = { rayDirectionInverse[0] < 0.0, rayDirectionInverse[1] < 0.0, rayDirectionInverse[2] < 0.0}; + unsigned int sign[3] = {rayDirectionInverse[0] < 0.0, rayDirectionInverse[1] < 0.0, rayDirectionInverse[2] < 0.0}; #endif b3Vector3 bounds[2]; @@ -486,7 +462,7 @@ void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCall { b3Scalar param = 1.0; //catch bugs in tree data - b3Assert (walkIterations < m_curNodeIndex); + b3Assert(walkIterations < m_curNodeIndex); walkIterations++; @@ -496,34 +472,35 @@ void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCall bounds[0] -= aabbMax; bounds[1] -= aabbMin; - aabbOverlap = b3TestAabbAgainstAabb2(rayAabbMin,rayAabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg); + aabbOverlap = b3TestAabbAgainstAabb2(rayAabbMin, rayAabbMax, rootNode->m_aabbMinOrg, rootNode->m_aabbMaxOrg); //perhaps profile if it is worth doing the aabbOverlap test first #ifdef RAYAABB2 - ///careful with this check: need to check division by zero (above) and fix the unQuantize method - ///thanks Joerg/hiker for the reproduction case! - ///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858 - rayBoxOverlap = aabbOverlap ? b3RayAabb2 (raySource, rayDirectionInverse, sign, bounds, param, 0.0f, lambda_max) : false; + ///careful with this check: need to check division by zero (above) and fix the unQuantize method + ///thanks Joerg/hiker for the reproduction case! + ///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858 + rayBoxOverlap = aabbOverlap ? 
b3RayAabb2(raySource, rayDirectionInverse, sign, bounds, param, 0.0f, lambda_max) : false; #else b3Vector3 normal; - rayBoxOverlap = b3RayAabb(raySource, rayTarget,bounds[0],bounds[1],param, normal); + rayBoxOverlap = b3RayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal); #endif isLeafNode = rootNode->m_escapeIndex == -1; - + //PCK: unsigned instead of bool if (isLeafNode && (rayBoxOverlap != 0)) { - nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex); - } - + nodeCallback->processNode(rootNode->m_subPart, rootNode->m_triangleIndex); + } + //PCK: unsigned instead of bool if ((rayBoxOverlap != 0) || isLeafNode) { rootNode++; curIndex++; - } else + } + else { escapeIndex = rootNode->m_escapeIndex; rootNode += escapeIndex; @@ -532,15 +509,12 @@ void b3QuantizedBvh::walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCall } if (b3s_maxIterations < walkIterations) b3s_maxIterations = walkIterations; - } - - -void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const +void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const { b3Assert(m_useQuantization); - + int curIndex = startNodeIndex; int walkIterations = 0; int subTreeSize = endNodeIndex - startNodeIndex; @@ -548,7 +522,7 @@ void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex]; int escapeIndex; - + bool isLeafNode; //PCK: unsigned instead of bool unsigned boxBoxOverlap = 0; @@ -557,14 +531,14 @@ void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* b3Scalar lambda_max = 1.0; 
#ifdef RAYAABB2 - b3Vector3 rayDirection = (rayTarget-raySource); - rayDirection.normalize (); - lambda_max = rayDirection.dot(rayTarget-raySource); + b3Vector3 rayDirection = (rayTarget - raySource); + rayDirection.normalize(); + lambda_max = rayDirection.dot(rayTarget - raySource); ///what about division by zero? --> just set rayDirection[i] to 1.0 rayDirection[0] = rayDirection[0] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[0]; rayDirection[1] = rayDirection[1] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[1]; rayDirection[2] = rayDirection[2] == b3Scalar(0.0) ? b3Scalar(B3_LARGE_FLOAT) : b3Scalar(1.0) / rayDirection[2]; - unsigned int sign[3] = { rayDirection[0] < 0.0, rayDirection[1] < 0.0, rayDirection[2] < 0.0}; + unsigned int sign[3] = {rayDirection[0] < 0.0, rayDirection[1] < 0.0, rayDirection[2] < 0.0}; #endif /* Quick pruning by quantized box */ @@ -579,37 +553,36 @@ void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* unsigned short int quantizedQueryAabbMin[3]; unsigned short int quantizedQueryAabbMax[3]; - quantizeWithClamp(quantizedQueryAabbMin,rayAabbMin,0); - quantizeWithClamp(quantizedQueryAabbMax,rayAabbMax,1); + quantizeWithClamp(quantizedQueryAabbMin, rayAabbMin, 0); + quantizeWithClamp(quantizedQueryAabbMax, rayAabbMax, 1); while (curIndex < endNodeIndex) { - //#define VISUALLY_ANALYZE_BVH 1 #ifdef VISUALLY_ANALYZE_BVH //some code snippet to debugDraw aabb, to visually analyze bvh structure static int drawPatch = 0; //need some global access to a debugDrawer extern b3IDebugDraw* debugDrawerPtr; - if (curIndex==drawPatch) + if (curIndex == drawPatch) { - b3Vector3 aabbMin,aabbMax; + b3Vector3 aabbMin, aabbMax; aabbMin = unQuantize(rootNode->m_quantizedAabbMin); aabbMax = unQuantize(rootNode->m_quantizedAabbMax); - b3Vector3 color(1,0,0); - debugDrawerPtr->drawAabb(aabbMin,aabbMax,color); + b3Vector3 color(1, 0, 0); + debugDrawerPtr->drawAabb(aabbMin, 
aabbMax, color); } -#endif//VISUALLY_ANALYZE_BVH +#endif //VISUALLY_ANALYZE_BVH //catch bugs in tree data - b3Assert (walkIterations < subTreeSize); + b3Assert(walkIterations < subTreeSize); walkIterations++; //PCK: unsigned instead of bool // only interested if this is closer than any previous hit b3Scalar param = 1.0; rayBoxOverlap = 0; - boxBoxOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax); + boxBoxOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, rootNode->m_quantizedAabbMin, rootNode->m_quantizedAabbMax); isLeafNode = rootNode->isLeafNode(); if (boxBoxOverlap) { @@ -634,24 +607,25 @@ void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* ///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858 //B3_PROFILE("b3RayAabb2"); - rayBoxOverlap = b3RayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0f, lambda_max); - + rayBoxOverlap = b3RayAabb2(raySource, rayDirection, sign, bounds, param, 0.0f, lambda_max); + #else - rayBoxOverlap = true;//b3RayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal); + rayBoxOverlap = true; //b3RayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal); #endif } - + if (isLeafNode && rayBoxOverlap) { - nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex()); + nodeCallback->processNode(rootNode->getPartId(), rootNode->getTriangleIndex()); } - + //PCK: unsigned instead of bool if ((rayBoxOverlap != 0) || isLeafNode) { rootNode++; curIndex++; - } else + } + else { escapeIndex = rootNode->getEscapeIndex(); rootNode += escapeIndex; @@ -660,13 +634,12 @@ void b3QuantizedBvh::walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* } if (b3s_maxIterations < walkIterations) b3s_maxIterations = walkIterations; - } -void b3QuantizedBvh::walkStacklessQuantizedTree(b3NodeOverlapCallback* 
nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const +void b3QuantizedBvh::walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax, int startNodeIndex, int endNodeIndex) const { b3Assert(m_useQuantization); - + int curIndex = startNodeIndex; int walkIterations = 0; int subTreeSize = endNodeIndex - startNodeIndex; @@ -674,49 +647,49 @@ void b3QuantizedBvh::walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallb const b3QuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex]; int escapeIndex; - + bool isLeafNode; //PCK: unsigned instead of bool unsigned aabbOverlap; while (curIndex < endNodeIndex) { - //#define VISUALLY_ANALYZE_BVH 1 #ifdef VISUALLY_ANALYZE_BVH //some code snippet to debugDraw aabb, to visually analyze bvh structure static int drawPatch = 0; //need some global access to a debugDrawer extern b3IDebugDraw* debugDrawerPtr; - if (curIndex==drawPatch) + if (curIndex == drawPatch) { - b3Vector3 aabbMin,aabbMax; + b3Vector3 aabbMin, aabbMax; aabbMin = unQuantize(rootNode->m_quantizedAabbMin); aabbMax = unQuantize(rootNode->m_quantizedAabbMax); - b3Vector3 color(1,0,0); - debugDrawerPtr->drawAabb(aabbMin,aabbMax,color); + b3Vector3 color(1, 0, 0); + debugDrawerPtr->drawAabb(aabbMin, aabbMax, color); } -#endif//VISUALLY_ANALYZE_BVH +#endif //VISUALLY_ANALYZE_BVH //catch bugs in tree data - b3Assert (walkIterations < subTreeSize); + b3Assert(walkIterations < subTreeSize); walkIterations++; //PCK: unsigned instead of bool - aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax); + aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, rootNode->m_quantizedAabbMin, rootNode->m_quantizedAabbMax); isLeafNode = 
rootNode->isLeafNode(); - + if (isLeafNode && aabbOverlap) { - nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex()); - } - + nodeCallback->processNode(rootNode->getPartId(), rootNode->getTriangleIndex()); + } + //PCK: unsigned instead of bool if ((aabbOverlap != 0) || isLeafNode) { rootNode++; curIndex++; - } else + } + else { escapeIndex = rootNode->getEscapeIndex(); rootNode += escapeIndex; @@ -725,40 +698,36 @@ void b3QuantizedBvh::walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallb } if (b3s_maxIterations < walkIterations) b3s_maxIterations = walkIterations; - } //This traversal can be called from Playstation 3 SPU -void b3QuantizedBvh::walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const +void b3QuantizedBvh::walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const { b3Assert(m_useQuantization); int i; - - for (i=0;i<this->m_SubtreeHeaders.size();i++) + for (i = 0; i < this->m_SubtreeHeaders.size(); i++) { const b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i]; //PCK: unsigned instead of bool - unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax); + unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, subtree.m_quantizedAabbMin, subtree.m_quantizedAabbMax); if (overlap != 0) { - walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax, - subtree.m_rootNodeIndex, - subtree.m_rootNodeIndex+subtree.m_subtreeSize); + walkStacklessQuantizedTree(nodeCallback, quantizedQueryAabbMin, quantizedQueryAabbMax, + subtree.m_rootNodeIndex, + subtree.m_rootNodeIndex + subtree.m_subtreeSize); } } } - -void 
b3QuantizedBvh::reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const +void b3QuantizedBvh::reportRayOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const { - reportBoxCastOverlappingNodex(nodeCallback,raySource,rayTarget,b3MakeVector3(0,0,0),b3MakeVector3(0,0,0)); + reportBoxCastOverlappingNodex(nodeCallback, raySource, rayTarget, b3MakeVector3(0, 0, 0), b3MakeVector3(0, 0, 0)); } - -void b3QuantizedBvh::reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +void b3QuantizedBvh::reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const { //always use stackless @@ -782,31 +751,31 @@ void b3QuantizedBvh::reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCa reportAabbOverlappingNodex(nodeCallback,qaabbMin,qaabbMax); } */ - } - -void b3QuantizedBvh::swapLeafNodes(int i,int splitIndex) +void b3QuantizedBvh::swapLeafNodes(int i, int splitIndex) { if (m_useQuantization) { - b3QuantizedBvhNode tmp = m_quantizedLeafNodes[i]; - m_quantizedLeafNodes[i] = m_quantizedLeafNodes[splitIndex]; - m_quantizedLeafNodes[splitIndex] = tmp; - } else + b3QuantizedBvhNode tmp = m_quantizedLeafNodes[i]; + m_quantizedLeafNodes[i] = m_quantizedLeafNodes[splitIndex]; + m_quantizedLeafNodes[splitIndex] = tmp; + } + else { - b3OptimizedBvhNode tmp = m_leafNodes[i]; - m_leafNodes[i] = m_leafNodes[splitIndex]; - m_leafNodes[splitIndex] = tmp; + b3OptimizedBvhNode tmp = m_leafNodes[i]; + m_leafNodes[i] = m_leafNodes[splitIndex]; + m_leafNodes[splitIndex] = tmp; } } -void b3QuantizedBvh::assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex) +void b3QuantizedBvh::assignInternalNodeFromLeafNode(int 
internalNode, int leafNodeIndex) { if (m_useQuantization) { m_quantizedContiguousNodes[internalNode] = m_quantizedLeafNodes[leafNodeIndex]; - } else + } + else { m_contiguousNodes[internalNode] = m_leafNodes[leafNodeIndex]; } @@ -823,11 +792,10 @@ static const unsigned BVH_ALIGNMENT_MASK = BVH_ALIGNMENT-1; static const unsigned BVH_ALIGNMENT_BLOCKS = 2; #endif - unsigned int b3QuantizedBvh::getAlignmentSerializationPadding() { // I changed this to 0 since the extra padding is not needed or used. - return 0;//BVH_ALIGNMENT_BLOCKS * BVH_ALIGNMENT; + return 0; //BVH_ALIGNMENT_BLOCKS * BVH_ALIGNMENT; } unsigned b3QuantizedBvh::calculateSerializeBufferSize() const @@ -841,12 +809,12 @@ unsigned b3QuantizedBvh::calculateSerializeBufferSize() const return baseSize + m_curNodeIndex * sizeof(b3OptimizedBvhNode); } -bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBufferSize */, bool i_swapEndian) const +bool b3QuantizedBvh::serialize(void* o_alignedDataBuffer, unsigned /*i_dataBufferSize */, bool i_swapEndian) const { b3Assert(m_subtreeHeaderCount == m_SubtreeHeaders.size()); m_subtreeHeaderCount = m_SubtreeHeaders.size(); -/* if (i_dataBufferSize < calculateSerializeBufferSize() || o_alignedDataBuffer == NULL || (((unsigned)o_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0)) + /* if (i_dataBufferSize < calculateSerializeBufferSize() || o_alignedDataBuffer == NULL || (((unsigned)o_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0)) { ///check alignedment for buffer? 
b3Assert(0); @@ -854,7 +822,7 @@ bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBuffe } */ - b3QuantizedBvh *targetBvh = (b3QuantizedBvh *)o_alignedDataBuffer; + b3QuantizedBvh* targetBvh = (b3QuantizedBvh*)o_alignedDataBuffer; // construct the class so the virtual function table, etc will be set up // Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor @@ -864,10 +832,9 @@ bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBuffe { targetBvh->m_curNodeIndex = static_cast<int>(b3SwapEndian(m_curNodeIndex)); - - b3SwapVector3Endian(m_bvhAabbMin,targetBvh->m_bvhAabbMin); - b3SwapVector3Endian(m_bvhAabbMax,targetBvh->m_bvhAabbMax); - b3SwapVector3Endian(m_bvhQuantization,targetBvh->m_bvhQuantization); + b3SwapVector3Endian(m_bvhAabbMin, targetBvh->m_bvhAabbMin); + b3SwapVector3Endian(m_bvhAabbMax, targetBvh->m_bvhAabbMax); + b3SwapVector3Endian(m_bvhQuantization, targetBvh->m_bvhQuantization); targetBvh->m_traversalMode = (b3TraversalMode)b3SwapEndian(m_traversalMode); targetBvh->m_subtreeHeaderCount = static_cast<int>(b3SwapEndian(m_subtreeHeaderCount)); @@ -884,12 +851,12 @@ bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBuffe targetBvh->m_useQuantization = m_useQuantization; - unsigned char *nodeData = (unsigned char *)targetBvh; + unsigned char* nodeData = (unsigned char*)targetBvh; nodeData += sizeof(b3QuantizedBvh); - - unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + + unsigned sizeToAdd = 0; //(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; nodeData += sizeToAdd; - + int nodeCount = m_curNodeIndex; if (m_useQuantization) @@ -915,7 +882,6 @@ bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBuffe { for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) { - 
targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]; targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]; targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]; @@ -925,8 +891,6 @@ bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBuffe targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]; targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex; - - } } nodeData += sizeof(b3QuantizedBvhNode) * nodeCount; @@ -972,7 +936,7 @@ bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBuffe targetBvh->m_contiguousNodes.initializeFromBuffer(NULL, 0, 0); } - sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + sizeToAdd = 0; //(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; nodeData += sizeToAdd; // Now serialize the subtree headers @@ -1027,14 +991,13 @@ bool b3QuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBuffe return true; } -b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian) +b3QuantizedBvh* b3QuantizedBvh::deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian) { - - if (i_alignedDataBuffer == NULL)// || (((unsigned)i_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0)) + if (i_alignedDataBuffer == NULL) // || (((unsigned)i_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0)) { return NULL; } - b3QuantizedBvh *bvh = (b3QuantizedBvh *)i_alignedDataBuffer; + b3QuantizedBvh* bvh = (b3QuantizedBvh*)i_alignedDataBuffer; if 
(i_swapEndian) { @@ -1056,12 +1019,12 @@ b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, un return NULL; } - unsigned char *nodeData = (unsigned char *)bvh; + unsigned char* nodeData = (unsigned char*)bvh; nodeData += sizeof(b3QuantizedBvh); - - unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + + unsigned sizeToAdd = 0; //(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; nodeData += sizeToAdd; - + int nodeCount = bvh->m_curNodeIndex; // Must call placement new to fill in virtual function table, etc, but we don't want to overwrite most data, so call a special version of the constructor @@ -1099,7 +1062,7 @@ b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, un { b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg); b3UnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg); - + bvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_escapeIndex)); bvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_subPart)); bvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(b3SwapEndian(bvh->m_contiguousNodes[nodeIndex].m_triangleIndex)); @@ -1108,7 +1071,7 @@ b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, un nodeData += sizeof(b3OptimizedBvhNode) * nodeCount; } - sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; + sizeToAdd = 0; //(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK; nodeData += sizeToAdd; // Now serialize the subtree headers @@ -1134,13 +1097,11 @@ b3QuantizedBvh *b3QuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, un } // Constructor that prevents b3Vector3's default constructor from being called -b3QuantizedBvh::b3QuantizedBvh(b3QuantizedBvh &self, bool 
/* ownsMemory */) : -m_bvhAabbMin(self.m_bvhAabbMin), -m_bvhAabbMax(self.m_bvhAabbMax), -m_bvhQuantization(self.m_bvhQuantization), -m_bulletVersion(B3_BULLET_VERSION) +b3QuantizedBvh::b3QuantizedBvh(b3QuantizedBvh& self, bool /* ownsMemory */) : m_bvhAabbMin(self.m_bvhAabbMin), + m_bvhAabbMax(self.m_bvhAabbMax), + m_bvhQuantization(self.m_bvhQuantization), + m_bulletVersion(B3_BULLET_VERSION) { - } void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData) @@ -1150,8 +1111,8 @@ void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedB m_bvhQuantization.deSerializeFloat(quantizedBvhFloatData.m_bvhQuantization); m_curNodeIndex = quantizedBvhFloatData.m_curNodeIndex; - m_useQuantization = quantizedBvhFloatData.m_useQuantization!=0; - + m_useQuantization = quantizedBvhFloatData.m_useQuantization != 0; + { int numElem = quantizedBvhFloatData.m_numContiguousLeafNodes; m_contiguousNodes.resize(numElem); @@ -1160,7 +1121,7 @@ void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedB { b3OptimizedBvhNodeFloatData* memPtr = quantizedBvhFloatData.m_contiguousNodesPtr; - for (int i=0;i<numElem;i++,memPtr++) + for (int i = 0; i < numElem; i++, memPtr++) { m_contiguousNodes[i].m_aabbMaxOrg.deSerializeFloat(memPtr->m_aabbMaxOrg); m_contiguousNodes[i].m_aabbMinOrg.deSerializeFloat(memPtr->m_aabbMinOrg); @@ -1174,11 +1135,11 @@ void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedB { int numElem = quantizedBvhFloatData.m_numQuantizedContiguousNodes; m_quantizedContiguousNodes.resize(numElem); - + if (numElem) { b3QuantizedBvhNodeData* memPtr = quantizedBvhFloatData.m_quantizedContiguousNodesPtr; - for (int i=0;i<numElem;i++,memPtr++) + for (int i = 0; i < numElem; i++, memPtr++) { m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex; m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0]; @@ -1192,16 
+1153,16 @@ void b3QuantizedBvh::deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedB } m_traversalMode = b3TraversalMode(quantizedBvhFloatData.m_traversalMode); - + { int numElem = quantizedBvhFloatData.m_numSubtreeHeaders; m_SubtreeHeaders.resize(numElem); if (numElem) { b3BvhSubtreeInfoData* memPtr = quantizedBvhFloatData.m_subTreeInfoPtr; - for (int i=0;i<numElem;i++,memPtr++) + for (int i = 0; i < numElem; i++, memPtr++) { - m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ; + m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0]; m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1]; m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2]; m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0]; @@ -1221,8 +1182,8 @@ void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantize m_bvhQuantization.deSerializeDouble(quantizedBvhDoubleData.m_bvhQuantization); m_curNodeIndex = quantizedBvhDoubleData.m_curNodeIndex; - m_useQuantization = quantizedBvhDoubleData.m_useQuantization!=0; - + m_useQuantization = quantizedBvhDoubleData.m_useQuantization != 0; + { int numElem = quantizedBvhDoubleData.m_numContiguousLeafNodes; m_contiguousNodes.resize(numElem); @@ -1231,7 +1192,7 @@ void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantize { b3OptimizedBvhNodeDoubleData* memPtr = quantizedBvhDoubleData.m_contiguousNodesPtr; - for (int i=0;i<numElem;i++,memPtr++) + for (int i = 0; i < numElem; i++, memPtr++) { m_contiguousNodes[i].m_aabbMaxOrg.deSerializeDouble(memPtr->m_aabbMaxOrg); m_contiguousNodes[i].m_aabbMinOrg.deSerializeDouble(memPtr->m_aabbMinOrg); @@ -1245,11 +1206,11 @@ void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantize { int numElem = quantizedBvhDoubleData.m_numQuantizedContiguousNodes; m_quantizedContiguousNodes.resize(numElem); - + if (numElem) { b3QuantizedBvhNodeData* memPtr = 
quantizedBvhDoubleData.m_quantizedContiguousNodesPtr; - for (int i=0;i<numElem;i++,memPtr++) + for (int i = 0; i < numElem; i++, memPtr++) { m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex; m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0]; @@ -1263,16 +1224,16 @@ void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantize } m_traversalMode = b3TraversalMode(quantizedBvhDoubleData.m_traversalMode); - + { int numElem = quantizedBvhDoubleData.m_numSubtreeHeaders; m_SubtreeHeaders.resize(numElem); if (numElem) { b3BvhSubtreeInfoData* memPtr = quantizedBvhDoubleData.m_subTreeInfoPtr; - for (int i=0;i<numElem;i++,memPtr++) + for (int i = 0; i < numElem; i++, memPtr++) { - m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ; + m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0]; m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1]; m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2]; m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0]; @@ -1283,19 +1244,11 @@ void b3QuantizedBvh::deSerializeDouble(struct b3QuantizedBvhDoubleData& quantize } } } - } - - ///fills the dataBuffer and returns the struct name (and 0 on failure) -const char* b3QuantizedBvh::serialize(void* dataBuffer, b3Serializer* serializer) const +const char* b3QuantizedBvh::serialize(void* dataBuffer, b3Serializer* serializer) const { b3Assert(0); return 0; } - - - - - diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h index 63c523c758..48b41abcad 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h @@ -22,11 +22,11 @@ class b3Serializer; #ifdef DEBUG_CHECK_DEQUANTIZATION #ifdef __SPU__ 
#define printf spu_printf -#endif //__SPU__ +#endif //__SPU__ #include <stdio.h> #include <stdlib.h> -#endif //DEBUG_CHECK_DEQUANTIZATION +#endif //DEBUG_CHECK_DEQUANTIZATION #include "Bullet3Common/b3Vector3.h" #include "Bullet3Common/b3AlignedAllocator.h" @@ -44,13 +44,10 @@ class b3Serializer; #include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h" - - //http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp - //Note: currently we have 16 bytes per quantized node -#define MAX_SUBTREE_SIZE_IN_BYTES 2048 +#define MAX_SUBTREE_SIZE_IN_BYTES 2048 // 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one // actually) triangles each (since the sign bit is reserved @@ -58,7 +55,8 @@ class b3Serializer; ///b3QuantizedBvhNode is a compressed aabb node, 16 bytes. ///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range). 
-B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeData +B3_ATTRIBUTE_ALIGNED16(struct) +b3QuantizedBvhNode : public b3QuantizedBvhNodeData { B3_DECLARE_ALIGNED_ALLOCATOR(); @@ -72,48 +70,48 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeDa b3Assert(!isLeafNode()); return -m_escapeIndexOrTriangleIndex; } - int getTriangleIndex() const + int getTriangleIndex() const { b3Assert(isLeafNode()); - unsigned int x=0; - unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS); + unsigned int x = 0; + unsigned int y = (~(x & 0)) << (31 - MAX_NUM_PARTS_IN_BITS); // Get only the lower bits where the triangle index is stored - return (m_escapeIndexOrTriangleIndex&~(y)); + return (m_escapeIndexOrTriangleIndex & ~(y)); } - int getPartId() const + int getPartId() const { b3Assert(isLeafNode()); // Get only the highest bits where the part index is stored - return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS)); + return (m_escapeIndexOrTriangleIndex >> (31 - MAX_NUM_PARTS_IN_BITS)); } -} -; +}; /// b3OptimizedBvhNode contains both internal and leaf node information. /// Total node size is 44 bytes / node. You can use the compressed version of 16 bytes. 
-B3_ATTRIBUTE_ALIGNED16 (struct) b3OptimizedBvhNode +B3_ATTRIBUTE_ALIGNED16(struct) +b3OptimizedBvhNode { B3_DECLARE_ALIGNED_ALLOCATOR(); //32 bytes - b3Vector3 m_aabbMinOrg; - b3Vector3 m_aabbMaxOrg; + b3Vector3 m_aabbMinOrg; + b3Vector3 m_aabbMaxOrg; //4 - int m_escapeIndex; + int m_escapeIndex; //8 //for child nodes - int m_subPart; - int m_triangleIndex; + int m_subPart; + int m_triangleIndex; -//pad the size to 64 bytes - char m_padding[20]; + //pad the size to 64 bytes + char m_padding[20]; }; - ///b3BvhSubtreeInfo provides info to gather a subtree of limited size -B3_ATTRIBUTE_ALIGNED16(class) b3BvhSubtreeInfo : public b3BvhSubtreeInfoData +B3_ATTRIBUTE_ALIGNED16(class) +b3BvhSubtreeInfo : public b3BvhSubtreeInfoData { public: B3_DECLARE_ALIGNED_ALLOCATOR(); @@ -123,8 +121,7 @@ public: //memset(&m_padding[0], 0, sizeof(m_padding)); } - - void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode) + void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode) { m_quantizedAabbMin[0] = quantizedNode.m_quantizedAabbMin[0]; m_quantizedAabbMin[1] = quantizedNode.m_quantizedAabbMin[1]; @@ -133,14 +130,12 @@ public: m_quantizedAabbMax[1] = quantizedNode.m_quantizedAabbMax[1]; m_quantizedAabbMax[2] = quantizedNode.m_quantizedAabbMax[2]; } -} -; - +}; class b3NodeOverlapCallback { public: - virtual ~b3NodeOverlapCallback() {}; + virtual ~b3NodeOverlapCallback(){}; virtual void processNode(int subPart, int triangleIndex) = 0; }; @@ -148,18 +143,16 @@ public: #include "Bullet3Common/b3AlignedAllocator.h" #include "Bullet3Common/b3AlignedObjectArray.h" - - ///for code readability: -typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray; -typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray; -typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray; - +typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray; +typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray; +typedef 
b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray; ///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU. ///It is used by the b3BvhTriangleMeshShape as midphase ///It is recommended to use quantization for better performance and lower memory requirements. -B3_ATTRIBUTE_ALIGNED16(class) b3QuantizedBvh +B3_ATTRIBUTE_ALIGNED16(class) +b3QuantizedBvh { public: enum b3TraversalMode @@ -169,56 +162,48 @@ public: TRAVERSAL_RECURSIVE }; - - - - b3Vector3 m_bvhAabbMin; - b3Vector3 m_bvhAabbMax; - b3Vector3 m_bvhQuantization; + b3Vector3 m_bvhAabbMin; + b3Vector3 m_bvhAabbMax; + b3Vector3 m_bvhQuantization; protected: - int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess. + int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess. - int m_curNodeIndex; + int m_curNodeIndex; //quantization data - bool m_useQuantization; + bool m_useQuantization; + NodeArray m_leafNodes; + NodeArray m_contiguousNodes; + QuantizedNodeArray m_quantizedLeafNodes; + QuantizedNodeArray m_quantizedContiguousNodes; - - NodeArray m_leafNodes; - NodeArray m_contiguousNodes; - QuantizedNodeArray m_quantizedLeafNodes; - QuantizedNodeArray m_quantizedContiguousNodes; - - b3TraversalMode m_traversalMode; - BvhSubtreeInfoArray m_SubtreeHeaders; + b3TraversalMode m_traversalMode; + BvhSubtreeInfoArray m_SubtreeHeaders; //This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray mutable int m_subtreeHeaderCount; - - - - ///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!) 
///this might be refactored into a virtual, it is usually not calculated at run-time - void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin) + void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin) { if (m_useQuantization) { - quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0); - } else + quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0], aabbMin, 0); + } + else { m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin; - } } - void setInternalNodeAabbMax(int nodeIndex,const b3Vector3& aabbMax) + void setInternalNodeAabbMax(int nodeIndex, const b3Vector3& aabbMax) { if (m_useQuantization) { - quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1); - } else + quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0], aabbMax, 1); + } + else { m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax; } @@ -232,115 +217,102 @@ protected: } //non-quantized return m_leafNodes[nodeIndex].m_aabbMinOrg; - } b3Vector3 getAabbMax(int nodeIndex) const { if (m_useQuantization) { return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]); - } + } //non-quantized return m_leafNodes[nodeIndex].m_aabbMaxOrg; - } - - void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex) + void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex) { if (m_useQuantization) { m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex; - } + } else { m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex; } - } - void mergeInternalNodeAabb(int nodeIndex,const b3Vector3& newAabbMin,const b3Vector3& newAabbMax) + void mergeInternalNodeAabb(int nodeIndex, const b3Vector3& newAabbMin, const b3Vector3& newAabbMax) { if (m_useQuantization) { unsigned short int quantizedAabbMin[3]; unsigned short int quantizedAabbMax[3]; - quantize(quantizedAabbMin,newAabbMin,0); - quantize(quantizedAabbMax,newAabbMax,1); - for (int i=0;i<3;i++) + 
quantize(quantizedAabbMin, newAabbMin, 0); + quantize(quantizedAabbMax, newAabbMax, 1); + for (int i = 0; i < 3; i++) { if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i]) m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i]; if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i]) m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i]; - } - } else + } + else { //non-quantized m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin); - m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax); + m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax); } } - void swapLeafNodes(int firstIndex,int secondIndex); + void swapLeafNodes(int firstIndex, int secondIndex); - void assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex); + void assignInternalNodeFromLeafNode(int internalNode, int leafNodeIndex); protected: + void buildTree(int startIndex, int endIndex); - - - void buildTree (int startIndex,int endIndex); + int calcSplittingAxis(int startIndex, int endIndex); - int calcSplittingAxis(int startIndex,int endIndex); + int sortAndCalcSplittingIndex(int startIndex, int endIndex, int splitAxis); - int sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis); - - void walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; + void walkStacklessTree(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const; - void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const; - void walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) 
const; - void walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const; + void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const; + void walkStacklessQuantizedTree(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax, int startNodeIndex, int endNodeIndex) const; + void walkStacklessTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const; ///tree traversal designed for small-memory processors like PS3 SPU - void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const; + void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const; ///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal - void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const; + void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode, b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const; ///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal - void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* 
treeNodeA,const b3QuantizedBvhNode* treeNodeB,b3NodeOverlapCallback* nodeCallback) const; - - - + void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA, const b3QuantizedBvhNode* treeNodeB, b3NodeOverlapCallback* nodeCallback) const; - void updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex); + void updateSubtreeHeaders(int leftChildNodexIndex, int rightChildNodexIndex); public: - B3_DECLARE_ALIGNED_ALLOCATOR(); b3QuantizedBvh(); virtual ~b3QuantizedBvh(); - ///***************************************** expert/internal use only ************************* - void setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0)); - QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; } + void setQuantizationValues(const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax, b3Scalar quantizationMargin = b3Scalar(1.0)); + QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; } ///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized - void buildInternal(); + void buildInternal(); ///***************************************** expert/internal use only ************************* - void reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; - void reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const; - void reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; + void reportAabbOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const; + void reportRayOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const; + void 
reportBoxCastOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const; - B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point,int isMax) const + B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point, int isMax) const { - b3Assert(m_useQuantization); b3Assert(point.getX() <= m_bvhAabbMax.getX()); @@ -357,122 +329,114 @@ public: ///@todo: double-check this if (isMax) { - out[0] = (unsigned short) (((unsigned short)(v.getX()+b3Scalar(1.)) | 1)); - out[1] = (unsigned short) (((unsigned short)(v.getY()+b3Scalar(1.)) | 1)); - out[2] = (unsigned short) (((unsigned short)(v.getZ()+b3Scalar(1.)) | 1)); - } else + out[0] = (unsigned short)(((unsigned short)(v.getX() + b3Scalar(1.)) | 1)); + out[1] = (unsigned short)(((unsigned short)(v.getY() + b3Scalar(1.)) | 1)); + out[2] = (unsigned short)(((unsigned short)(v.getZ() + b3Scalar(1.)) | 1)); + } + else { - out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe)); - out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe)); - out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe)); + out[0] = (unsigned short)(((unsigned short)(v.getX()) & 0xfffe)); + out[1] = (unsigned short)(((unsigned short)(v.getY()) & 0xfffe)); + out[2] = (unsigned short)(((unsigned short)(v.getZ()) & 0xfffe)); } - #ifdef DEBUG_CHECK_DEQUANTIZATION b3Vector3 newPoint = unQuantize(out); if (isMax) { if (newPoint.getX() < point.getX()) { - printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX()); + printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX()); } if (newPoint.getY() < point.getY()) { - printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY()); + printf("unconservative Y, diffY = %f, 
oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY()); } if (newPoint.getZ() < point.getZ()) { - - printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ()); + printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ()); } - } else + } + else { if (newPoint.getX() > point.getX()) { - printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX()); + printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX()); } if (newPoint.getY() > point.getY()) { - printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY()); + printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY()); } if (newPoint.getZ() > point.getZ()) { - printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ()); + printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ()); } } -#endif //DEBUG_CHECK_DEQUANTIZATION - +#endif //DEBUG_CHECK_DEQUANTIZATION } - - B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2,int isMax) const + B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2, int isMax) const { - b3Assert(m_useQuantization); b3Vector3 clampedPoint(point2); clampedPoint.setMax(m_bvhAabbMin); clampedPoint.setMin(m_bvhAabbMax); - quantize(out,clampedPoint,isMax); - + quantize(out, clampedPoint, isMax); } - - B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const + + B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const { - b3Vector3 vecOut; - vecOut.setValue( + b3Vector3 vecOut; + vecOut.setValue( 
(b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()), (b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()), (b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ())); - vecOut += m_bvhAabbMin; - return vecOut; + vecOut += m_bvhAabbMin; + return vecOut; } ///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees. - void setTraversalMode(b3TraversalMode traversalMode) + void setTraversalMode(b3TraversalMode traversalMode) { m_traversalMode = traversalMode; } - - B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray() - { - return m_quantizedContiguousNodes; + B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray() + { + return m_quantizedContiguousNodes; } - - B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray() + B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray() { return m_SubtreeHeaders; } -//////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////// /////Calculate space needed to store BVH for serialization unsigned calculateSerializeBufferSize() const; /// Data buffer MUST be 16 byte aligned - virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const; + virtual bool serialize(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const; ///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place' - static b3QuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian); + static b3QuantizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian); static unsigned int getAlignmentSerializationPadding(); -////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// - - virtual int calculateSerializeBufferSizeNew() const; + 
virtual int calculateSerializeBufferSizeNew() const; ///fills the dataBuffer and returns the struct name (and 0 on failure) - virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const; + virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const; - virtual void deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData); + virtual void deSerializeFloat(struct b3QuantizedBvhFloatData & quantizedBvhFloatData); - virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData); + virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData & quantizedBvhDoubleData); - -//////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////// B3_FORCE_INLINE bool isQuantized() { @@ -483,74 +447,65 @@ private: // Special "copy" constructor that allows for in-place deserialization // Prevents b3Vector3's default constructor from being called, but doesn't inialize much else // ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need) - b3QuantizedBvh(b3QuantizedBvh &other, bool ownsMemory); - -} -; - + b3QuantizedBvh(b3QuantizedBvh & other, bool ownsMemory); +}; struct b3OptimizedBvhNodeFloatData { - b3Vector3FloatData m_aabbMinOrg; - b3Vector3FloatData m_aabbMaxOrg; - int m_escapeIndex; - int m_subPart; - int m_triangleIndex; + b3Vector3FloatData m_aabbMinOrg; + b3Vector3FloatData m_aabbMaxOrg; + int m_escapeIndex; + int m_subPart; + int m_triangleIndex; char m_pad[4]; }; struct b3OptimizedBvhNodeDoubleData { - b3Vector3DoubleData m_aabbMinOrg; - b3Vector3DoubleData m_aabbMaxOrg; - int m_escapeIndex; - int m_subPart; - int m_triangleIndex; - char m_pad[4]; + b3Vector3DoubleData m_aabbMinOrg; + b3Vector3DoubleData m_aabbMaxOrg; + int m_escapeIndex; + int m_subPart; + int m_triangleIndex; + char m_pad[4]; }; - - -struct 
b3QuantizedBvhFloatData +struct b3QuantizedBvhFloatData { - b3Vector3FloatData m_bvhAabbMin; - b3Vector3FloatData m_bvhAabbMax; - b3Vector3FloatData m_bvhQuantization; - int m_curNodeIndex; - int m_useQuantization; - int m_numContiguousLeafNodes; - int m_numQuantizedContiguousNodes; - b3OptimizedBvhNodeFloatData *m_contiguousNodesPtr; - b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr; - b3BvhSubtreeInfoData *m_subTreeInfoPtr; - int m_traversalMode; - int m_numSubtreeHeaders; - + b3Vector3FloatData m_bvhAabbMin; + b3Vector3FloatData m_bvhAabbMax; + b3Vector3FloatData m_bvhQuantization; + int m_curNodeIndex; + int m_useQuantization; + int m_numContiguousLeafNodes; + int m_numQuantizedContiguousNodes; + b3OptimizedBvhNodeFloatData* m_contiguousNodesPtr; + b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr; + b3BvhSubtreeInfoData* m_subTreeInfoPtr; + int m_traversalMode; + int m_numSubtreeHeaders; }; -struct b3QuantizedBvhDoubleData +struct b3QuantizedBvhDoubleData { - b3Vector3DoubleData m_bvhAabbMin; - b3Vector3DoubleData m_bvhAabbMax; - b3Vector3DoubleData m_bvhQuantization; - int m_curNodeIndex; - int m_useQuantization; - int m_numContiguousLeafNodes; - int m_numQuantizedContiguousNodes; - b3OptimizedBvhNodeDoubleData *m_contiguousNodesPtr; - b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr; - - int m_traversalMode; - int m_numSubtreeHeaders; - b3BvhSubtreeInfoData *m_subTreeInfoPtr; + b3Vector3DoubleData m_bvhAabbMin; + b3Vector3DoubleData m_bvhAabbMax; + b3Vector3DoubleData m_bvhQuantization; + int m_curNodeIndex; + int m_useQuantization; + int m_numContiguousLeafNodes; + int m_numQuantizedContiguousNodes; + b3OptimizedBvhNodeDoubleData* m_contiguousNodesPtr; + b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr; + + int m_traversalMode; + int m_numSubtreeHeaders; + b3BvhSubtreeInfoData* m_subTreeInfoPtr; }; - -B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const +B3_FORCE_INLINE int 
b3QuantizedBvh::calculateSerializeBufferSizeNew() const { return sizeof(b3QuantizedBvhData); } - - -#endif //B3_QUANTIZED_BVH_H +#endif //B3_QUANTIZED_BVH_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp index 4d97f7f62b..6b0c941f23 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.cpp @@ -15,35 +15,32 @@ subject to the following restrictions: #include "b3StridingMeshInterface.h" - b3StridingMeshInterface::~b3StridingMeshInterface() { - } - -void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const +void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const { (void)aabbMin; (void)aabbMax; int numtotalphysicsverts = 0; - int part,graphicssubparts = getNumSubParts(); - const unsigned char * vertexbase; - const unsigned char * indexbase; + int part, graphicssubparts = getNumSubParts(); + const unsigned char* vertexbase; + const unsigned char* indexbase; int indexstride; PHY_ScalarType type; PHY_ScalarType gfxindextype; - int stride,numverts,numtriangles; + int stride, numverts, numtriangles; int gfxindex; b3Vector3 triangle[3]; b3Vector3 meshScaling = getScaling(); ///if the number of parts is big, the performance might drop due to the innerloop switch on indextype - for (part=0;part<graphicssubparts ;part++) + for (part = 0; part < graphicssubparts; part++) { - getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part); - numtotalphysicsverts+=numtriangles*3; //upper bound + getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, 
indexstride, numtriangles, gfxindextype, part); + numtotalphysicsverts += numtriangles * 3; //upper bound ///unlike that developers want to pass in double-precision meshes in single-precision Bullet build ///so disable this feature by default @@ -51,143 +48,141 @@ void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleInde switch (type) { - case PHY_FLOAT: - { - - float* graphicsbase; - - switch (gfxindextype) - { - case PHY_INTEGER: - { - for (gfxindex=0;gfxindex<numtriangles;gfxindex++) - { - unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride); - graphicsbase = (float*)(vertexbase+tri_indices[0]*stride); - triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (float*)(vertexbase+tri_indices[1]*stride); - triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (float*)(vertexbase+tri_indices[2]*stride); - triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); - callback->internalProcessTriangleIndex(triangle,part,gfxindex); - } - break; - } - case PHY_SHORT: - { - for (gfxindex=0;gfxindex<numtriangles;gfxindex++) - { - unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride); - graphicsbase = (float*)(vertexbase+tri_indices[0]*stride); - triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (float*)(vertexbase+tri_indices[1]*stride); - triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (float*)(vertexbase+tri_indices[2]*stride); - triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); - 
callback->internalProcessTriangleIndex(triangle,part,gfxindex); - } - break; - } - case PHY_UCHAR: - { - for (gfxindex=0;gfxindex<numtriangles;gfxindex++) - { - unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride); - graphicsbase = (float*)(vertexbase+tri_indices[0]*stride); - triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (float*)(vertexbase+tri_indices[1]*stride); - triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (float*)(vertexbase+tri_indices[2]*stride); - triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ()); - callback->internalProcessTriangleIndex(triangle,part,gfxindex); - } - break; - } - default: - b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT)); - } - break; - } - - case PHY_DOUBLE: + case PHY_FLOAT: + { + float* graphicsbase; + + switch (gfxindextype) + { + case PHY_INTEGER: + { + for (gfxindex = 0; gfxindex < numtriangles; gfxindex++) + { + unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride); + graphicsbase = (float*)(vertexbase + tri_indices[0] * stride); + triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (float*)(vertexbase + tri_indices[1] * stride); + triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (float*)(vertexbase + tri_indices[2] * stride); + triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle, part, gfxindex); + } + break; + } + case PHY_SHORT: + { + for (gfxindex = 
0; gfxindex < numtriangles; gfxindex++) + { + unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride); + graphicsbase = (float*)(vertexbase + tri_indices[0] * stride); + triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (float*)(vertexbase + tri_indices[1] * stride); + triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (float*)(vertexbase + tri_indices[2] * stride); + triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle, part, gfxindex); + } + break; + } + case PHY_UCHAR: + { + for (gfxindex = 0; gfxindex < numtriangles; gfxindex++) + { + unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride); + graphicsbase = (float*)(vertexbase + tri_indices[0] * stride); + triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (float*)(vertexbase + tri_indices[1] * stride); + triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (float*)(vertexbase + tri_indices[2] * stride); + triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle, part, gfxindex); + } + break; + } + default: + b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT)); + } + break; + } + + case PHY_DOUBLE: { double* graphicsbase; switch (gfxindextype) { - case PHY_INTEGER: + case PHY_INTEGER: { - for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + for (gfxindex = 0; gfxindex < 
numtriangles; gfxindex++) { - unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride); - graphicsbase = (double*)(vertexbase+tri_indices[0]*stride); - triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (double*)(vertexbase+tri_indices[1]*stride); - triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (double*)(vertexbase+tri_indices[2]*stride); - triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); - callback->internalProcessTriangleIndex(triangle,part,gfxindex); + unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride); + graphicsbase = (double*)(vertexbase + tri_indices[0] * stride); + triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (double*)(vertexbase + tri_indices[1] * stride); + triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (double*)(vertexbase + tri_indices[2] * stride); + triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle, part, gfxindex); } break; } - case PHY_SHORT: + case PHY_SHORT: { - for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + for (gfxindex = 0; gfxindex < numtriangles; gfxindex++) { - unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride); - graphicsbase = 
(double*)(vertexbase+tri_indices[0]*stride); - triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (double*)(vertexbase+tri_indices[1]*stride); - triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (double*)(vertexbase+tri_indices[2]*stride); - triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); - callback->internalProcessTriangleIndex(triangle,part,gfxindex); + unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride); + graphicsbase = (double*)(vertexbase + tri_indices[0] * stride); + triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (double*)(vertexbase + tri_indices[1] * stride); + triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (double*)(vertexbase + tri_indices[2] * stride); + triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle, part, gfxindex); } break; } - case PHY_UCHAR: + case PHY_UCHAR: { - for (gfxindex=0;gfxindex<numtriangles;gfxindex++) + for (gfxindex = 0; gfxindex < numtriangles; gfxindex++) { - unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride); - graphicsbase = (double*)(vertexbase+tri_indices[0]*stride); - 
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (double*)(vertexbase+tri_indices[1]*stride); - triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); - graphicsbase = (double*)(vertexbase+tri_indices[2]*stride); - triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ()); - callback->internalProcessTriangleIndex(triangle,part,gfxindex); + unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride); + graphicsbase = (double*)(vertexbase + tri_indices[0] * stride); + triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (double*)(vertexbase + tri_indices[1] * stride); + triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + graphicsbase = (double*)(vertexbase + tri_indices[2] * stride); + triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ()); + callback->internalProcessTriangleIndex(triangle, part, gfxindex); } break; } - default: - b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT)); + default: + b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT)); } break; } - default: - b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE)); + default: + b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE)); } unLockReadOnlyVertexBase(part); } } -void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax) +void 
b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin, b3Vector3& aabbMax) { - - struct AabbCalculationCallback : public b3InternalTriangleIndexCallback + struct AabbCalculationCallback : public b3InternalTriangleIndexCallback { - b3Vector3 m_aabbMin; - b3Vector3 m_aabbMax; + b3Vector3 m_aabbMin; + b3Vector3 m_aabbMax; AabbCalculationCallback() { - m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); - m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); + m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT)); + m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT)); } - virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) + virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) { (void)partId; (void)triangleIndex; @@ -202,13 +197,11 @@ void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vecto }; //first calculate the total aabb for all triangles - AabbCalculationCallback aabbCallback; - aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT)); - aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); - InternalProcessAllTriangles(&aabbCallback,aabbMin,aabbMax); + AabbCalculationCallback aabbCallback; + aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT)); + aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT)); + InternalProcessAllTriangles(&aabbCallback, aabbMin, aabbMax); aabbMin = aabbCallback.m_aabbMin; aabbMax = aabbCallback.m_aabbMax; } - - diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h 
index 9513f68f77..087b30f3e6 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3StridingMeshInterface.h @@ -20,148 +20,139 @@ subject to the following restrictions: #include "b3TriangleCallback.h" //#include "b3ConcaveShape.h" - -enum PHY_ScalarType { - PHY_FLOAT, PHY_DOUBLE, PHY_INTEGER, PHY_SHORT, - PHY_FIXEDPOINT88, PHY_UCHAR +enum PHY_ScalarType +{ + PHY_FLOAT, + PHY_DOUBLE, + PHY_INTEGER, + PHY_SHORT, + PHY_FIXEDPOINT88, + PHY_UCHAR }; - /// The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes. /// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips. /// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory. 
-B3_ATTRIBUTE_ALIGNED16(class ) b3StridingMeshInterface +B3_ATTRIBUTE_ALIGNED16(class) +b3StridingMeshInterface { - protected: - - b3Vector3 m_scaling; - - public: - B3_DECLARE_ALIGNED_ALLOCATOR(); - - b3StridingMeshInterface() :m_scaling(b3MakeVector3(b3Scalar(1.),b3Scalar(1.),b3Scalar(1.))) - { - - } - - virtual ~b3StridingMeshInterface(); - - - - virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const; - - ///brute force method to calculate aabb - void calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax); - - /// get read and write access to a subpart of a triangle mesh - /// this subpart has a continuous array of vertices and indices - /// in this way the mesh can be handled as chunks of memory with striding - /// very similar to OpenGL vertexarray support - /// make a call to unLockVertexBase when the read and write access is finished - virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0; - - virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0; - - /// unLockVertexBase finishes the access to a subpart of the triangle mesh - /// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished - virtual void unLockVertexBase(int subpart)=0; - - virtual void unLockReadOnlyVertexBase(int subpart) const=0; - - - /// getNumSubParts returns the number of seperate subparts - /// each subpart has a continuous array of vertices and indices - virtual int getNumSubParts() const=0; - - virtual void preallocateVertices(int numverts)=0; - virtual void preallocateIndices(int numindices)=0; - 
- virtual bool hasPremadeAabb() const { return false; } - virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const - { - (void) aabbMin; - (void) aabbMax; - } - virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const - { - (void) aabbMin; - (void) aabbMax; - } - - const b3Vector3& getScaling() const { - return m_scaling; - } - void setScaling(const b3Vector3& scaling) - { - m_scaling = scaling; - } - - virtual int calculateSerializeBufferSize() const; - - ///fills the dataBuffer and returns the struct name (and 0 on failure) - //virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const; - - +protected: + b3Vector3 m_scaling; + +public: + B3_DECLARE_ALIGNED_ALLOCATOR(); + + b3StridingMeshInterface() : m_scaling(b3MakeVector3(b3Scalar(1.), b3Scalar(1.), b3Scalar(1.))) + { + } + + virtual ~b3StridingMeshInterface(); + + virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback * callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const; + + ///brute force method to calculate aabb + void calculateAabbBruteForce(b3Vector3 & aabbMin, b3Vector3 & aabbMax); + + /// get read and write access to a subpart of a triangle mesh + /// this subpart has a continuous array of vertices and indices + /// in this way the mesh can be handled as chunks of memory with striding + /// very similar to OpenGL vertexarray support + /// make a call to unLockVertexBase when the read and write access is finished + virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) = 0; + + virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const = 0; + + /// 
unLockVertexBase finishes the access to a subpart of the triangle mesh + /// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished + virtual void unLockVertexBase(int subpart) = 0; + + virtual void unLockReadOnlyVertexBase(int subpart) const = 0; + + /// getNumSubParts returns the number of seperate subparts + /// each subpart has a continuous array of vertices and indices + virtual int getNumSubParts() const = 0; + + virtual void preallocateVertices(int numverts) = 0; + virtual void preallocateIndices(int numindices) = 0; + + virtual bool hasPremadeAabb() const { return false; } + virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const + { + (void)aabbMin; + (void)aabbMax; + } + virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const + { + (void)aabbMin; + (void)aabbMax; + } + + const b3Vector3& getScaling() const + { + return m_scaling; + } + void setScaling(const b3Vector3& scaling) + { + m_scaling = scaling; + } + + virtual int calculateSerializeBufferSize() const; + + ///fills the dataBuffer and returns the struct name (and 0 on failure) + //virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const; }; -struct b3IntIndexData +struct b3IntIndexData { - int m_value; + int m_value; }; -struct b3ShortIntIndexData +struct b3ShortIntIndexData { short m_value; char m_pad[2]; }; -struct b3ShortIntIndexTripletData +struct b3ShortIntIndexTripletData { - short m_values[3]; - char m_pad[2]; + short m_values[3]; + char m_pad[2]; }; -struct b3CharIndexTripletData +struct b3CharIndexTripletData { unsigned char m_values[3]; - char m_pad; + char m_pad; }; - ///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64 -struct b3MeshPartData +struct b3MeshPartData { - b3Vector3FloatData *m_vertices3f; - b3Vector3DoubleData *m_vertices3d; + b3Vector3FloatData* m_vertices3f; + b3Vector3DoubleData* m_vertices3d; - 
b3IntIndexData *m_indices32; - b3ShortIntIndexTripletData *m_3indices16; - b3CharIndexTripletData *m_3indices8; + b3IntIndexData* m_indices32; + b3ShortIntIndexTripletData* m_3indices16; + b3CharIndexTripletData* m_3indices8; - b3ShortIntIndexData *m_indices16;//backwards compatibility + b3ShortIntIndexData* m_indices16; //backwards compatibility - int m_numTriangles;//length of m_indices = m_numTriangles - int m_numVertices; + int m_numTriangles; //length of m_indices = m_numTriangles + int m_numVertices; }; - ///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64 -struct b3StridingMeshInterfaceData +struct b3StridingMeshInterfaceData { - b3MeshPartData *m_meshPartsPtr; - b3Vector3FloatData m_scaling; - int m_numMeshParts; + b3MeshPartData* m_meshPartsPtr; + b3Vector3FloatData m_scaling; + int m_numMeshParts; char m_padding[4]; }; - - - -B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const +B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const { return sizeof(b3StridingMeshInterfaceData); } - - -#endif //B3_STRIDING_MESHINTERFACE_H +#endif //B3_STRIDING_MESHINTERFACE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h index d073ee57c3..9ca1e22949 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3SupportMappings.h @@ -6,33 +6,29 @@ #include "Bullet3Common/b3AlignedObjectArray.h" #include "b3VectorFloat4.h" - struct b3GjkPairDetector; - - -inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull, - const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin) +inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull, + const 
b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin) { - b3Vector3 supVec = b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + b3Vector3 supVec = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); b3Scalar maxDot = b3Scalar(-B3_LARGE_FLOAT); - // Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically. - if( 0 < hull->m_numVertices ) - { - const b3Vector3 scaled = supportVec; - int index = (int) scaled.maxDot( &verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot); - return verticesA[hull->m_vertexOffset+index]; - } - - return supVec; + // Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically. + if (0 < hull->m_numVertices) + { + const b3Vector3 scaled = supportVec; + int index = (int)scaled.maxDot(&verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot); + return verticesA[hull->m_vertexOffset + index]; + } + return supVec; } -inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull, - const b3AlignedObjectArray<b3Vector3>& verticesA) +inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull, + const b3AlignedObjectArray<b3Vector3>& verticesA) { - return localGetSupportVertexWithMargin(supportVec,hull,verticesA,0.f); + return localGetSupportVertexWithMargin(supportVec, hull, verticesA, 0.f); } -#endif //B3_SUPPORT_MAPPINGS_H +#endif //B3_SUPPORT_MAPPINGS_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp index 9066451884..3908c6de89 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.cpp @@ -17,12 +17,8 @@ subject to the following restrictions: 
b3TriangleCallback::~b3TriangleCallback() { - } - b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback() { - } - diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h index 3059fa4f21..a0fd3e7ac7 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleCallback.h @@ -18,13 +18,11 @@ subject to the following restrictions: #include "Bullet3Common/b3Vector3.h" - ///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles. ///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape. class b3TriangleCallback { public: - virtual ~b3TriangleCallback(); virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0; }; @@ -32,11 +30,8 @@ public: class b3InternalTriangleIndexCallback { public: - virtual ~b3InternalTriangleIndexCallback(); - virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) = 0; + virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) = 0; }; - - -#endif //B3_TRIANGLE_CALLBACK_H +#endif //B3_TRIANGLE_CALLBACK_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp index a0f59babbe..73faadbdd0 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.cpp @@ -15,81 +15,76 @@ subject to the following restrictions: #include "b3TriangleIndexVertexArray.h" -b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles,int* 
triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride) -: m_hasAabb(0) +b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride) + : m_hasAabb(0) { b3IndexedMesh mesh; mesh.m_numTriangles = numTriangles; - mesh.m_triangleIndexBase = (const unsigned char *)triangleIndexBase; + mesh.m_triangleIndexBase = (const unsigned char*)triangleIndexBase; mesh.m_triangleIndexStride = triangleIndexStride; mesh.m_numVertices = numVertices; - mesh.m_vertexBase = (const unsigned char *)vertexBase; + mesh.m_vertexBase = (const unsigned char*)vertexBase; mesh.m_vertexStride = vertexStride; addIndexedMesh(mesh); - } b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray() { - } -void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) +void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart) { - b3Assert(subpart< getNumSubParts() ); + b3Assert(subpart < getNumSubParts()); b3IndexedMesh& mesh = m_indexedMeshes[subpart]; numverts = mesh.m_numVertices; - (*vertexbase) = (unsigned char *) mesh.m_vertexBase; + (*vertexbase) = (unsigned char*)mesh.m_vertexBase; - type = mesh.m_vertexType; + type = mesh.m_vertexType; vertexStride = mesh.m_vertexStride; numfaces = mesh.m_numTriangles; - (*indexbase) = (unsigned char *)mesh.m_triangleIndexBase; + (*indexbase) = (unsigned char*)mesh.m_triangleIndexBase; indexstride = mesh.m_triangleIndexStride; indicestype = mesh.m_indexType; } -void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char 
**vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const +void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart) const { const b3IndexedMesh& mesh = m_indexedMeshes[subpart]; numverts = mesh.m_numVertices; - (*vertexbase) = (const unsigned char *)mesh.m_vertexBase; + (*vertexbase) = (const unsigned char*)mesh.m_vertexBase; + + type = mesh.m_vertexType; - type = mesh.m_vertexType; - vertexStride = mesh.m_vertexStride; numfaces = mesh.m_numTriangles; - (*indexbase) = (const unsigned char *)mesh.m_triangleIndexBase; + (*indexbase) = (const unsigned char*)mesh.m_triangleIndexBase; indexstride = mesh.m_triangleIndexStride; indicestype = mesh.m_indexType; } -bool b3TriangleIndexVertexArray::hasPremadeAabb() const +bool b3TriangleIndexVertexArray::hasPremadeAabb() const { return (m_hasAabb == 1); } - -void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const +void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const { m_aabbMin = aabbMin; m_aabbMax = aabbMax; - m_hasAabb = 1; // this is intentionally an int see notes in header + m_hasAabb = 1; // this is intentionally an int see notes in header } -void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const +void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax) const { *aabbMin = m_aabbMin; *aabbMax = m_aabbMax; } - - diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h index d26b2893bc..b6ceb8df10 100644 --- 
a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3TriangleIndexVertexArray.h @@ -20,62 +20,59 @@ subject to the following restrictions: #include "Bullet3Common/b3AlignedObjectArray.h" #include "Bullet3Common/b3Scalar.h" - ///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh. ///Instead of the number of indices, we pass the number of triangles. -B3_ATTRIBUTE_ALIGNED16( struct) b3IndexedMesh +B3_ATTRIBUTE_ALIGNED16(struct) +b3IndexedMesh { B3_DECLARE_ALIGNED_ALLOCATOR(); - int m_numTriangles; - const unsigned char * m_triangleIndexBase; - // Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed) - int m_triangleIndexStride; - int m_numVertices; - const unsigned char * m_vertexBase; - // Size of a vertex, in bytes - int m_vertexStride; - - // The index type is set when adding an indexed mesh to the - // b3TriangleIndexVertexArray, do not set it manually - PHY_ScalarType m_indexType; - - // The vertex type has a default type similar to Bullet's precision mode (float or double) - // but can be set manually if you for example run Bullet with double precision but have - // mesh data in single precision.. 
- PHY_ScalarType m_vertexType; - - - b3IndexedMesh() - :m_indexType(PHY_INTEGER), + int m_numTriangles; + const unsigned char* m_triangleIndexBase; + // Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed) + int m_triangleIndexStride; + int m_numVertices; + const unsigned char* m_vertexBase; + // Size of a vertex, in bytes + int m_vertexStride; + + // The index type is set when adding an indexed mesh to the + // b3TriangleIndexVertexArray, do not set it manually + PHY_ScalarType m_indexType; + + // The vertex type has a default type similar to Bullet's precision mode (float or double) + // but can be set manually if you for example run Bullet with double precision but have + // mesh data in single precision.. + PHY_ScalarType m_vertexType; + + b3IndexedMesh() + : m_indexType(PHY_INTEGER), #ifdef B3_USE_DOUBLE_PRECISION - m_vertexType(PHY_DOUBLE) -#else // B3_USE_DOUBLE_PRECISION - m_vertexType(PHY_FLOAT) -#endif // B3_USE_DOUBLE_PRECISION - { - } -} -; - + m_vertexType(PHY_DOUBLE) +#else // B3_USE_DOUBLE_PRECISION + m_vertexType(PHY_FLOAT) +#endif // B3_USE_DOUBLE_PRECISION + { + } +}; -typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray; +typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray; ///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays. ///Additional meshes can be added using addIndexedMesh ///No duplcate is made of the vertex/index data, it only indexes into external vertex/index arrays. ///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray. 
-B3_ATTRIBUTE_ALIGNED16( class) b3TriangleIndexVertexArray : public b3StridingMeshInterface +B3_ATTRIBUTE_ALIGNED16(class) +b3TriangleIndexVertexArray : public b3StridingMeshInterface { protected: - IndexedMeshArray m_indexedMeshes; + IndexedMeshArray m_indexedMeshes; int m_pad[2]; - mutable int m_hasAabb; // using int instead of bool to maintain alignment + mutable int m_hasAabb; // using int instead of bool to maintain alignment mutable b3Vector3 m_aabbMin; mutable b3Vector3 m_aabbMax; public: - B3_DECLARE_ALIGNED_ALLOCATOR(); b3TriangleIndexVertexArray() : m_hasAabb(0) @@ -85,49 +82,47 @@ public: virtual ~b3TriangleIndexVertexArray(); //just to be backwards compatible - b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride); - - void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER) + b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride); + + void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER) { m_indexedMeshes.push_back(mesh); - m_indexedMeshes[m_indexedMeshes.size()-1].m_indexType = indexType; + m_indexedMeshes[m_indexedMeshes.size() - 1].m_indexType = indexType; } - - - virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0); - virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const; + virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& 
numfaces, PHY_ScalarType& indicestype, int subpart = 0); + + virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const; /// unLockVertexBase finishes the access to a subpart of the triangle mesh /// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished - virtual void unLockVertexBase(int subpart) {(void)subpart;} + virtual void unLockVertexBase(int subpart) { (void)subpart; } - virtual void unLockReadOnlyVertexBase(int subpart) const {(void)subpart;} + virtual void unLockReadOnlyVertexBase(int subpart) const { (void)subpart; } /// getNumSubParts returns the number of seperate subparts /// each subpart has a continuous array of vertices and indices - virtual int getNumSubParts() const { + virtual int getNumSubParts() const + { return (int)m_indexedMeshes.size(); } - IndexedMeshArray& getIndexedMeshArray() + IndexedMeshArray& getIndexedMeshArray() { return m_indexedMeshes; } - const IndexedMeshArray& getIndexedMeshArray() const + const IndexedMeshArray& getIndexedMeshArray() const { return m_indexedMeshes; } - virtual void preallocateVertices(int numverts){(void) numverts;} - virtual void preallocateIndices(int numindices){(void) numindices;} - - virtual bool hasPremadeAabb() const; - virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const; - virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const; + virtual void preallocateVertices(int numverts) { (void)numverts; } + virtual void preallocateIndices(int numindices) { (void)numindices; } -} -; + virtual bool hasPremadeAabb() const; + virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const; + virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const; +}; -#endif 
//B3_TRIANGLE_INDEX_VERTEX_ARRAY_H +#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h index f6f65f7719..5cc4b5a626 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VectorFloat4.h @@ -7,5 +7,4 @@ #define float4 b3Vector3 //#define make_float4(x,y,z,w) b3Vector4(x,y,z,w) - -#endif //B3_VECTOR_FLOAT4_H +#endif //B3_VECTOR_FLOAT4_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp index cf3d5ef49d..dae61d4581 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.cpp @@ -23,26 +23,24 @@ subject to the following restrictions: */ - #include "b3VoronoiSimplexSolver.h" -#define VERTA 0 -#define VERTB 1 -#define VERTC 2 -#define VERTD 3 +#define VERTA 0 +#define VERTB 1 +#define VERTC 2 +#define VERTD 3 #define B3_CATCH_DEGENERATE_TETRAHEDRON 1 -void b3VoronoiSimplexSolver::removeVertex(int index) +void b3VoronoiSimplexSolver::removeVertex(int index) { - - b3Assert(m_numVertices>0); + b3Assert(m_numVertices > 0); m_numVertices--; m_simplexVectorW[index] = m_simplexVectorW[m_numVertices]; m_simplexPointsP[index] = m_simplexPointsP[m_numVertices]; m_simplexPointsQ[index] = m_simplexPointsQ[m_numVertices]; } -void b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts) +void b3VoronoiSimplexSolver::reduceVertices(const b3UsageBitfield& usedVerts) { if ((numVertices() >= 4) && (!usedVerts.usedVertexD)) removeVertex(3); @@ -52,29 +50,22 @@ void b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts) if ((numVertices() >= 2) && (!usedVerts.usedVertexB)) 
removeVertex(1); - + if ((numVertices() >= 1) && (!usedVerts.usedVertexA)) removeVertex(0); - } - - - - //clear the simplex, remove all the vertices void b3VoronoiSimplexSolver::reset() { m_cachedValidClosest = false; m_numVertices = 0; m_needsUpdate = true; - m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT)); + m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT)); m_cachedBC.reset(); } - - - //add a vertex +//add a vertex void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q) { m_lastW = w; @@ -87,9 +78,8 @@ void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, c m_numVertices++; } -bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints() +bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints() { - if (m_needsUpdate) { m_cachedBC.reset(); @@ -98,127 +88,131 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints() switch (numVertices()) { - case 0: + case 0: m_cachedValidClosest = false; break; - case 1: + case 1: { m_cachedP1 = m_simplexPointsP[0]; m_cachedP2 = m_simplexPointsQ[0]; - m_cachedV = m_cachedP1-m_cachedP2; //== m_simplexVectorW[0] + m_cachedV = m_cachedP1 - m_cachedP2; //== m_simplexVectorW[0] m_cachedBC.reset(); - m_cachedBC.setBarycentricCoordinates(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + m_cachedBC.setBarycentricCoordinates(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); m_cachedValidClosest = m_cachedBC.isValid(); break; }; - case 2: + case 2: { - //closest point origin from line segment - const b3Vector3& from = m_simplexVectorW[0]; - const b3Vector3& to = m_simplexVectorW[1]; - b3Vector3 nearest; - - b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); - b3Vector3 diff = p - from; - b3Vector3 v = to - from; - b3Scalar t = v.dot(diff); - - if (t > 0) { - b3Scalar dotVV = v.dot(v); - if (t < dotVV) { - t /= dotVV; - diff -= t*v; - 
m_cachedBC.m_usedVertices.usedVertexA = true; - m_cachedBC.m_usedVertices.usedVertexB = true; - } else { - t = 1; - diff -= v; - //reduce to 1 point - m_cachedBC.m_usedVertices.usedVertexB = true; - } - } else + //closest point origin from line segment + const b3Vector3& from = m_simplexVectorW[0]; + const b3Vector3& to = m_simplexVectorW[1]; + b3Vector3 nearest; + + b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); + b3Vector3 diff = p - from; + b3Vector3 v = to - from; + b3Scalar t = v.dot(diff); + + if (t > 0) + { + b3Scalar dotVV = v.dot(v); + if (t < dotVV) { - t = 0; - //reduce to 1 point + t /= dotVV; + diff -= t * v; m_cachedBC.m_usedVertices.usedVertexA = true; + m_cachedBC.m_usedVertices.usedVertexB = true; + } + else + { + t = 1; + diff -= v; + //reduce to 1 point + m_cachedBC.m_usedVertices.usedVertexB = true; } - m_cachedBC.setBarycentricCoordinates(1-t,t); - nearest = from + t*v; + } + else + { + t = 0; + //reduce to 1 point + m_cachedBC.m_usedVertices.usedVertexA = true; + } + m_cachedBC.setBarycentricCoordinates(1 - t, t); + nearest = from + t * v; - m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]); - m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]); - m_cachedV = m_cachedP1 - m_cachedP2; - - reduceVertices(m_cachedBC.m_usedVertices); + m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]); + m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]); + m_cachedV = m_cachedP1 - m_cachedP2; - m_cachedValidClosest = m_cachedBC.isValid(); - break; + reduceVertices(m_cachedBC.m_usedVertices); + + m_cachedValidClosest = m_cachedBC.isValid(); + break; } - case 3: - { - //closest point origin from triangle - b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + case 3: + { + //closest point origin from triangle + b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); - const b3Vector3& 
a = m_simplexVectorW[0]; - const b3Vector3& b = m_simplexVectorW[1]; - const b3Vector3& c = m_simplexVectorW[2]; + const b3Vector3& a = m_simplexVectorW[0]; + const b3Vector3& b = m_simplexVectorW[1]; + const b3Vector3& c = m_simplexVectorW[2]; - closestPtPointTriangle(p,a,b,c,m_cachedBC); - m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] + - m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] + - m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2]; + closestPtPointTriangle(p, a, b, c, m_cachedBC); + m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] + + m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2]; - m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] + - m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + - m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2]; + m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] + + m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2]; - m_cachedV = m_cachedP1-m_cachedP2; + m_cachedV = m_cachedP1 - m_cachedP2; - reduceVertices (m_cachedBC.m_usedVertices); - m_cachedValidClosest = m_cachedBC.isValid(); + reduceVertices(m_cachedBC.m_usedVertices); + m_cachedValidClosest = m_cachedBC.isValid(); - break; + break; } - case 4: + case 4: { + b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); - - b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); - const b3Vector3& a = m_simplexVectorW[0]; const b3Vector3& b = m_simplexVectorW[1]; const b3Vector3& c = m_simplexVectorW[2]; const b3Vector3& d = m_simplexVectorW[3]; - bool hasSeperation = closestPtPointTetrahedron(p,a,b,c,d,m_cachedBC); + bool hasSeperation = closestPtPointTetrahedron(p, a, b, c, d, m_cachedBC); if (hasSeperation) { - m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] + - m_simplexPointsP[1] * 
m_cachedBC.m_barycentricCoords[1] + - m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] + - m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3]; + m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] + + m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3]; m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] + - m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + - m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] + - m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3]; + m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] + + m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] + + m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3]; - m_cachedV = m_cachedP1-m_cachedP2; - reduceVertices (m_cachedBC.m_usedVertices); - } else + m_cachedV = m_cachedP1 - m_cachedP2; + reduceVertices(m_cachedBC.m_usedVertices); + } + else { -// printf("sub distance got penetration\n"); + // printf("sub distance got penetration\n"); if (m_cachedBC.m_degenerate) { m_cachedValidClosest = false; - } else + } + else { m_cachedValidClosest = true; //degenerate case == false, penetration = true + zero - m_cachedV.setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.)); + m_cachedV.setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)); } break; } @@ -228,7 +222,7 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints() //closest point origin from tetrahedron break; } - default: + default: { m_cachedValidClosest = false; } @@ -236,7 +230,6 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints() } return m_cachedValidClosest; - } //return/calculate the closest vertex @@ -247,13 +240,11 @@ bool b3VoronoiSimplexSolver::closest(b3Vector3& v) return succes; } - - b3Scalar b3VoronoiSimplexSolver::maxVertex() { int i, numverts = numVertices(); b3Scalar maxV = b3Scalar(0.); - for (i=0;i<numverts;i++) + for (i = 0; i < numverts; i++) { b3Scalar curLen2 = 
m_simplexVectorW[i].length2(); if (maxV < curLen2) @@ -262,13 +253,11 @@ b3Scalar b3VoronoiSimplexSolver::maxVertex() return maxV; } - - - //return the current simplex -int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const +//return the current simplex +int b3VoronoiSimplexSolver::getSimplex(b3Vector3* pBuf, b3Vector3* qBuf, b3Vector3* yBuf) const { int i; - for (i=0;i<numVertices();i++) + for (i = 0; i < numVertices(); i++) { yBuf[i] = m_simplexVectorW[i]; pBuf[i] = m_simplexPointsP[i]; @@ -277,20 +266,17 @@ int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vecto return numVertices(); } - - - bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w) { bool found = false; int i, numverts = numVertices(); //b3Scalar maxV = b3Scalar(0.); - + //w is in the current (reduced) simplex - for (i=0;i<numverts;i++) + for (i = 0; i < numverts; i++) { #ifdef BT_USE_EQUAL_VERTEX_THRESHOLD - if ( m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold) + if (m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold) #else if (m_simplexVectorW[i] == w) #endif @@ -300,199 +286,190 @@ bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w) //check in case lastW is already removed if (w == m_lastW) return true; - + return found; } -void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v) +void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v) { v = m_cachedV; } - -bool b3VoronoiSimplexSolver::emptySimplex() const +bool b3VoronoiSimplexSolver::emptySimplex() const { return (numVertices() == 0); - } -void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2) +void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2) { updateClosestVectorAndPoints(); p1 = m_cachedP1; p2 = m_cachedP2; - } - - - -bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result) +bool 
b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result) { result.m_usedVertices.reset(); - // Check if P in vertex region outside A - b3Vector3 ab = b - a; - b3Vector3 ac = c - a; - b3Vector3 ap = p - a; - b3Scalar d1 = ab.dot(ap); - b3Scalar d2 = ac.dot(ap); - if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0)) + // Check if P in vertex region outside A + b3Vector3 ab = b - a; + b3Vector3 ac = c - a; + b3Vector3 ap = p - a; + b3Scalar d1 = ab.dot(ap); + b3Scalar d2 = ac.dot(ap); + if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0)) { result.m_closestPointOnSimplex = a; result.m_usedVertices.usedVertexA = true; - result.setBarycentricCoordinates(1,0,0); - return true;// a; // barycentric coordinates (1,0,0) + result.setBarycentricCoordinates(1, 0, 0); + return true; // a; // barycentric coordinates (1,0,0) } - // Check if P in vertex region outside B - b3Vector3 bp = p - b; - b3Scalar d3 = ab.dot(bp); - b3Scalar d4 = ac.dot(bp); - if (d3 >= b3Scalar(0.0) && d4 <= d3) + // Check if P in vertex region outside B + b3Vector3 bp = p - b; + b3Scalar d3 = ab.dot(bp); + b3Scalar d4 = ac.dot(bp); + if (d3 >= b3Scalar(0.0) && d4 <= d3) { result.m_closestPointOnSimplex = b; result.m_usedVertices.usedVertexB = true; - result.setBarycentricCoordinates(0,1,0); + result.setBarycentricCoordinates(0, 1, 0); - return true; // b; // barycentric coordinates (0,1,0) + return true; // b; // barycentric coordinates (0,1,0) } - // Check if P in edge region of AB, if so return projection of P onto AB - b3Scalar vc = d1*d4 - d3*d2; - if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0)) { - b3Scalar v = d1 / (d1 - d3); + // Check if P in edge region of AB, if so return projection of P onto AB + b3Scalar vc = d1 * d4 - d3 * d2; + if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0)) + { + b3Scalar v = d1 / (d1 - d3); result.m_closestPointOnSimplex = a + v * ab; 
result.m_usedVertices.usedVertexA = true; result.m_usedVertices.usedVertexB = true; - result.setBarycentricCoordinates(1-v,v,0); + result.setBarycentricCoordinates(1 - v, v, 0); return true; - //return a + v * ab; // barycentric coordinates (1-v,v,0) - } - - // Check if P in vertex region outside C - b3Vector3 cp = p - c; - b3Scalar d5 = ab.dot(cp); - b3Scalar d6 = ac.dot(cp); - if (d6 >= b3Scalar(0.0) && d5 <= d6) + //return a + v * ab; // barycentric coordinates (1-v,v,0) + } + + // Check if P in vertex region outside C + b3Vector3 cp = p - c; + b3Scalar d5 = ab.dot(cp); + b3Scalar d6 = ac.dot(cp); + if (d6 >= b3Scalar(0.0) && d5 <= d6) { result.m_closestPointOnSimplex = c; result.m_usedVertices.usedVertexC = true; - result.setBarycentricCoordinates(0,0,1); - return true;//c; // barycentric coordinates (0,0,1) + result.setBarycentricCoordinates(0, 0, 1); + return true; //c; // barycentric coordinates (0,0,1) } - // Check if P in edge region of AC, if so return projection of P onto AC - b3Scalar vb = d5*d2 - d1*d6; - if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0)) { - b3Scalar w = d2 / (d2 - d6); + // Check if P in edge region of AC, if so return projection of P onto AC + b3Scalar vb = d5 * d2 - d1 * d6; + if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0)) + { + b3Scalar w = d2 / (d2 - d6); result.m_closestPointOnSimplex = a + w * ac; result.m_usedVertices.usedVertexA = true; result.m_usedVertices.usedVertexC = true; - result.setBarycentricCoordinates(1-w,0,w); + result.setBarycentricCoordinates(1 - w, 0, w); return true; - //return a + w * ac; // barycentric coordinates (1-w,0,w) - } + //return a + w * ac; // barycentric coordinates (1-w,0,w) + } + + // Check if P in edge region of BC, if so return projection of P onto BC + b3Scalar va = d3 * d6 - d5 * d4; + if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0)) + { + b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6)); - // Check if P in edge 
region of BC, if so return projection of P onto BC - b3Scalar va = d3*d6 - d5*d4; - if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0)) { - b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6)); - result.m_closestPointOnSimplex = b + w * (c - b); result.m_usedVertices.usedVertexB = true; result.m_usedVertices.usedVertexC = true; - result.setBarycentricCoordinates(0,1-w,w); - return true; - // return b + w * (c - b); // barycentric coordinates (0,1-w,w) - } - - // P inside face region. Compute Q through its barycentric coordinates (u,v,w) - b3Scalar denom = b3Scalar(1.0) / (va + vb + vc); - b3Scalar v = vb * denom; - b3Scalar w = vc * denom; - + result.setBarycentricCoordinates(0, 1 - w, w); + return true; + // return b + w * (c - b); // barycentric coordinates (0,1-w,w) + } + + // P inside face region. Compute Q through its barycentric coordinates (u,v,w) + b3Scalar denom = b3Scalar(1.0) / (va + vb + vc); + b3Scalar v = vb * denom; + b3Scalar w = vc * denom; + result.m_closestPointOnSimplex = a + ab * v + ac * w; result.m_usedVertices.usedVertexA = true; result.m_usedVertices.usedVertexB = true; result.m_usedVertices.usedVertexC = true; - result.setBarycentricCoordinates(1-v-w,v,w); - - return true; -// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w + result.setBarycentricCoordinates(1 - v - w, v, w); + return true; + // return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w } - - - - /// Test if point p and d lie on opposite sides of plane through abc int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d) { - b3Vector3 normal = (b-a).cross(c-a); + b3Vector3 normal = (b - a).cross(c - a); - b3Scalar signp = (p - a).dot(normal); // [AP AB AC] - b3Scalar signd = (d - a).dot( normal); // [AD AB AC] + b3Scalar signp = (p - a).dot(normal); // [AP AB AC] + b3Scalar signd = (d - 
a).dot(normal); // [AD AB AC] #ifdef B3_CATCH_DEGENERATE_TETRAHEDRON #ifdef BT_USE_DOUBLE_PRECISION -if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8))) + if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8))) { return -1; } #else if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4))) { -// printf("affine dependent/degenerate\n");// + // printf("affine dependent/degenerate\n");// return -1; } #endif #endif // Points on opposite sides if expression signs are opposite - return signp * signd < b3Scalar(0.); + return signp * signd < b3Scalar(0.); } - -bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult) +bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult) { b3SubSimplexClosestResult tempResult; - // Start out assuming point inside all halfspaces, so closest to itself + // Start out assuming point inside all halfspaces, so closest to itself finalResult.m_closestPointOnSimplex = p; finalResult.m_usedVertices.reset(); - finalResult.m_usedVertices.usedVertexA = true; + finalResult.m_usedVertices.usedVertexA = true; finalResult.m_usedVertices.usedVertexB = true; finalResult.m_usedVertices.usedVertexC = true; finalResult.m_usedVertices.usedVertexD = true; - int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d); + int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d); int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b); - int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c); - int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a); - - if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0) - { - finalResult.m_degenerate = true; - return false; - } + int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c); + int pointOutsideBDC = 
pointOutsideOfPlane(p, b, d, c, a); - if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC) - { - return false; - } + if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0) + { + finalResult.m_degenerate = true; + return false; + } + if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC) + { + return false; + } - b3Scalar bestSqDist = FLT_MAX; - // If point outside face abc then compute closest point on abc - if (pointOutsideABC) + b3Scalar bestSqDist = FLT_MAX; + // If point outside face abc then compute closest point on abc + if (pointOutsideABC) { - closestPtPointTriangle(p, a, b, c,tempResult); + closestPtPointTriangle(p, a, b, c, tempResult); b3Vector3 q = tempResult.m_closestPointOnSimplex; - - b3Scalar sqDist = (q - p).dot( q - p); - // Update best closest point if (squared) distance is less than current best - if (sqDist < bestSqDist) { + + b3Scalar sqDist = (q - p).dot(q - p); + // Update best closest point if (squared) distance is less than current best + if (sqDist < bestSqDist) + { bestSqDist = sqDist; finalResult.m_closestPointOnSimplex = q; //convert result bitmask! 
@@ -501,25 +478,22 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB; finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC; finalResult.setBarycentricCoordinates( - tempResult.m_barycentricCoords[VERTA], - tempResult.m_barycentricCoords[VERTB], - tempResult.m_barycentricCoords[VERTC], - 0 - ); - + tempResult.m_barycentricCoords[VERTA], + tempResult.m_barycentricCoords[VERTB], + tempResult.m_barycentricCoords[VERTC], + 0); } - } - + } // Repeat test for face acd - if (pointOutsideACD) + if (pointOutsideACD) { - closestPtPointTriangle(p, a, c, d,tempResult); + closestPtPointTriangle(p, a, c, d, tempResult); b3Vector3 q = tempResult.m_closestPointOnSimplex; //convert result bitmask! - b3Scalar sqDist = (q - p).dot( q - p); - if (sqDist < bestSqDist) + b3Scalar sqDist = (q - p).dot(q - p); + if (sqDist < bestSqDist) { bestSqDist = sqDist; finalResult.m_closestPointOnSimplex = q; @@ -529,52 +503,46 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB; finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC; finalResult.setBarycentricCoordinates( - tempResult.m_barycentricCoords[VERTA], - 0, - tempResult.m_barycentricCoords[VERTB], - tempResult.m_barycentricCoords[VERTC] - ); - + tempResult.m_barycentricCoords[VERTA], + 0, + tempResult.m_barycentricCoords[VERTB], + tempResult.m_barycentricCoords[VERTC]); } - } - // Repeat test for face adb + } + // Repeat test for face adb - if (pointOutsideADB) { - closestPtPointTriangle(p, a, d, b,tempResult); + closestPtPointTriangle(p, a, d, b, tempResult); b3Vector3 q = tempResult.m_closestPointOnSimplex; //convert result bitmask! 
- b3Scalar sqDist = (q - p).dot( q - p); - if (sqDist < bestSqDist) + b3Scalar sqDist = (q - p).dot(q - p); + if (sqDist < bestSqDist) { bestSqDist = sqDist; finalResult.m_closestPointOnSimplex = q; finalResult.m_usedVertices.reset(); finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA; finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC; - + finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB; finalResult.setBarycentricCoordinates( - tempResult.m_barycentricCoords[VERTA], - tempResult.m_barycentricCoords[VERTC], - 0, - tempResult.m_barycentricCoords[VERTB] - ); - + tempResult.m_barycentricCoords[VERTA], + tempResult.m_barycentricCoords[VERTC], + 0, + tempResult.m_barycentricCoords[VERTB]); } - } - // Repeat test for face bdc - + } + // Repeat test for face bdc if (pointOutsideBDC) { - closestPtPointTriangle(p, b, d, c,tempResult); + closestPtPointTriangle(p, b, d, c, tempResult); b3Vector3 q = tempResult.m_closestPointOnSimplex; //convert result bitmask! - b3Scalar sqDist = (q - p).dot( q - p); - if (sqDist < bestSqDist) + b3Scalar sqDist = (q - p).dot(q - p); + if (sqDist < bestSqDist) { bestSqDist = sqDist; finalResult.m_closestPointOnSimplex = q; @@ -585,25 +553,22 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB; finalResult.setBarycentricCoordinates( - 0, - tempResult.m_barycentricCoords[VERTA], - tempResult.m_barycentricCoords[VERTC], - tempResult.m_barycentricCoords[VERTB] - ); - + 0, + tempResult.m_barycentricCoords[VERTA], + tempResult.m_barycentricCoords[VERTC], + tempResult.m_barycentricCoords[VERTB]); } - } + } //help! we ended up full ! 
- + if (finalResult.m_usedVertices.usedVertexA && finalResult.m_usedVertices.usedVertexB && finalResult.m_usedVertices.usedVertexC && - finalResult.m_usedVertices.usedVertexD) + finalResult.m_usedVertices.usedVertexD) { return true; } - return true; + return true; } - diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h index a6e27667d8..b40b169978 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3VoronoiSimplexSolver.h @@ -13,22 +13,19 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - - #ifndef B3_VORONOI_SIMPLEX_SOLVER_H #define B3_VORONOI_SIMPLEX_SOLVER_H #include "Bullet3Common/b3Vector3.h" - #define VORONOI_SIMPLEX_MAX_VERTS 5 ///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure //#define BT_USE_EQUAL_VERTEX_THRESHOLD #define VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD 0.0001f - -struct b3UsageBitfield{ +struct b3UsageBitfield +{ b3UsageBitfield() { reset(); @@ -41,137 +38,127 @@ struct b3UsageBitfield{ usedVertexC = false; usedVertexD = false; } - unsigned short usedVertexA : 1; - unsigned short usedVertexB : 1; - unsigned short usedVertexC : 1; - unsigned short usedVertexD : 1; - unsigned short unused1 : 1; - unsigned short unused2 : 1; - unsigned short unused3 : 1; - unsigned short unused4 : 1; + unsigned short usedVertexA : 1; + unsigned short usedVertexB : 1; + unsigned short usedVertexC : 1; + unsigned short usedVertexD : 1; + unsigned short unused1 : 1; + unsigned short unused2 : 1; + unsigned short unused3 : 1; + unsigned short unused4 : 1; }; - -struct b3SubSimplexClosestResult +struct b3SubSimplexClosestResult { - b3Vector3 m_closestPointOnSimplex; + b3Vector3 m_closestPointOnSimplex; //MASK 
for m_usedVertices - //stores the simplex vertex-usage, using the MASK, + //stores the simplex vertex-usage, using the MASK, // if m_usedVertices & MASK then the related vertex is used - b3UsageBitfield m_usedVertices; - b3Scalar m_barycentricCoords[4]; + b3UsageBitfield m_usedVertices; + b3Scalar m_barycentricCoords[4]; bool m_degenerate; - void reset() + void reset() { m_degenerate = false; setBarycentricCoordinates(); m_usedVertices.reset(); } - bool isValid() + bool isValid() { bool valid = (m_barycentricCoords[0] >= b3Scalar(0.)) && - (m_barycentricCoords[1] >= b3Scalar(0.)) && - (m_barycentricCoords[2] >= b3Scalar(0.)) && - (m_barycentricCoords[3] >= b3Scalar(0.)); - + (m_barycentricCoords[1] >= b3Scalar(0.)) && + (m_barycentricCoords[2] >= b3Scalar(0.)) && + (m_barycentricCoords[3] >= b3Scalar(0.)); return valid; } - void setBarycentricCoordinates(b3Scalar a=b3Scalar(0.),b3Scalar b=b3Scalar(0.),b3Scalar c=b3Scalar(0.),b3Scalar d=b3Scalar(0.)) + void setBarycentricCoordinates(b3Scalar a = b3Scalar(0.), b3Scalar b = b3Scalar(0.), b3Scalar c = b3Scalar(0.), b3Scalar d = b3Scalar(0.)) { m_barycentricCoords[0] = a; m_barycentricCoords[1] = b; m_barycentricCoords[2] = c; m_barycentricCoords[3] = d; } - }; /// b3VoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin. /// Can be used with GJK, as an alternative to Johnson distance algorithm. 
-B3_ATTRIBUTE_ALIGNED16(class) b3VoronoiSimplexSolver +B3_ATTRIBUTE_ALIGNED16(class) +b3VoronoiSimplexSolver { public: - B3_DECLARE_ALIGNED_ALLOCATOR(); - int m_numVertices; - - b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS]; - b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS]; - b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS]; + int m_numVertices; - + b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS]; + b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS]; + b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS]; - b3Vector3 m_cachedP1; - b3Vector3 m_cachedP2; - b3Vector3 m_cachedV; - b3Vector3 m_lastW; - - b3Scalar m_equalVertexThreshold; - bool m_cachedValidClosest; + b3Vector3 m_cachedP1; + b3Vector3 m_cachedP2; + b3Vector3 m_cachedV; + b3Vector3 m_lastW; + b3Scalar m_equalVertexThreshold; + bool m_cachedValidClosest; b3SubSimplexClosestResult m_cachedBC; - bool m_needsUpdate; - - void removeVertex(int index); - void reduceVertices (const b3UsageBitfield& usedVerts); - bool updateClosestVectorAndPoints(); + bool m_needsUpdate; - bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult); - int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d); - bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result); + void removeVertex(int index); + void reduceVertices(const b3UsageBitfield& usedVerts); + bool updateClosestVectorAndPoints(); -public: + bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult); + int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d); + bool closestPtPointTriangle(const b3Vector3& 
p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result); +public: b3VoronoiSimplexSolver() - : m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD) + : m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD) { } - void reset(); - - void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q); + void reset(); - void setEqualVertexThreshold(b3Scalar threshold) - { - m_equalVertexThreshold = threshold; - } + void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q); - b3Scalar getEqualVertexThreshold() const - { - return m_equalVertexThreshold; - } + void setEqualVertexThreshold(b3Scalar threshold) + { + m_equalVertexThreshold = threshold; + } - bool closest(b3Vector3& v); + b3Scalar getEqualVertexThreshold() const + { + return m_equalVertexThreshold; + } - b3Scalar maxVertex(); + bool closest(b3Vector3 & v); - bool fullSimplex() const - { - return (m_numVertices == 4); - } + b3Scalar maxVertex(); - int getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const; + bool fullSimplex() const + { + return (m_numVertices == 4); + } - bool inSimplex(const b3Vector3& w); - - void backup_closest(b3Vector3& v) ; + int getSimplex(b3Vector3 * pBuf, b3Vector3 * qBuf, b3Vector3 * yBuf) const; - bool emptySimplex() const ; + bool inSimplex(const b3Vector3& w); - void compute_points(b3Vector3& p1, b3Vector3& p2) ; + void backup_closest(b3Vector3 & v); - int numVertices() const - { - return m_numVertices; - } + bool emptySimplex() const; + void compute_points(b3Vector3 & p1, b3Vector3 & p2); + int numVertices() const + { + return m_numVertices; + } }; -#endif //B3_VORONOI_SIMPLEX_SOLVER_H - +#endif //B3_VORONOI_SIMPLEX_SOLVER_H diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h index 4b3b49eae8..f1df8a6970 100644 --- 
a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h @@ -1,258 +1,257 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* bvhTraversalKernelCL= \ -"//keep this enum in sync with the CPU version (in btCollidable.h)\n" -"//written by Erwin Coumans\n" -"#define SHAPE_CONVEX_HULL 3\n" -"#define SHAPE_CONCAVE_TRIMESH 5\n" -"#define TRIANGLE_NUM_CONVEX_FACES 5\n" -"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" -"#define SHAPE_SPHERE 7\n" -"typedef unsigned int u32;\n" -"#define MAX_NUM_PARTS_IN_BITS 10\n" -"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n" -"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" -"typedef struct\n" -"{\n" -" //12 bytes\n" -" unsigned short int m_quantizedAabbMin[3];\n" -" unsigned short int m_quantizedAabbMax[3];\n" -" //4 bytes\n" -" int m_escapeIndexOrTriangleIndex;\n" -"} btQuantizedBvhNode;\n" -"typedef struct\n" -"{\n" -" float4 m_aabbMin;\n" -" float4 m_aabbMax;\n" -" float4 m_quantization;\n" -" int m_numNodes;\n" -" int m_numSubTrees;\n" -" int m_nodeOffset;\n" -" int m_subTreeOffset;\n" -"} b3BvhInfo;\n" -"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n" -"{\n" -" unsigned int x=0;\n" -" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" -" // Get only the lower bits where the triangle index is stored\n" -" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" -"}\n" -"int isLeaf(const btQuantizedBvhNode* rootNode)\n" -"{\n" -" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" -" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 
1 : 0;\n" -"}\n" -" \n" -"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n" -"{\n" -" return -rootNode->m_escapeIndexOrTriangleIndex;\n" -"}\n" -"typedef struct\n" -"{\n" -" //12 bytes\n" -" unsigned short int m_quantizedAabbMin[3];\n" -" unsigned short int m_quantizedAabbMax[3];\n" -" //4 bytes, points to the root of the subtree\n" -" int m_rootNodeIndex;\n" -" //4 bytes\n" -" int m_subtreeSize;\n" -" int m_padding[3];\n" -"} btBvhSubtreeInfo;\n" -"///keep this in sync with btCollidable.h\n" -"typedef struct\n" -"{\n" -" int m_numChildShapes;\n" -" int blaat2;\n" -" int m_shapeType;\n" -" int m_shapeIndex;\n" -" \n" -"} btCollidableGpu;\n" -"typedef struct\n" -"{\n" -" float4 m_childPosition;\n" -" float4 m_childOrientation;\n" -" int m_shapeIndex;\n" -" int m_unused0;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"} btGpuChildShape;\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" float4 m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} BodyData;\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} btAabbCL;\n" -"int testQuantizedAabbAgainstQuantizedAabb(\n" -" const unsigned short int* aabbMin1,\n" -" const unsigned short int* aabbMax1,\n" -" const unsigned short int* aabbMin2,\n" -" const unsigned short int* aabbMax2)\n" -"{\n" -" //int overlap = 1;\n" -" if (aabbMin1[0] > aabbMax2[0])\n" -" return 0;\n" -" if (aabbMax1[0] < aabbMin2[0])\n" -" return 0;\n" -" if (aabbMin1[1] > aabbMax2[1])\n" -" return 0;\n" -" if (aabbMax1[1] < aabbMin2[1])\n" -" return 0;\n" -" if (aabbMin1[2] > aabbMax2[2])\n" -" return 0;\n" -" if (aabbMax1[2] < aabbMin2[2])\n" -" return 0;\n" -" return 1;\n" -" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < 
aabbMin2[0])) ? 0 : overlap;\n" -" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n" -" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n" -" //return overlap;\n" -"}\n" -"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n" -"{\n" -" float4 clampedPoint = max(point2,bvhAabbMin);\n" -" clampedPoint = min (clampedPoint, bvhAabbMax);\n" -" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n" -" if (isMax)\n" -" {\n" -" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n" -" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n" -" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n" -" } else\n" -" {\n" -" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n" -" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n" -" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n" -" }\n" -"}\n" -"// work-in-progress\n" -"__kernel void bvhTraversalKernel( __global const int4* pairs, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global btAabbCL* aabbs,\n" -" __global int4* concavePairsOut,\n" -" __global volatile int* numConcavePairsOut,\n" -" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n" -" __global const btQuantizedBvhNode* quantizedNodesRoot,\n" -" __global const b3BvhInfo* bvhInfos,\n" -" int numPairs,\n" -" int maxNumConcavePairsCapacity)\n" -"{\n" -" int id = get_global_id(0);\n" -" if (id>=numPairs)\n" -" return;\n" -" \n" -" int bodyIndexA = pairs[id].x;\n" -" int bodyIndexB = pairs[id].y;\n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" //once the broadphase avoids static-static pairs, we can remove this test\n" -" if ((rigidBodies[bodyIndexA].m_invMass==0) 
&&(rigidBodies[bodyIndexB].m_invMass==0))\n" -" {\n" -" return;\n" -" }\n" -" \n" -" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n" -" return;\n" -" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" -" \n" -" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n" -" shapeTypeB!=SHAPE_SPHERE &&\n" -" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n" -" )\n" -" return;\n" -" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n" -" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n" -" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n" -" float4 bvhQuantization = bvhInfo.m_quantization;\n" -" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n" -" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n" -" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n" -" \n" -" unsigned short int quantizedQueryAabbMin[3];\n" -" unsigned short int quantizedQueryAabbMax[3];\n" -" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n" -" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n" -" \n" -" for (int i=0;i<numSubtreeHeaders;i++)\n" -" {\n" -" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n" -" \n" -" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n" -" if (overlap != 0)\n" -" {\n" -" int startNodeIndex = subtree.m_rootNodeIndex;\n" -" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n" -" int curIndex = startNodeIndex;\n" -" int escapeIndex;\n" -" int isLeafNode;\n" -" int aabbOverlap;\n" -" while (curIndex < endNodeIndex)\n" -" {\n" -" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n" -" aabbOverlap = 
testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n" -" isLeafNode = isLeaf(&rootNode);\n" -" if (aabbOverlap)\n" -" {\n" -" if (isLeafNode)\n" -" {\n" -" int triangleIndex = getTriangleIndex(&rootNode);\n" -" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" -" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n" -" for (int b=0;b<numChildrenB;b++)\n" -" {\n" -" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n" -" {\n" -" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" -" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n" -" concavePairsOut[pairIdx+b] = newPair;\n" -" }\n" -" }\n" -" } else\n" -" {\n" -" int pairIdx = atomic_inc(numConcavePairsOut);\n" -" if (pairIdx<maxNumConcavePairsCapacity)\n" -" {\n" -" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n" -" concavePairsOut[pairIdx] = newPair;\n" -" }\n" -" }\n" -" } \n" -" curIndex++;\n" -" } else\n" -" {\n" -" if (isLeafNode)\n" -" {\n" -" curIndex++;\n" -" } else\n" -" {\n" -" escapeIndex = getEscapeIndex(&rootNode);\n" -" curIndex += escapeIndex;\n" -" }\n" -" }\n" -" }\n" -" }\n" -" }\n" -"}\n" -; +static const char* bvhTraversalKernelCL = + "//keep this enum in sync with the CPU version (in btCollidable.h)\n" + "//written by Erwin Coumans\n" + "#define SHAPE_CONVEX_HULL 3\n" + "#define SHAPE_CONCAVE_TRIMESH 5\n" + "#define TRIANGLE_NUM_CONVEX_FACES 5\n" + "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" + "#define SHAPE_SPHERE 7\n" + "typedef unsigned int u32;\n" + "#define MAX_NUM_PARTS_IN_BITS 10\n" + "///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n" + "///Node can be used for leafnode or internal node. 
Leafnodes can point to 32-bit triangle index (non-negative range).\n" + "typedef struct\n" + "{\n" + " //12 bytes\n" + " unsigned short int m_quantizedAabbMin[3];\n" + " unsigned short int m_quantizedAabbMax[3];\n" + " //4 bytes\n" + " int m_escapeIndexOrTriangleIndex;\n" + "} btQuantizedBvhNode;\n" + "typedef struct\n" + "{\n" + " float4 m_aabbMin;\n" + " float4 m_aabbMax;\n" + " float4 m_quantization;\n" + " int m_numNodes;\n" + " int m_numSubTrees;\n" + " int m_nodeOffset;\n" + " int m_subTreeOffset;\n" + "} b3BvhInfo;\n" + "int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n" + "{\n" + " unsigned int x=0;\n" + " unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" + " // Get only the lower bits where the triangle index is stored\n" + " return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" + "}\n" + "int isLeaf(const btQuantizedBvhNode* rootNode)\n" + "{\n" + " //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" + " return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 
1 : 0;\n" + "}\n" + " \n" + "int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n" + "{\n" + " return -rootNode->m_escapeIndexOrTriangleIndex;\n" + "}\n" + "typedef struct\n" + "{\n" + " //12 bytes\n" + " unsigned short int m_quantizedAabbMin[3];\n" + " unsigned short int m_quantizedAabbMax[3];\n" + " //4 bytes, points to the root of the subtree\n" + " int m_rootNodeIndex;\n" + " //4 bytes\n" + " int m_subtreeSize;\n" + " int m_padding[3];\n" + "} btBvhSubtreeInfo;\n" + "///keep this in sync with btCollidable.h\n" + "typedef struct\n" + "{\n" + " int m_numChildShapes;\n" + " int blaat2;\n" + " int m_shapeType;\n" + " int m_shapeIndex;\n" + " \n" + "} btCollidableGpu;\n" + "typedef struct\n" + "{\n" + " float4 m_childPosition;\n" + " float4 m_childOrientation;\n" + " int m_shapeIndex;\n" + " int m_unused0;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "} btGpuChildShape;\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " float4 m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} BodyData;\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " };\n" + "} btAabbCL;\n" + "int testQuantizedAabbAgainstQuantizedAabb(\n" + " const unsigned short int* aabbMin1,\n" + " const unsigned short int* aabbMax1,\n" + " const unsigned short int* aabbMin2,\n" + " const unsigned short int* aabbMax2)\n" + "{\n" + " //int overlap = 1;\n" + " if (aabbMin1[0] > aabbMax2[0])\n" + " return 0;\n" + " if (aabbMax1[0] < aabbMin2[0])\n" + " return 0;\n" + " if (aabbMin1[1] > aabbMax2[1])\n" + " return 0;\n" + " if (aabbMax1[1] < aabbMin2[1])\n" + " return 0;\n" + " if (aabbMin1[2] > aabbMax2[2])\n" + " return 0;\n" + " if (aabbMax1[2] < aabbMin2[2])\n" + " return 0;\n" + 
" return 1;\n" + " //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n" + " //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n" + " //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n" + " //return overlap;\n" + "}\n" + "void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n" + "{\n" + " float4 clampedPoint = max(point2,bvhAabbMin);\n" + " clampedPoint = min (clampedPoint, bvhAabbMax);\n" + " float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n" + " if (isMax)\n" + " {\n" + " out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n" + " out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n" + " out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n" + " } else\n" + " {\n" + " out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n" + " out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n" + " out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n" + " }\n" + "}\n" + "// work-in-progress\n" + "__kernel void bvhTraversalKernel( __global const int4* pairs, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global btAabbCL* aabbs,\n" + " __global int4* concavePairsOut,\n" + " __global volatile int* numConcavePairsOut,\n" + " __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n" + " __global const btQuantizedBvhNode* quantizedNodesRoot,\n" + " __global const b3BvhInfo* bvhInfos,\n" + " int numPairs,\n" + " int maxNumConcavePairsCapacity)\n" + "{\n" + " int id = get_global_id(0);\n" + " if (id>=numPairs)\n" + " return;\n" + " \n" + " int bodyIndexA = pairs[id].x;\n" + " int bodyIndexB = pairs[id].y;\n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " //once the 
broadphase avoids static-static pairs, we can remove this test\n" + " if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" + " {\n" + " return;\n" + " }\n" + " \n" + " if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n" + " return;\n" + " int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" + " \n" + " if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n" + " shapeTypeB!=SHAPE_SPHERE &&\n" + " shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n" + " )\n" + " return;\n" + " b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n" + " float4 bvhAabbMin = bvhInfo.m_aabbMin;\n" + " float4 bvhAabbMax = bvhInfo.m_aabbMax;\n" + " float4 bvhQuantization = bvhInfo.m_quantization;\n" + " int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n" + " __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n" + " __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n" + " \n" + " unsigned short int quantizedQueryAabbMin[3];\n" + " unsigned short int quantizedQueryAabbMax[3];\n" + " quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n" + " quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n" + " \n" + " for (int i=0;i<numSubtreeHeaders;i++)\n" + " {\n" + " btBvhSubtreeInfo subtree = subtreeHeaders[i];\n" + " \n" + " int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n" + " if (overlap != 0)\n" + " {\n" + " int startNodeIndex = subtree.m_rootNodeIndex;\n" + " int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n" + " int curIndex = startNodeIndex;\n" + " int escapeIndex;\n" + " int isLeafNode;\n" + " int aabbOverlap;\n" + " while (curIndex < endNodeIndex)\n" + " {\n" + " btQuantizedBvhNode rootNode = 
quantizedNodes[curIndex];\n" + " aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n" + " isLeafNode = isLeaf(&rootNode);\n" + " if (aabbOverlap)\n" + " {\n" + " if (isLeafNode)\n" + " {\n" + " int triangleIndex = getTriangleIndex(&rootNode);\n" + " if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" + " int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n" + " for (int b=0;b<numChildrenB;b++)\n" + " {\n" + " if ((pairIdx+b)<maxNumConcavePairsCapacity)\n" + " {\n" + " int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" + " int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n" + " concavePairsOut[pairIdx+b] = newPair;\n" + " }\n" + " }\n" + " } else\n" + " {\n" + " int pairIdx = atomic_inc(numConcavePairsOut);\n" + " if (pairIdx<maxNumConcavePairsCapacity)\n" + " {\n" + " int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n" + " concavePairsOut[pairIdx] = newPair;\n" + " }\n" + " }\n" + " } \n" + " curIndex++;\n" + " } else\n" + " {\n" + " if (isLeafNode)\n" + " {\n" + " curIndex++;\n" + " } else\n" + " {\n" + " escapeIndex = getEscapeIndex(&rootNode);\n" + " curIndex += escapeIndex;\n" + " }\n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h index 7ed4b382c3..74959a931c 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h @@ -1,1446 +1,1445 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* mprKernelsCL= \ -"/***\n" -" * ---------------------------------\n" -" * Copyright 
(c)2012 Daniel Fiser <danfis@danfis.cz>\n" -" *\n" -" * This file was ported from mpr.c file, part of libccd.\n" -" * The Minkoski Portal Refinement implementation was ported \n" -" * to OpenCL by Erwin Coumans for the Bullet 3 Physics library.\n" -" * at http://github.com/erwincoumans/bullet3\n" -" *\n" -" * Distributed under the OSI-approved BSD License (the \"License\");\n" -" * see <http://www.opensource.org/licenses/bsd-license.php>.\n" -" * This software is distributed WITHOUT ANY WARRANTY; without even the\n" -" * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n" -" * See the License for more information.\n" -" */\n" -"#ifndef B3_MPR_PENETRATION_H\n" -"#define B3_MPR_PENETRATION_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" 
-" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_RIGIDBODY_DATA_H\n" -"#define B3_RIGIDBODY_DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#define B3_QUAT_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Quat;\n" -" #define b3QuatConstArg const b3Quat\n" -" \n" -" \n" -"inline float4 b3FastNormalize4(float4 v)\n" -"{\n" -" v = (float4)(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -" \n" -"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" -"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" -"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" -"{\n" -" b3Quat ans;\n" -" ans = b3Cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - 
b3Dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" -"{\n" -" b3Quat q;\n" -" q=in;\n" -" //return b3FastNormalize4(in);\n" -" float len = native_sqrt(dot(q, q));\n" -" if(len > 0.f)\n" -" {\n" -" q *= 1.f / len;\n" -" }\n" -" else\n" -" {\n" -" q.x = q.y = q.z = 0.f;\n" -" q.w = 1.f;\n" -" }\n" -" return q;\n" -"}\n" -"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" b3Quat qInv = b3QuatInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" return b3QuatRotate( b3QuatInvert( q ), vec );\n" -"}\n" -"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" -"{\n" -" return b3QuatRotate( orientation, point ) + (translation);\n" -"}\n" -" \n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifndef B3_MAT3x3_H\n" -"#define B3_MAT3x3_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"typedef struct\n" -"{\n" -" b3Float4 m_row[3];\n" -"}b3Mat3x3;\n" -"#define b3Mat3x3ConstArg const b3Mat3x3\n" -"#define b3GetRow(m,row) (m.m_row[row])\n" -"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" -"{\n" -" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" -" b3Mat3x3 out;\n" -" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" -" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" -" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" -" out.m_row[0].w = 0.f;\n" -" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" -" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" -" 
out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" -" out.m_row[1].w = 0.f;\n" -" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" -" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" -" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" -" out.m_row[2].w = 0.f;\n" -" return out;\n" -"}\n" -"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = fabs(matIn.m_row[0]);\n" -" out.m_row[1] = fabs(matIn.m_row[1]);\n" -" out.m_row[2] = fabs(matIn.m_row[2]);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtZero();\n" -"__inline\n" -"b3Mat3x3 mtIdentity();\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Mat3x3 mtZero()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(0.f);\n" -" m.m_row[1] = (b3Float4)(0.f);\n" -" m.m_row[2] = (b3Float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtIdentity()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(1,0,0,0);\n" -" m.m_row[1] = (b3Float4)(0,1,0,0);\n" -" m.m_row[2] = (b3Float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" -"{\n" -" b3Mat3x3 transB;\n" -" transB = mtTranspose( b );\n" -" b3Mat3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" -" 
ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" -"{\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a.m_row[0], b );\n" -" ans.y = b3Dot3F4( a.m_row[1], b );\n" -" ans.z = b3Dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" -"{\n" -" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a, colx );\n" -" ans.y = b3Dot3F4( a, coly );\n" -" ans.z = b3Dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" -"struct b3RigidBodyData\n" -"{\n" -" b3Float4 m_pos;\n" -" b3Quat m_quat;\n" -" b3Float4 m_linVel;\n" -" b3Float4 m_angVel;\n" -" int m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"};\n" -"typedef struct b3InertiaData b3InertiaData_t;\n" -"struct b3InertiaData\n" -"{\n" -" b3Mat3x3 m_invInertiaWorld;\n" -" b3Mat3x3 m_initInvInertia;\n" -"};\n" -"#endif //B3_RIGIDBODY_DATA_H\n" -" \n" -"#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n" -"#define B3_CONVEX_POLYHEDRON_DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"typedef struct b3GpuFace b3GpuFace_t;\n" -"struct b3GpuFace\n" -"{\n" -" b3Float4 m_plane;\n" -" int m_indexOffset;\n" -" int m_numIndices;\n" -" int m_unusedPadding1;\n" -" int m_unusedPadding2;\n" -"};\n" -"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n" -"struct 
b3ConvexPolyhedronData\n" -"{\n" -" b3Float4 m_localCenter;\n" -" b3Float4 m_extents;\n" -" b3Float4 mC;\n" -" b3Float4 mE;\n" -" float m_radius;\n" -" int m_faceOffset;\n" -" int m_numFaces;\n" -" int m_numVertices;\n" -" int m_vertexOffset;\n" -" int m_uniqueEdgesOffset;\n" -" int m_numUniqueEdges;\n" -" int m_unused;\n" -"};\n" -"#endif //B3_CONVEX_POLYHEDRON_DATA_H\n" -"#ifndef B3_COLLIDABLE_H\n" -"#define B3_COLLIDABLE_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"enum b3ShapeTypes\n" -"{\n" -" SHAPE_HEIGHT_FIELD=1,\n" -" SHAPE_CONVEX_HULL=3,\n" -" SHAPE_PLANE=4,\n" -" SHAPE_CONCAVE_TRIMESH=5,\n" -" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" -" SHAPE_SPHERE=7,\n" -" MAX_NUM_SHAPE_TYPES,\n" -"};\n" -"typedef struct b3Collidable b3Collidable_t;\n" -"struct b3Collidable\n" -"{\n" -" union {\n" -" int m_numChildShapes;\n" -" int m_bvhIndex;\n" -" };\n" -" union\n" -" {\n" -" float m_radius;\n" -" int m_compoundBvhIndex;\n" -" };\n" -" int m_shapeType;\n" -" int m_shapeIndex;\n" -"};\n" -"typedef struct b3GpuChildShape b3GpuChildShape_t;\n" -"struct b3GpuChildShape\n" -"{\n" -" b3Float4 m_childPosition;\n" -" b3Quat m_childOrientation;\n" -" int m_shapeIndex;\n" -" int m_unused0;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"struct b3CompoundOverlappingPair\n" -"{\n" -" int m_bodyIndexA;\n" -" int m_bodyIndexB;\n" -"// int m_pairType;\n" -" int m_childShapeIndexA;\n" -" int m_childShapeIndexB;\n" -"};\n" -"#endif //B3_COLLIDABLE_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#define B3_MPR_SQRT sqrt\n" -"#endif\n" -"#define B3_MPR_FMIN(x, y) ((x) < (y) ? 
(x) : (y))\n" -"#define B3_MPR_FABS fabs\n" -"#define B3_MPR_TOLERANCE 1E-6f\n" -"#define B3_MPR_MAX_ITERATIONS 1000\n" -"struct _b3MprSupport_t \n" -"{\n" -" b3Float4 v; //!< Support point in minkowski sum\n" -" b3Float4 v1; //!< Support point in obj1\n" -" b3Float4 v2; //!< Support point in obj2\n" -"};\n" -"typedef struct _b3MprSupport_t b3MprSupport_t;\n" -"struct _b3MprSimplex_t \n" -"{\n" -" b3MprSupport_t ps[4];\n" -" int last; //!< index of last added point\n" -"};\n" -"typedef struct _b3MprSimplex_t b3MprSimplex_t;\n" -"inline b3MprSupport_t* b3MprSimplexPointW(b3MprSimplex_t *s, int idx)\n" -"{\n" -" return &s->ps[idx];\n" -"}\n" -"inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size)\n" -"{\n" -" s->last = size - 1;\n" -"}\n" -"inline int b3MprSimplexSize(const b3MprSimplex_t *s)\n" -"{\n" -" return s->last + 1;\n" -"}\n" -"inline const b3MprSupport_t* b3MprSimplexPoint(const b3MprSimplex_t* s, int idx)\n" -"{\n" -" // here is no check on boundaries\n" -" return &s->ps[idx];\n" -"}\n" -"inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s)\n" -"{\n" -" *d = *s;\n" -"}\n" -"inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a)\n" -"{\n" -" b3MprSupportCopy(s->ps + pos, a);\n" -"}\n" -"inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2)\n" -"{\n" -" b3MprSupport_t supp;\n" -" b3MprSupportCopy(&supp, &s->ps[pos1]);\n" -" b3MprSupportCopy(&s->ps[pos1], &s->ps[pos2]);\n" -" b3MprSupportCopy(&s->ps[pos2], &supp);\n" -"}\n" -"inline int b3MprIsZero(float val)\n" -"{\n" -" return B3_MPR_FABS(val) < FLT_EPSILON;\n" -"}\n" -"inline int b3MprEq(float _a, float _b)\n" -"{\n" -" float ab;\n" -" float a, b;\n" -" ab = B3_MPR_FABS(_a - _b);\n" -" if (B3_MPR_FABS(ab) < FLT_EPSILON)\n" -" return 1;\n" -" a = B3_MPR_FABS(_a);\n" -" b = B3_MPR_FABS(_b);\n" -" if (b > a){\n" -" return ab < FLT_EPSILON * b;\n" -" }else{\n" -" return ab < FLT_EPSILON * a;\n" -" }\n" -"}\n" -"inline int 
b3MprVec3Eq(const b3Float4* a, const b3Float4 *b)\n" -"{\n" -" return b3MprEq((*a).x, (*b).x)\n" -" && b3MprEq((*a).y, (*b).y)\n" -" && b3MprEq((*a).z, (*b).z);\n" -"}\n" -"inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec,__global const b3ConvexPolyhedronData_t* hull, b3ConstArray(b3Float4) verticesA)\n" -"{\n" -" b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n" -" float maxDot = -B3_LARGE_FLOAT;\n" -" if( 0 < hull->m_numVertices )\n" -" {\n" -" const b3Float4 scaled = supportVec;\n" -" int index = b3MaxDot(scaled, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n" -" return verticesA[hull->m_vertexOffset+index];\n" -" }\n" -" return supVec;\n" -"}\n" -"B3_STATIC void b3MprConvexSupport(int pairIndex,int bodyIndex, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" -" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" -" b3ConstArray(b3Collidable_t) cpuCollidables,\n" -" b3ConstArray(b3Float4) cpuVertices,\n" -" __global b3Float4* sepAxis,\n" -" const b3Float4* _dir, b3Float4* outp, int logme)\n" -"{\n" -" //dir is in worldspace, move to local space\n" -" \n" -" b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;\n" -" b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;\n" -" \n" -" b3Float4 dir = b3MakeFloat4((*_dir).x,(*_dir).y,(*_dir).z,0.f);\n" -" \n" -" const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),dir);\n" -" \n" -" //find local support vertex\n" -" int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;\n" -" \n" -" b3Assert(cpuCollidables[colIndex].m_shapeType==SHAPE_CONVEX_HULL);\n" -" __global const b3ConvexPolyhedronData_t* hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];\n" -" \n" -" b3Float4 pInA;\n" -" if (logme)\n" -" {\n" -" b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n" -" float maxDot = -B3_LARGE_FLOAT;\n" -" if( 0 < hull->m_numVertices )\n" -" {\n" -" const b3Float4 scaled = localDir;\n" -" int index = b3MaxDot(scaled, &cpuVertices[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n" -" pInA = 
cpuVertices[hull->m_vertexOffset+index];\n" -" \n" -" }\n" -" } else\n" -" {\n" -" pInA = b3LocalGetSupportVertex(localDir,hull,cpuVertices);\n" -" }\n" -" //move vertex to world space\n" -" *outp = b3TransformPoint(pInA,pos,orn);\n" -" \n" -"}\n" -"inline void b3MprSupport(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" -" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" -" b3ConstArray(b3Collidable_t) cpuCollidables,\n" -" b3ConstArray(b3Float4) cpuVertices,\n" -" __global b3Float4* sepAxis,\n" -" const b3Float4* _dir, b3MprSupport_t *supp)\n" -"{\n" -" b3Float4 dir;\n" -" dir = *_dir;\n" -" b3MprConvexSupport(pairIndex,bodyIndexA,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v1,0);\n" -" dir = *_dir*-1.f;\n" -" b3MprConvexSupport(pairIndex,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v2,0);\n" -" supp->v = supp->v1 - supp->v2;\n" -"}\n" -"inline void b3FindOrigin(int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center)\n" -"{\n" -" center->v1 = cpuBodyBuf[bodyIndexA].m_pos;\n" -" center->v2 = cpuBodyBuf[bodyIndexB].m_pos;\n" -" center->v = center->v1 - center->v2;\n" -"}\n" -"inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z)\n" -"{\n" -" (*v).x = x;\n" -" (*v).y = y;\n" -" (*v).z = z;\n" -" (*v).w = 0.f;\n" -"}\n" -"inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w)\n" -"{\n" -" (*v).x += (*w).x;\n" -" (*v).y += (*w).y;\n" -" (*v).z += (*w).z;\n" -"}\n" -"inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w)\n" -"{\n" -" *v = *w;\n" -"}\n" -"inline void b3MprVec3Scale(b3Float4 *d, float k)\n" -"{\n" -" *d *= k;\n" -"}\n" -"inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b)\n" -"{\n" -" float dot;\n" -" dot = b3Dot3F4(*a,*b);\n" -" return dot;\n" -"}\n" -"inline float b3MprVec3Len2(const b3Float4 *v)\n" -"{\n" -" return b3MprVec3Dot(v, v);\n" -"}\n" -"inline void 
b3MprVec3Normalize(b3Float4 *d)\n" -"{\n" -" float k = 1.f / B3_MPR_SQRT(b3MprVec3Len2(d));\n" -" b3MprVec3Scale(d, k);\n" -"}\n" -"inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b)\n" -"{\n" -" *d = b3Cross3(*a,*b);\n" -" \n" -"}\n" -"inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w)\n" -"{\n" -" *d = *v - *w;\n" -"}\n" -"inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir)\n" -"{\n" -" b3Float4 v2v1, v3v1;\n" -" b3MprVec3Sub2(&v2v1, &b3MprSimplexPoint(portal, 2)->v,\n" -" &b3MprSimplexPoint(portal, 1)->v);\n" -" b3MprVec3Sub2(&v3v1, &b3MprSimplexPoint(portal, 3)->v,\n" -" &b3MprSimplexPoint(portal, 1)->v);\n" -" b3MprVec3Cross(dir, &v2v1, &v3v1);\n" -" b3MprVec3Normalize(dir);\n" -"}\n" -"inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal,\n" -" const b3Float4 *dir)\n" -"{\n" -" float dot;\n" -" dot = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);\n" -" return b3MprIsZero(dot) || dot > 0.f;\n" -"}\n" -"inline int portalReachTolerance(const b3MprSimplex_t *portal,\n" -" const b3MprSupport_t *v4,\n" -" const b3Float4 *dir)\n" -"{\n" -" float dv1, dv2, dv3, dv4;\n" -" float dot1, dot2, dot3;\n" -" // find the smallest dot product of dir and {v1-v4, v2-v4, v3-v4}\n" -" dv1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);\n" -" dv2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);\n" -" dv3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);\n" -" dv4 = b3MprVec3Dot(&v4->v, dir);\n" -" dot1 = dv4 - dv1;\n" -" dot2 = dv4 - dv2;\n" -" dot3 = dv4 - dv3;\n" -" dot1 = B3_MPR_FMIN(dot1, dot2);\n" -" dot1 = B3_MPR_FMIN(dot1, dot3);\n" -" return b3MprEq(dot1, B3_MPR_TOLERANCE) || dot1 < B3_MPR_TOLERANCE;\n" -"}\n" -"inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal, \n" -" const b3MprSupport_t *v4,\n" -" const b3Float4 *dir)\n" -"{\n" -" float dot;\n" -" dot = b3MprVec3Dot(&v4->v, dir);\n" -" return b3MprIsZero(dot) || dot > 0.f;\n" -"}\n" -"inline void 
b3ExpandPortal(b3MprSimplex_t *portal,\n" -" const b3MprSupport_t *v4)\n" -"{\n" -" float dot;\n" -" b3Float4 v4v0;\n" -" b3MprVec3Cross(&v4v0, &v4->v, &b3MprSimplexPoint(portal, 0)->v);\n" -" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &v4v0);\n" -" if (dot > 0.f){\n" -" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &v4v0);\n" -" if (dot > 0.f){\n" -" b3MprSimplexSet(portal, 1, v4);\n" -" }else{\n" -" b3MprSimplexSet(portal, 3, v4);\n" -" }\n" -" }else{\n" -" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &v4v0);\n" -" if (dot > 0.f){\n" -" b3MprSimplexSet(portal, 2, v4);\n" -" }else{\n" -" b3MprSimplexSet(portal, 1, v4);\n" -" }\n" -" }\n" -"}\n" -"B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" -" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" -" b3ConstArray(b3Collidable_t) cpuCollidables,\n" -" b3ConstArray(b3Float4) cpuVertices,\n" -" __global b3Float4* sepAxis,\n" -" __global int* hasSepAxis,\n" -" b3MprSimplex_t *portal)\n" -"{\n" -" b3Float4 dir, va, vb;\n" -" float dot;\n" -" int cont;\n" -" \n" -" \n" -" // vertex 0 is center of portal\n" -" b3FindOrigin(bodyIndexA,bodyIndexB,cpuBodyBuf, b3MprSimplexPointW(portal, 0));\n" -" // vertex 0 is center of portal\n" -" b3MprSimplexSetSize(portal, 1);\n" -" \n" -" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" -" b3Float4* b3mpr_vec3_origin = &zero;\n" -" if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin)){\n" -" // Portal's center lies on origin (0,0,0) => we know that objects\n" -" // intersect but we would need to know penetration info.\n" -" // So move center little bit...\n" -" b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);\n" -" b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);\n" -" }\n" -" // vertex 1 = support in direction of origin\n" -" b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);\n" -" b3MprVec3Scale(&dir, -1.f);\n" -" b3MprVec3Normalize(&dir);\n" -" 
b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 1));\n" -" b3MprSimplexSetSize(portal, 2);\n" -" // test if origin isn't outside of v1\n" -" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);\n" -" \n" -" if (b3MprIsZero(dot) || dot < 0.f)\n" -" return -1;\n" -" // vertex 2\n" -" b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,\n" -" &b3MprSimplexPoint(portal, 1)->v);\n" -" if (b3MprIsZero(b3MprVec3Len2(&dir))){\n" -" if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin)){\n" -" // origin lies on v1\n" -" return 1;\n" -" }else{\n" -" // origin lies on v0-v1 segment\n" -" return 2;\n" -" }\n" -" }\n" -" b3MprVec3Normalize(&dir);\n" -" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 2));\n" -" \n" -" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);\n" -" if (b3MprIsZero(dot) || dot < 0.f)\n" -" return -1;\n" -" b3MprSimplexSetSize(portal, 3);\n" -" // vertex 3 direction\n" -" b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n" -" &b3MprSimplexPoint(portal, 0)->v);\n" -" b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n" -" &b3MprSimplexPoint(portal, 0)->v);\n" -" b3MprVec3Cross(&dir, &va, &vb);\n" -" b3MprVec3Normalize(&dir);\n" -" // it is better to form portal faces to be oriented \"outside\" origin\n" -" dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);\n" -" if (dot > 0.f){\n" -" b3MprSimplexSwap(portal, 1, 2);\n" -" b3MprVec3Scale(&dir, -1.f);\n" -" }\n" -" while (b3MprSimplexSize(portal) < 4){\n" -" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 3));\n" -" \n" -" dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);\n" -" if (b3MprIsZero(dot) || dot < 0.f)\n" -" return -1;\n" -" cont = 0;\n" -" // test if origin is outside (v1, 
v0, v3) - set v2 as v3 and\n" -" // continue\n" -" b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,\n" -" &b3MprSimplexPoint(portal, 3)->v);\n" -" dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n" -" if (dot < 0.f && !b3MprIsZero(dot)){\n" -" b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));\n" -" cont = 1;\n" -" }\n" -" if (!cont){\n" -" // test if origin is outside (v3, v0, v2) - set v1 as v3 and\n" -" // continue\n" -" b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,\n" -" &b3MprSimplexPoint(portal, 2)->v);\n" -" dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n" -" if (dot < 0.f && !b3MprIsZero(dot)){\n" -" b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));\n" -" cont = 1;\n" -" }\n" -" }\n" -" if (cont){\n" -" b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n" -" &b3MprSimplexPoint(portal, 0)->v);\n" -" b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n" -" &b3MprSimplexPoint(portal, 0)->v);\n" -" b3MprVec3Cross(&dir, &va, &vb);\n" -" b3MprVec3Normalize(&dir);\n" -" }else{\n" -" b3MprSimplexSetSize(portal, 4);\n" -" }\n" -" }\n" -" return 0;\n" -"}\n" -"B3_STATIC int b3RefinePortal(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" -" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" -" b3ConstArray(b3Collidable_t) cpuCollidables,\n" -" b3ConstArray(b3Float4) cpuVertices,\n" -" __global b3Float4* sepAxis,\n" -" b3MprSimplex_t *portal)\n" -"{\n" -" b3Float4 dir;\n" -" b3MprSupport_t v4;\n" -" for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n" -" //while (1)\n" -" {\n" -" // compute direction outside the portal (from v0 throught v1,v2,v3\n" -" // face)\n" -" b3PortalDir(portal, &dir);\n" -" // test if origin is inside the portal\n" -" if (portalEncapsulesOrigin(portal, &dir))\n" -" return 0;\n" -" // get next support point\n" -" \n" -" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n" -" // 
test if v4 can expand portal to contain origin and if portal\n" -" // expanding doesn't reach given tolerance\n" -" if (!portalCanEncapsuleOrigin(portal, &v4, &dir)\n" -" || portalReachTolerance(portal, &v4, &dir))\n" -" {\n" -" return -1;\n" -" }\n" -" // v1-v2-v3 triangle must be rearranged to face outside Minkowski\n" -" // difference (direction from v0).\n" -" b3ExpandPortal(portal, &v4);\n" -" }\n" -" return -1;\n" -"}\n" -"B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos)\n" -"{\n" -" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" -" b3Float4* b3mpr_vec3_origin = &zero;\n" -" b3Float4 dir;\n" -" size_t i;\n" -" float b[4], sum, inv;\n" -" b3Float4 vec, p1, p2;\n" -" b3PortalDir(portal, &dir);\n" -" // use barycentric coordinates of tetrahedron to find origin\n" -" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n" -" &b3MprSimplexPoint(portal, 2)->v);\n" -" b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n" -" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n" -" &b3MprSimplexPoint(portal, 2)->v);\n" -" b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n" -" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,\n" -" &b3MprSimplexPoint(portal, 1)->v);\n" -" b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n" -" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n" -" &b3MprSimplexPoint(portal, 1)->v);\n" -" b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n" -" sum = b[0] + b[1] + b[2] + b[3];\n" -" if (b3MprIsZero(sum) || sum < 0.f){\n" -" b[0] = 0.f;\n" -" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n" -" &b3MprSimplexPoint(portal, 3)->v);\n" -" b[1] = b3MprVec3Dot(&vec, &dir);\n" -" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n" -" &b3MprSimplexPoint(portal, 1)->v);\n" -" b[2] = b3MprVec3Dot(&vec, &dir);\n" -" b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n" -" &b3MprSimplexPoint(portal, 2)->v);\n" -" b[3] = b3MprVec3Dot(&vec, &dir);\n" -" sum = 
b[1] + b[2] + b[3];\n" -" }\n" -" inv = 1.f / sum;\n" -" b3MprVec3Copy(&p1, b3mpr_vec3_origin);\n" -" b3MprVec3Copy(&p2, b3mpr_vec3_origin);\n" -" for (i = 0; i < 4; i++){\n" -" b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);\n" -" b3MprVec3Scale(&vec, b[i]);\n" -" b3MprVec3Add(&p1, &vec);\n" -" b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);\n" -" b3MprVec3Scale(&vec, b[i]);\n" -" b3MprVec3Add(&p2, &vec);\n" -" }\n" -" b3MprVec3Scale(&p1, inv);\n" -" b3MprVec3Scale(&p2, inv);\n" -" b3MprVec3Copy(pos, &p1);\n" -" b3MprVec3Add(pos, &p2);\n" -" b3MprVec3Scale(pos, 0.5);\n" -"}\n" -"inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b)\n" -"{\n" -" b3Float4 ab;\n" -" b3MprVec3Sub2(&ab, a, b);\n" -" return b3MprVec3Len2(&ab);\n" -"}\n" -"inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P,\n" -" const b3Float4 *x0,\n" -" const b3Float4 *b,\n" -" b3Float4 *witness)\n" -"{\n" -" // The computation comes from solving equation of segment:\n" -" // S(t) = x0 + t.d\n" -" // where - x0 is initial point of segment\n" -" // - d is direction of segment from x0 (|d| > 0)\n" -" // - t belongs to <0, 1> interval\n" -" // \n" -" // Than, distance from a segment to some point P can be expressed:\n" -" // D(t) = |x0 + t.d - P|^2\n" -" // which is distance from any point on segment. 
Minimization\n" -" // of this function brings distance from P to segment.\n" -" // Minimization of D(t) leads to simple quadratic equation that's\n" -" // solving is straightforward.\n" -" //\n" -" // Bonus of this method is witness point for free.\n" -" float dist, t;\n" -" b3Float4 d, a;\n" -" // direction of segment\n" -" b3MprVec3Sub2(&d, b, x0);\n" -" // precompute vector from P to x0\n" -" b3MprVec3Sub2(&a, x0, P);\n" -" t = -1.f * b3MprVec3Dot(&a, &d);\n" -" t /= b3MprVec3Len2(&d);\n" -" if (t < 0.f || b3MprIsZero(t)){\n" -" dist = b3MprVec3Dist2(x0, P);\n" -" if (witness)\n" -" b3MprVec3Copy(witness, x0);\n" -" }else if (t > 1.f || b3MprEq(t, 1.f)){\n" -" dist = b3MprVec3Dist2(b, P);\n" -" if (witness)\n" -" b3MprVec3Copy(witness, b);\n" -" }else{\n" -" if (witness){\n" -" b3MprVec3Copy(witness, &d);\n" -" b3MprVec3Scale(witness, t);\n" -" b3MprVec3Add(witness, x0);\n" -" dist = b3MprVec3Dist2(witness, P);\n" -" }else{\n" -" // recycling variables\n" -" b3MprVec3Scale(&d, t);\n" -" b3MprVec3Add(&d, &a);\n" -" dist = b3MprVec3Len2(&d);\n" -" }\n" -" }\n" -" return dist;\n" -"}\n" -"inline float b3MprVec3PointTriDist2(const b3Float4 *P,\n" -" const b3Float4 *x0, const b3Float4 *B,\n" -" const b3Float4 *C,\n" -" b3Float4 *witness)\n" -"{\n" -" // Computation comes from analytic expression for triangle (x0, B, C)\n" -" // T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and\n" -" // Then equation for distance is:\n" -" // D(s, t) = | T(s, t) - P |^2\n" -" // This leads to minimization of quadratic function of two variables.\n" -" // The solution from is taken only if s is between 0 and 1, t is\n" -" // between 0 and 1 and t + s < 1, otherwise distance from segment is\n" -" // computed.\n" -" b3Float4 d1, d2, a;\n" -" float u, v, w, p, q, r;\n" -" float s, t, dist, dist2;\n" -" b3Float4 witness2;\n" -" b3MprVec3Sub2(&d1, B, x0);\n" -" b3MprVec3Sub2(&d2, C, x0);\n" -" b3MprVec3Sub2(&a, x0, P);\n" -" u = b3MprVec3Dot(&a, &a);\n" -" v = 
b3MprVec3Dot(&d1, &d1);\n" -" w = b3MprVec3Dot(&d2, &d2);\n" -" p = b3MprVec3Dot(&a, &d1);\n" -" q = b3MprVec3Dot(&a, &d2);\n" -" r = b3MprVec3Dot(&d1, &d2);\n" -" s = (q * r - w * p) / (w * v - r * r);\n" -" t = (-s * r - q) / w;\n" -" if ((b3MprIsZero(s) || s > 0.f)\n" -" && (b3MprEq(s, 1.f) || s < 1.f)\n" -" && (b3MprIsZero(t) || t > 0.f)\n" -" && (b3MprEq(t, 1.f) || t < 1.f)\n" -" && (b3MprEq(t + s, 1.f) || t + s < 1.f)){\n" -" if (witness){\n" -" b3MprVec3Scale(&d1, s);\n" -" b3MprVec3Scale(&d2, t);\n" -" b3MprVec3Copy(witness, x0);\n" -" b3MprVec3Add(witness, &d1);\n" -" b3MprVec3Add(witness, &d2);\n" -" dist = b3MprVec3Dist2(witness, P);\n" -" }else{\n" -" dist = s * s * v;\n" -" dist += t * t * w;\n" -" dist += 2.f * s * t * r;\n" -" dist += 2.f * s * p;\n" -" dist += 2.f * t * q;\n" -" dist += u;\n" -" }\n" -" }else{\n" -" dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness);\n" -" dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);\n" -" if (dist2 < dist){\n" -" dist = dist2;\n" -" if (witness)\n" -" b3MprVec3Copy(witness, &witness2);\n" -" }\n" -" dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);\n" -" if (dist2 < dist){\n" -" dist = dist2;\n" -" if (witness)\n" -" b3MprVec3Copy(witness, &witness2);\n" -" }\n" -" }\n" -" return dist;\n" -"}\n" -"B3_STATIC void b3FindPenetr(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" -" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" -" b3ConstArray(b3Collidable_t) cpuCollidables,\n" -" b3ConstArray(b3Float4) cpuVertices,\n" -" __global b3Float4* sepAxis,\n" -" b3MprSimplex_t *portal,\n" -" float *depth, b3Float4 *pdir, b3Float4 *pos)\n" -"{\n" -" b3Float4 dir;\n" -" b3MprSupport_t v4;\n" -" unsigned long iterations;\n" -" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" -" b3Float4* b3mpr_vec3_origin = &zero;\n" -" iterations = 1UL;\n" -" for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n" -" //while (1)\n" -" {\n" -" // compute portal direction and obtain next 
support point\n" -" b3PortalDir(portal, &dir);\n" -" \n" -" b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n" -" // reached tolerance -> find penetration info\n" -" if (portalReachTolerance(portal, &v4, &dir)\n" -" || iterations ==B3_MPR_MAX_ITERATIONS)\n" -" {\n" -" *depth = b3MprVec3PointTriDist2(b3mpr_vec3_origin,&b3MprSimplexPoint(portal, 1)->v,&b3MprSimplexPoint(portal, 2)->v,&b3MprSimplexPoint(portal, 3)->v,pdir);\n" -" *depth = B3_MPR_SQRT(*depth);\n" -" \n" -" if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))\n" -" {\n" -" \n" -" *pdir = dir;\n" -" } \n" -" b3MprVec3Normalize(pdir);\n" -" \n" -" // barycentric coordinates:\n" -" b3FindPos(portal, pos);\n" -" return;\n" -" }\n" -" b3ExpandPortal(portal, &v4);\n" -" iterations++;\n" -" }\n" -"}\n" -"B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal,float *depth, b3Float4 *dir, b3Float4 *pos)\n" -"{\n" -" // Touching contact on portal's v1 - so depth is zero and direction\n" -" // is unimportant and pos can be guessed\n" -" *depth = 0.f;\n" -" b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" -" b3Float4* b3mpr_vec3_origin = &zero;\n" -" b3MprVec3Copy(dir, b3mpr_vec3_origin);\n" -" b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n" -" b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n" -" b3MprVec3Scale(pos, 0.5);\n" -"}\n" -"B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal,\n" -" float *depth, b3Float4 *dir, b3Float4 *pos)\n" -"{\n" -" \n" -" // Origin lies on v0-v1 segment.\n" -" // Depth is distance to v1, direction also and position must be\n" -" // computed\n" -" b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n" -" b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n" -" b3MprVec3Scale(pos, 0.5f);\n" -" \n" -" b3MprVec3Copy(dir, &b3MprSimplexPoint(portal, 1)->v);\n" -" *depth = B3_MPR_SQRT(b3MprVec3Len2(dir));\n" -" b3MprVec3Normalize(dir);\n" -"}\n" -"inline int 
b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB,\n" -" b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,\n" -" b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" -" b3ConstArray(b3Collidable_t) cpuCollidables,\n" -" b3ConstArray(b3Float4) cpuVertices,\n" -" __global b3Float4* sepAxis,\n" -" __global int* hasSepAxis,\n" -" float *depthOut, b3Float4* dirOut, b3Float4* posOut)\n" -"{\n" -" \n" -" b3MprSimplex_t portal;\n" -" \n" -"// if (!hasSepAxis[pairIndex])\n" -" // return -1;\n" -" \n" -" hasSepAxis[pairIndex] = 0;\n" -" int res;\n" -" // Phase 1: Portal discovery\n" -" res = b3DiscoverPortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,hasSepAxis, &portal);\n" -" \n" -" \n" -" //sepAxis[pairIndex] = *pdir;//or -dir?\n" -" switch (res)\n" -" {\n" -" case 0:\n" -" {\n" -" // Phase 2: Portal refinement\n" -" \n" -" res = b3RefinePortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal);\n" -" if (res < 0)\n" -" return -1;\n" -" // Phase 3. 
Penetration info\n" -" b3FindPenetr(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal, depthOut, dirOut, posOut);\n" -" hasSepAxis[pairIndex] = 1;\n" -" sepAxis[pairIndex] = -*dirOut;\n" -" break;\n" -" }\n" -" case 1:\n" -" {\n" -" // Touching contact on portal's v1.\n" -" b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);\n" -" break;\n" -" }\n" -" case 2:\n" -" {\n" -" \n" -" b3FindPenetrSegment( &portal, depthOut, dirOut, posOut);\n" -" break;\n" -" }\n" -" default:\n" -" {\n" -" hasSepAxis[pairIndex]=0;\n" -" //if (res < 0)\n" -" //{\n" -" // Origin isn't inside portal - no collision.\n" -" return -1;\n" -" //}\n" -" }\n" -" };\n" -" \n" -" return 0;\n" -"};\n" -"#endif //B3_MPR_PENETRATION_H\n" -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return (int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" -"#ifdef cl_ext_atomic_counters_32\n" -" #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : 
enable\n" -"#else\n" -" #define counter32_t volatile __global int*\n" -"#endif\n" -"__kernel void mprPenetrationKernel( __global int4* pairs,\n" -" __global const b3RigidBodyData_t* rigidBodies, \n" -" __global const b3Collidable_t* collidables,\n" -" __global const b3ConvexPolyhedronData_t* convexShapes, \n" -" __global const float4* vertices,\n" -" __global float4* separatingNormals,\n" -" __global int* hasSeparatingAxis,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int contactCapacity,\n" -" int numPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" if (i<numPairs)\n" -" {\n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" \n" -" //once the broadphase avoids static-static pairs, we can remove this test\n" -" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" -" {\n" -" return;\n" -" }\n" -" \n" -" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" -" {\n" -" return;\n" -" }\n" -" float depthOut;\n" -" b3Float4 dirOut;\n" -" b3Float4 posOut;\n" -" int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);\n" -" \n" -" \n" -" \n" -" \n" -" if (res==0)\n" -" {\n" -" //add a contact\n" -" int dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" if (dstIdx<contactCapacity)\n" -" {\n" -" pairs[pairIndex].z = dstIdx;\n" -" __global struct b3Contact4Data* c = globalContactsOut + dstIdx;\n" -" c->m_worldNormalOnB = 
-dirOut;//normal;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" int bodyA = pairs[pairIndex].x;\n" -" int bodyB = pairs[pairIndex].y;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = -1;\n" -" //for (int i=0;i<nContacts;i++)\n" -" posOut.w = -depthOut;\n" -" c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];\n" -" GET_NPOINTS(*c) = 1;//nContacts;\n" -" }\n" -" }\n" -" }\n" -"}\n" -"typedef float4 Quaternion;\n" -"#define make_float4 (float4)\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" -"{\n" -" return qtRotate( *orientation, *p ) + (*translation);\n" -"}\n" -"__inline\n" -"float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"inline void project(__global const b3ConvexPolyhedronData_t* hull, const float4 pos, const float4 orn, \n" -"const float4* 
dir, __global const float4* vertices, float* min, float* max)\n" -"{\n" -" min[0] = FLT_MAX;\n" -" max[0] = -FLT_MAX;\n" -" int numVerts = hull->m_numVertices;\n" -" const float4 localDir = qtInvRotate(orn,*dir);\n" -" float offset = dot(pos,*dir);\n" -" for(int i=0;i<numVerts;i++)\n" -" {\n" -" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" -" if(dp < min[0]) \n" -" min[0] = dp;\n" -" if(dp > max[0]) \n" -" max[0] = dp;\n" -" }\n" -" if(min[0]>max[0])\n" -" {\n" -" float tmp = min[0];\n" -" min[0] = max[0];\n" -" max[0] = tmp;\n" -" }\n" -" min[0] += offset;\n" -" max[0] += offset;\n" -"}\n" -"bool findSeparatingAxisUnitSphere( __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" __global const float4* vertices,\n" -" __global const float4* unitSphereDirections,\n" -" int numUnitSphereDirections,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" \n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" int curEdgeEdge = 0;\n" -" // Test unit sphere directions\n" -" for (int i=0;i<numUnitSphereDirections;i++)\n" -" {\n" -" float4 crossje;\n" -" crossje = unitSphereDirections[i]; \n" -" if (dot3F4(DeltaC2,crossje)>0)\n" -" crossje *= -1.f;\n" -" {\n" -" float dist;\n" -" bool result = true;\n" -" float Min0,Max0;\n" -" float Min1,Max1;\n" -" project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" -" project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" -" \n" -" if(Max0<Min1 || Max1<Min0)\n" -" return false;\n" -" \n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" dist = d0<d1 ? 
d0:d1;\n" -" result = true;\n" -" \n" -" if(dist<*dmin)\n" -" {\n" -" *dmin = dist;\n" -" *sep = crossje;\n" -" }\n" -" }\n" -" }\n" -" \n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"__kernel void findSeparatingAxisUnitSphereKernel( __global const int4* pairs, \n" -" __global const b3RigidBodyData_t* rigidBodies, \n" -" __global const b3Collidable_t* collidables,\n" -" __global const b3ConvexPolyhedronData_t* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* unitSphereDirections,\n" -" __global float4* separatingNormals,\n" -" __global int* hasSeparatingAxis,\n" -" __global float* dmins,\n" -" int numUnitSphereDirections,\n" -" int numPairs\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" \n" -" if (i<numPairs)\n" -" {\n" -" if (hasSeparatingAxis[i])\n" -" {\n" -" \n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" \n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" \n" -" float dmin = dmins[i];\n" -" \n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" float4 sepNormal = separatingNormals[i];\n" -" \n" -" int numEdgeEdgeDirections = 
convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n" -" if (numEdgeEdgeDirections>numUnitSphereDirections)\n" -" {\n" -" bool sepEE = findSeparatingAxisUnitSphere( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);\n" -" if (!sepEE)\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" } else\n" -" {\n" -" hasSeparatingAxis[i] = 1;\n" -" separatingNormals[i] = sepNormal;\n" -" }\n" -" }\n" -" } //if (hasSeparatingAxis[i])\n" -" }//(i<numPairs)\n" -"}\n" -; +static const char* mprKernelsCL = + "/***\n" + " * ---------------------------------\n" + " * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>\n" + " *\n" + " * This file was ported from mpr.c file, part of libccd.\n" + " * The Minkoski Portal Refinement implementation was ported \n" + " * to OpenCL by Erwin Coumans for the Bullet 3 Physics library.\n" + " * at http://github.com/erwincoumans/bullet3\n" + " *\n" + " * Distributed under the OSI-approved BSD License (the \"License\");\n" + " * see <http://www.opensource.org/licenses/bsd-license.php>.\n" + " * This software is distributed WITHOUT ANY WARRANTY; without even the\n" + " * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n" + " * See the License for more information.\n" + " */\n" + "#ifndef B3_MPR_PENETRATION_H\n" + "#define B3_MPR_PENETRATION_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define 
b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_RIGIDBODY_DATA_H\n" + "#define B3_RIGIDBODY_DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#define B3_QUAT_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" 
+ "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Quat;\n" + " #define b3QuatConstArg const b3Quat\n" + " \n" + " \n" + "inline float4 b3FastNormalize4(float4 v)\n" + "{\n" + " v = (float4)(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + " \n" + "inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" + "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" + "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" + "{\n" + " b3Quat ans;\n" + " ans = b3Cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - b3Dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" + "{\n" + " b3Quat q;\n" + " q=in;\n" + " //return b3FastNormalize4(in);\n" + " float len = native_sqrt(dot(q, q));\n" + " if(len > 0.f)\n" + " {\n" + " q *= 1.f / len;\n" + " }\n" + " else\n" + " {\n" + " q.x = q.y = q.z = 0.f;\n" + " q.w = 1.f;\n" + " }\n" + " return q;\n" + "}\n" + "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " b3Quat qInv = b3QuatInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " return b3QuatRotate( b3QuatInvert( q ), vec );\n" + "}\n" + "inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" + "{\n" + " return b3QuatRotate( 
orientation, point ) + (translation);\n" + "}\n" + " \n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifndef B3_MAT3x3_H\n" + "#define B3_MAT3x3_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "typedef struct\n" + "{\n" + " b3Float4 m_row[3];\n" + "}b3Mat3x3;\n" + "#define b3Mat3x3ConstArg const b3Mat3x3\n" + "#define b3GetRow(m,row) (m.m_row[row])\n" + "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" + "{\n" + " b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" + " b3Mat3x3 out;\n" + " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" + " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" + " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" + " out.m_row[0].w = 0.f;\n" + " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" + " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" + " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" + " out.m_row[1].w = 0.f;\n" + " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" + " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" + " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" + " out.m_row[2].w = 0.f;\n" + " return out;\n" + "}\n" + "inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = fabs(matIn.m_row[0]);\n" + " out.m_row[1] = fabs(matIn.m_row[1]);\n" + " out.m_row[2] = fabs(matIn.m_row[2]);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtZero();\n" + "__inline\n" + "b3Mat3x3 mtIdentity();\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Mat3x3 mtZero()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(0.f);\n" + " m.m_row[1] = (b3Float4)(0.f);\n" + " m.m_row[2] = (b3Float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + 
"b3Mat3x3 mtIdentity()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(1,0,0,0);\n" + " m.m_row[1] = (b3Float4)(0,1,0,0);\n" + " m.m_row[2] = (b3Float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" + "{\n" + " b3Mat3x3 transB;\n" + " transB = mtTranspose( b );\n" + " b3Mat3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" + "{\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a.m_row[0], b );\n" + " ans.y = b3Dot3F4( a.m_row[1], b );\n" + " ans.z = b3Dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" + "{\n" + " b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a, colx );\n" + " ans.y = b3Dot3F4( a, coly );\n" + " ans.z = b3Dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3RigidBodyData b3RigidBodyData_t;\n" + "struct 
b3RigidBodyData\n" + "{\n" + " b3Float4 m_pos;\n" + " b3Quat m_quat;\n" + " b3Float4 m_linVel;\n" + " b3Float4 m_angVel;\n" + " int m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "};\n" + "typedef struct b3InertiaData b3InertiaData_t;\n" + "struct b3InertiaData\n" + "{\n" + " b3Mat3x3 m_invInertiaWorld;\n" + " b3Mat3x3 m_initInvInertia;\n" + "};\n" + "#endif //B3_RIGIDBODY_DATA_H\n" + " \n" + "#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n" + "#define B3_CONVEX_POLYHEDRON_DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "typedef struct b3GpuFace b3GpuFace_t;\n" + "struct b3GpuFace\n" + "{\n" + " b3Float4 m_plane;\n" + " int m_indexOffset;\n" + " int m_numIndices;\n" + " int m_unusedPadding1;\n" + " int m_unusedPadding2;\n" + "};\n" + "typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n" + "struct b3ConvexPolyhedronData\n" + "{\n" + " b3Float4 m_localCenter;\n" + " b3Float4 m_extents;\n" + " b3Float4 mC;\n" + " b3Float4 mE;\n" + " float m_radius;\n" + " int m_faceOffset;\n" + " int m_numFaces;\n" + " int m_numVertices;\n" + " int m_vertexOffset;\n" + " int m_uniqueEdgesOffset;\n" + " int m_numUniqueEdges;\n" + " int m_unused;\n" + "};\n" + "#endif //B3_CONVEX_POLYHEDRON_DATA_H\n" + "#ifndef B3_COLLIDABLE_H\n" + "#define B3_COLLIDABLE_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "enum b3ShapeTypes\n" + "{\n" + " SHAPE_HEIGHT_FIELD=1,\n" + " SHAPE_CONVEX_HULL=3,\n" + " SHAPE_PLANE=4,\n" + " SHAPE_CONCAVE_TRIMESH=5,\n" + " SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" + " SHAPE_SPHERE=7,\n" + " MAX_NUM_SHAPE_TYPES,\n" + "};\n" + "typedef struct b3Collidable b3Collidable_t;\n" + 
"struct b3Collidable\n" + "{\n" + " union {\n" + " int m_numChildShapes;\n" + " int m_bvhIndex;\n" + " };\n" + " union\n" + " {\n" + " float m_radius;\n" + " int m_compoundBvhIndex;\n" + " };\n" + " int m_shapeType;\n" + " int m_shapeIndex;\n" + "};\n" + "typedef struct b3GpuChildShape b3GpuChildShape_t;\n" + "struct b3GpuChildShape\n" + "{\n" + " b3Float4 m_childPosition;\n" + " b3Quat m_childOrientation;\n" + " int m_shapeIndex;\n" + " int m_unused0;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "struct b3CompoundOverlappingPair\n" + "{\n" + " int m_bodyIndexA;\n" + " int m_bodyIndexB;\n" + "// int m_pairType;\n" + " int m_childShapeIndexA;\n" + " int m_childShapeIndexB;\n" + "};\n" + "#endif //B3_COLLIDABLE_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#define B3_MPR_SQRT sqrt\n" + "#endif\n" + "#define B3_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))\n" + "#define B3_MPR_FABS fabs\n" + "#define B3_MPR_TOLERANCE 1E-6f\n" + "#define B3_MPR_MAX_ITERATIONS 1000\n" + "struct _b3MprSupport_t \n" + "{\n" + " b3Float4 v; //!< Support point in minkowski sum\n" + " b3Float4 v1; //!< Support point in obj1\n" + " b3Float4 v2; //!< Support point in obj2\n" + "};\n" + "typedef struct _b3MprSupport_t b3MprSupport_t;\n" + "struct _b3MprSimplex_t \n" + "{\n" + " b3MprSupport_t ps[4];\n" + " int last; //!< index of last added point\n" + "};\n" + "typedef struct _b3MprSimplex_t b3MprSimplex_t;\n" + "inline b3MprSupport_t* b3MprSimplexPointW(b3MprSimplex_t *s, int idx)\n" + "{\n" + " return &s->ps[idx];\n" + "}\n" + "inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size)\n" + "{\n" + " s->last = size - 1;\n" + "}\n" + "inline int b3MprSimplexSize(const b3MprSimplex_t *s)\n" + "{\n" + " return s->last + 1;\n" + "}\n" + "inline const b3MprSupport_t* b3MprSimplexPoint(const b3MprSimplex_t* s, int idx)\n" + "{\n" + " // here is no check on boundaries\n" + " return &s->ps[idx];\n" + "}\n" + "inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s)\n" + "{\n" + 
" *d = *s;\n" + "}\n" + "inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a)\n" + "{\n" + " b3MprSupportCopy(s->ps + pos, a);\n" + "}\n" + "inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2)\n" + "{\n" + " b3MprSupport_t supp;\n" + " b3MprSupportCopy(&supp, &s->ps[pos1]);\n" + " b3MprSupportCopy(&s->ps[pos1], &s->ps[pos2]);\n" + " b3MprSupportCopy(&s->ps[pos2], &supp);\n" + "}\n" + "inline int b3MprIsZero(float val)\n" + "{\n" + " return B3_MPR_FABS(val) < FLT_EPSILON;\n" + "}\n" + "inline int b3MprEq(float _a, float _b)\n" + "{\n" + " float ab;\n" + " float a, b;\n" + " ab = B3_MPR_FABS(_a - _b);\n" + " if (B3_MPR_FABS(ab) < FLT_EPSILON)\n" + " return 1;\n" + " a = B3_MPR_FABS(_a);\n" + " b = B3_MPR_FABS(_b);\n" + " if (b > a){\n" + " return ab < FLT_EPSILON * b;\n" + " }else{\n" + " return ab < FLT_EPSILON * a;\n" + " }\n" + "}\n" + "inline int b3MprVec3Eq(const b3Float4* a, const b3Float4 *b)\n" + "{\n" + " return b3MprEq((*a).x, (*b).x)\n" + " && b3MprEq((*a).y, (*b).y)\n" + " && b3MprEq((*a).z, (*b).z);\n" + "}\n" + "inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec,__global const b3ConvexPolyhedronData_t* hull, b3ConstArray(b3Float4) verticesA)\n" + "{\n" + " b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n" + " float maxDot = -B3_LARGE_FLOAT;\n" + " if( 0 < hull->m_numVertices )\n" + " {\n" + " const b3Float4 scaled = supportVec;\n" + " int index = b3MaxDot(scaled, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n" + " return verticesA[hull->m_vertexOffset+index];\n" + " }\n" + " return supVec;\n" + "}\n" + "B3_STATIC void b3MprConvexSupport(int pairIndex,int bodyIndex, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" + " b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" + " b3ConstArray(b3Collidable_t) cpuCollidables,\n" + " b3ConstArray(b3Float4) cpuVertices,\n" + " __global b3Float4* sepAxis,\n" + " const b3Float4* _dir, b3Float4* outp, int logme)\n" + "{\n" + " 
//dir is in worldspace, move to local space\n" + " \n" + " b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;\n" + " b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;\n" + " \n" + " b3Float4 dir = b3MakeFloat4((*_dir).x,(*_dir).y,(*_dir).z,0.f);\n" + " \n" + " const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),dir);\n" + " \n" + " //find local support vertex\n" + " int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;\n" + " \n" + " b3Assert(cpuCollidables[colIndex].m_shapeType==SHAPE_CONVEX_HULL);\n" + " __global const b3ConvexPolyhedronData_t* hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];\n" + " \n" + " b3Float4 pInA;\n" + " if (logme)\n" + " {\n" + " b3Float4 supVec = b3MakeFloat4(0,0,0,0);\n" + " float maxDot = -B3_LARGE_FLOAT;\n" + " if( 0 < hull->m_numVertices )\n" + " {\n" + " const b3Float4 scaled = localDir;\n" + " int index = b3MaxDot(scaled, &cpuVertices[hull->m_vertexOffset], hull->m_numVertices, &maxDot);\n" + " pInA = cpuVertices[hull->m_vertexOffset+index];\n" + " \n" + " }\n" + " } else\n" + " {\n" + " pInA = b3LocalGetSupportVertex(localDir,hull,cpuVertices);\n" + " }\n" + " //move vertex to world space\n" + " *outp = b3TransformPoint(pInA,pos,orn);\n" + " \n" + "}\n" + "inline void b3MprSupport(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" + " b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" + " b3ConstArray(b3Collidable_t) cpuCollidables,\n" + " b3ConstArray(b3Float4) cpuVertices,\n" + " __global b3Float4* sepAxis,\n" + " const b3Float4* _dir, b3MprSupport_t *supp)\n" + "{\n" + " b3Float4 dir;\n" + " dir = *_dir;\n" + " b3MprConvexSupport(pairIndex,bodyIndexA,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v1,0);\n" + " dir = *_dir*-1.f;\n" + " b3MprConvexSupport(pairIndex,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v2,0);\n" + " supp->v = supp->v1 - supp->v2;\n" + "}\n" + "inline void b3FindOrigin(int bodyIndexA, int 
bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center)\n" + "{\n" + " center->v1 = cpuBodyBuf[bodyIndexA].m_pos;\n" + " center->v2 = cpuBodyBuf[bodyIndexB].m_pos;\n" + " center->v = center->v1 - center->v2;\n" + "}\n" + "inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z)\n" + "{\n" + " (*v).x = x;\n" + " (*v).y = y;\n" + " (*v).z = z;\n" + " (*v).w = 0.f;\n" + "}\n" + "inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w)\n" + "{\n" + " (*v).x += (*w).x;\n" + " (*v).y += (*w).y;\n" + " (*v).z += (*w).z;\n" + "}\n" + "inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w)\n" + "{\n" + " *v = *w;\n" + "}\n" + "inline void b3MprVec3Scale(b3Float4 *d, float k)\n" + "{\n" + " *d *= k;\n" + "}\n" + "inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b)\n" + "{\n" + " float dot;\n" + " dot = b3Dot3F4(*a,*b);\n" + " return dot;\n" + "}\n" + "inline float b3MprVec3Len2(const b3Float4 *v)\n" + "{\n" + " return b3MprVec3Dot(v, v);\n" + "}\n" + "inline void b3MprVec3Normalize(b3Float4 *d)\n" + "{\n" + " float k = 1.f / B3_MPR_SQRT(b3MprVec3Len2(d));\n" + " b3MprVec3Scale(d, k);\n" + "}\n" + "inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b)\n" + "{\n" + " *d = b3Cross3(*a,*b);\n" + " \n" + "}\n" + "inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w)\n" + "{\n" + " *d = *v - *w;\n" + "}\n" + "inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir)\n" + "{\n" + " b3Float4 v2v1, v3v1;\n" + " b3MprVec3Sub2(&v2v1, &b3MprSimplexPoint(portal, 2)->v,\n" + " &b3MprSimplexPoint(portal, 1)->v);\n" + " b3MprVec3Sub2(&v3v1, &b3MprSimplexPoint(portal, 3)->v,\n" + " &b3MprSimplexPoint(portal, 1)->v);\n" + " b3MprVec3Cross(dir, &v2v1, &v3v1);\n" + " b3MprVec3Normalize(dir);\n" + "}\n" + "inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal,\n" + " const b3Float4 *dir)\n" + "{\n" + " float dot;\n" + " dot = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);\n" + 
" return b3MprIsZero(dot) || dot > 0.f;\n" + "}\n" + "inline int portalReachTolerance(const b3MprSimplex_t *portal,\n" + " const b3MprSupport_t *v4,\n" + " const b3Float4 *dir)\n" + "{\n" + " float dv1, dv2, dv3, dv4;\n" + " float dot1, dot2, dot3;\n" + " // find the smallest dot product of dir and {v1-v4, v2-v4, v3-v4}\n" + " dv1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);\n" + " dv2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);\n" + " dv3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);\n" + " dv4 = b3MprVec3Dot(&v4->v, dir);\n" + " dot1 = dv4 - dv1;\n" + " dot2 = dv4 - dv2;\n" + " dot3 = dv4 - dv3;\n" + " dot1 = B3_MPR_FMIN(dot1, dot2);\n" + " dot1 = B3_MPR_FMIN(dot1, dot3);\n" + " return b3MprEq(dot1, B3_MPR_TOLERANCE) || dot1 < B3_MPR_TOLERANCE;\n" + "}\n" + "inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal, \n" + " const b3MprSupport_t *v4,\n" + " const b3Float4 *dir)\n" + "{\n" + " float dot;\n" + " dot = b3MprVec3Dot(&v4->v, dir);\n" + " return b3MprIsZero(dot) || dot > 0.f;\n" + "}\n" + "inline void b3ExpandPortal(b3MprSimplex_t *portal,\n" + " const b3MprSupport_t *v4)\n" + "{\n" + " float dot;\n" + " b3Float4 v4v0;\n" + " b3MprVec3Cross(&v4v0, &v4->v, &b3MprSimplexPoint(portal, 0)->v);\n" + " dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &v4v0);\n" + " if (dot > 0.f){\n" + " dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &v4v0);\n" + " if (dot > 0.f){\n" + " b3MprSimplexSet(portal, 1, v4);\n" + " }else{\n" + " b3MprSimplexSet(portal, 3, v4);\n" + " }\n" + " }else{\n" + " dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &v4v0);\n" + " if (dot > 0.f){\n" + " b3MprSimplexSet(portal, 2, v4);\n" + " }else{\n" + " b3MprSimplexSet(portal, 1, v4);\n" + " }\n" + " }\n" + "}\n" + "B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" + " b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" + " b3ConstArray(b3Collidable_t) 
cpuCollidables,\n" + " b3ConstArray(b3Float4) cpuVertices,\n" + " __global b3Float4* sepAxis,\n" + " __global int* hasSepAxis,\n" + " b3MprSimplex_t *portal)\n" + "{\n" + " b3Float4 dir, va, vb;\n" + " float dot;\n" + " int cont;\n" + " \n" + " \n" + " // vertex 0 is center of portal\n" + " b3FindOrigin(bodyIndexA,bodyIndexB,cpuBodyBuf, b3MprSimplexPointW(portal, 0));\n" + " // vertex 0 is center of portal\n" + " b3MprSimplexSetSize(portal, 1);\n" + " \n" + " b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" + " b3Float4* b3mpr_vec3_origin = &zero;\n" + " if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin)){\n" + " // Portal's center lies on origin (0,0,0) => we know that objects\n" + " // intersect but we would need to know penetration info.\n" + " // So move center little bit...\n" + " b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);\n" + " b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);\n" + " }\n" + " // vertex 1 = support in direction of origin\n" + " b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);\n" + " b3MprVec3Scale(&dir, -1.f);\n" + " b3MprVec3Normalize(&dir);\n" + " b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 1));\n" + " b3MprSimplexSetSize(portal, 2);\n" + " // test if origin isn't outside of v1\n" + " dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);\n" + " \n" + " if (b3MprIsZero(dot) || dot < 0.f)\n" + " return -1;\n" + " // vertex 2\n" + " b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,\n" + " &b3MprSimplexPoint(portal, 1)->v);\n" + " if (b3MprIsZero(b3MprVec3Len2(&dir))){\n" + " if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin)){\n" + " // origin lies on v1\n" + " return 1;\n" + " }else{\n" + " // origin lies on v0-v1 segment\n" + " return 2;\n" + " }\n" + " }\n" + " b3MprVec3Normalize(&dir);\n" + " b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, 
sepAxis,&dir, b3MprSimplexPointW(portal, 2));\n" + " \n" + " dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);\n" + " if (b3MprIsZero(dot) || dot < 0.f)\n" + " return -1;\n" + " b3MprSimplexSetSize(portal, 3);\n" + " // vertex 3 direction\n" + " b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,\n" + " &b3MprSimplexPoint(portal, 0)->v);\n" + " b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n" + " &b3MprSimplexPoint(portal, 0)->v);\n" + " b3MprVec3Cross(&dir, &va, &vb);\n" + " b3MprVec3Normalize(&dir);\n" + " // it is better to form portal faces to be oriented \"outside\" origin\n" + " dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);\n" + " if (dot > 0.f){\n" + " b3MprSimplexSwap(portal, 1, 2);\n" + " b3MprVec3Scale(&dir, -1.f);\n" + " }\n" + " while (b3MprSimplexSize(portal) < 4){\n" + " b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 3));\n" + " \n" + " dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);\n" + " if (b3MprIsZero(dot) || dot < 0.f)\n" + " return -1;\n" + " cont = 0;\n" + " // test if origin is outside (v1, v0, v3) - set v2 as v3 and\n" + " // continue\n" + " b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,\n" + " &b3MprSimplexPoint(portal, 3)->v);\n" + " dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n" + " if (dot < 0.f && !b3MprIsZero(dot)){\n" + " b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));\n" + " cont = 1;\n" + " }\n" + " if (!cont){\n" + " // test if origin is outside (v3, v0, v2) - set v1 as v3 and\n" + " // continue\n" + " b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,\n" + " &b3MprSimplexPoint(portal, 2)->v);\n" + " dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);\n" + " if (dot < 0.f && !b3MprIsZero(dot)){\n" + " b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));\n" + " cont = 1;\n" + " }\n" + " }\n" + " if (cont){\n" + " b3MprVec3Sub2(&va, 
&b3MprSimplexPoint(portal, 1)->v,\n" + " &b3MprSimplexPoint(portal, 0)->v);\n" + " b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,\n" + " &b3MprSimplexPoint(portal, 0)->v);\n" + " b3MprVec3Cross(&dir, &va, &vb);\n" + " b3MprVec3Normalize(&dir);\n" + " }else{\n" + " b3MprSimplexSetSize(portal, 4);\n" + " }\n" + " }\n" + " return 0;\n" + "}\n" + "B3_STATIC int b3RefinePortal(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" + " b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" + " b3ConstArray(b3Collidable_t) cpuCollidables,\n" + " b3ConstArray(b3Float4) cpuVertices,\n" + " __global b3Float4* sepAxis,\n" + " b3MprSimplex_t *portal)\n" + "{\n" + " b3Float4 dir;\n" + " b3MprSupport_t v4;\n" + " for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n" + " //while (1)\n" + " {\n" + " // compute direction outside the portal (from v0 throught v1,v2,v3\n" + " // face)\n" + " b3PortalDir(portal, &dir);\n" + " // test if origin is inside the portal\n" + " if (portalEncapsulesOrigin(portal, &dir))\n" + " return 0;\n" + " // get next support point\n" + " \n" + " b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n" + " // test if v4 can expand portal to contain origin and if portal\n" + " // expanding doesn't reach given tolerance\n" + " if (!portalCanEncapsuleOrigin(portal, &v4, &dir)\n" + " || portalReachTolerance(portal, &v4, &dir))\n" + " {\n" + " return -1;\n" + " }\n" + " // v1-v2-v3 triangle must be rearranged to face outside Minkowski\n" + " // difference (direction from v0).\n" + " b3ExpandPortal(portal, &v4);\n" + " }\n" + " return -1;\n" + "}\n" + "B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos)\n" + "{\n" + " b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" + " b3Float4* b3mpr_vec3_origin = &zero;\n" + " b3Float4 dir;\n" + " size_t i;\n" + " float b[4], sum, inv;\n" + " b3Float4 vec, p1, p2;\n" + " b3PortalDir(portal, &dir);\n" + " // use 
barycentric coordinates of tetrahedron to find origin\n" + " b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n" + " &b3MprSimplexPoint(portal, 2)->v);\n" + " b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n" + " b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n" + " &b3MprSimplexPoint(portal, 2)->v);\n" + " b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n" + " b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,\n" + " &b3MprSimplexPoint(portal, 1)->v);\n" + " b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);\n" + " b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n" + " &b3MprSimplexPoint(portal, 1)->v);\n" + " b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);\n" + " sum = b[0] + b[1] + b[2] + b[3];\n" + " if (b3MprIsZero(sum) || sum < 0.f){\n" + " b[0] = 0.f;\n" + " b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,\n" + " &b3MprSimplexPoint(portal, 3)->v);\n" + " b[1] = b3MprVec3Dot(&vec, &dir);\n" + " b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,\n" + " &b3MprSimplexPoint(portal, 1)->v);\n" + " b[2] = b3MprVec3Dot(&vec, &dir);\n" + " b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,\n" + " &b3MprSimplexPoint(portal, 2)->v);\n" + " b[3] = b3MprVec3Dot(&vec, &dir);\n" + " sum = b[1] + b[2] + b[3];\n" + " }\n" + " inv = 1.f / sum;\n" + " b3MprVec3Copy(&p1, b3mpr_vec3_origin);\n" + " b3MprVec3Copy(&p2, b3mpr_vec3_origin);\n" + " for (i = 0; i < 4; i++){\n" + " b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);\n" + " b3MprVec3Scale(&vec, b[i]);\n" + " b3MprVec3Add(&p1, &vec);\n" + " b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);\n" + " b3MprVec3Scale(&vec, b[i]);\n" + " b3MprVec3Add(&p2, &vec);\n" + " }\n" + " b3MprVec3Scale(&p1, inv);\n" + " b3MprVec3Scale(&p2, inv);\n" + " b3MprVec3Copy(pos, &p1);\n" + " b3MprVec3Add(pos, &p2);\n" + " b3MprVec3Scale(pos, 0.5);\n" + "}\n" + "inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b)\n" + "{\n" + " 
b3Float4 ab;\n" + " b3MprVec3Sub2(&ab, a, b);\n" + " return b3MprVec3Len2(&ab);\n" + "}\n" + "inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P,\n" + " const b3Float4 *x0,\n" + " const b3Float4 *b,\n" + " b3Float4 *witness)\n" + "{\n" + " // The computation comes from solving equation of segment:\n" + " // S(t) = x0 + t.d\n" + " // where - x0 is initial point of segment\n" + " // - d is direction of segment from x0 (|d| > 0)\n" + " // - t belongs to <0, 1> interval\n" + " // \n" + " // Than, distance from a segment to some point P can be expressed:\n" + " // D(t) = |x0 + t.d - P|^2\n" + " // which is distance from any point on segment. Minimization\n" + " // of this function brings distance from P to segment.\n" + " // Minimization of D(t) leads to simple quadratic equation that's\n" + " // solving is straightforward.\n" + " //\n" + " // Bonus of this method is witness point for free.\n" + " float dist, t;\n" + " b3Float4 d, a;\n" + " // direction of segment\n" + " b3MprVec3Sub2(&d, b, x0);\n" + " // precompute vector from P to x0\n" + " b3MprVec3Sub2(&a, x0, P);\n" + " t = -1.f * b3MprVec3Dot(&a, &d);\n" + " t /= b3MprVec3Len2(&d);\n" + " if (t < 0.f || b3MprIsZero(t)){\n" + " dist = b3MprVec3Dist2(x0, P);\n" + " if (witness)\n" + " b3MprVec3Copy(witness, x0);\n" + " }else if (t > 1.f || b3MprEq(t, 1.f)){\n" + " dist = b3MprVec3Dist2(b, P);\n" + " if (witness)\n" + " b3MprVec3Copy(witness, b);\n" + " }else{\n" + " if (witness){\n" + " b3MprVec3Copy(witness, &d);\n" + " b3MprVec3Scale(witness, t);\n" + " b3MprVec3Add(witness, x0);\n" + " dist = b3MprVec3Dist2(witness, P);\n" + " }else{\n" + " // recycling variables\n" + " b3MprVec3Scale(&d, t);\n" + " b3MprVec3Add(&d, &a);\n" + " dist = b3MprVec3Len2(&d);\n" + " }\n" + " }\n" + " return dist;\n" + "}\n" + "inline float b3MprVec3PointTriDist2(const b3Float4 *P,\n" + " const b3Float4 *x0, const b3Float4 *B,\n" + " const b3Float4 *C,\n" + " b3Float4 *witness)\n" + "{\n" + " // Computation comes from analytic 
expression for triangle (x0, B, C)\n" + " // T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and\n" + " // Then equation for distance is:\n" + " // D(s, t) = | T(s, t) - P |^2\n" + " // This leads to minimization of quadratic function of two variables.\n" + " // The solution from is taken only if s is between 0 and 1, t is\n" + " // between 0 and 1 and t + s < 1, otherwise distance from segment is\n" + " // computed.\n" + " b3Float4 d1, d2, a;\n" + " float u, v, w, p, q, r;\n" + " float s, t, dist, dist2;\n" + " b3Float4 witness2;\n" + " b3MprVec3Sub2(&d1, B, x0);\n" + " b3MprVec3Sub2(&d2, C, x0);\n" + " b3MprVec3Sub2(&a, x0, P);\n" + " u = b3MprVec3Dot(&a, &a);\n" + " v = b3MprVec3Dot(&d1, &d1);\n" + " w = b3MprVec3Dot(&d2, &d2);\n" + " p = b3MprVec3Dot(&a, &d1);\n" + " q = b3MprVec3Dot(&a, &d2);\n" + " r = b3MprVec3Dot(&d1, &d2);\n" + " s = (q * r - w * p) / (w * v - r * r);\n" + " t = (-s * r - q) / w;\n" + " if ((b3MprIsZero(s) || s > 0.f)\n" + " && (b3MprEq(s, 1.f) || s < 1.f)\n" + " && (b3MprIsZero(t) || t > 0.f)\n" + " && (b3MprEq(t, 1.f) || t < 1.f)\n" + " && (b3MprEq(t + s, 1.f) || t + s < 1.f)){\n" + " if (witness){\n" + " b3MprVec3Scale(&d1, s);\n" + " b3MprVec3Scale(&d2, t);\n" + " b3MprVec3Copy(witness, x0);\n" + " b3MprVec3Add(witness, &d1);\n" + " b3MprVec3Add(witness, &d2);\n" + " dist = b3MprVec3Dist2(witness, P);\n" + " }else{\n" + " dist = s * s * v;\n" + " dist += t * t * w;\n" + " dist += 2.f * s * t * r;\n" + " dist += 2.f * s * p;\n" + " dist += 2.f * t * q;\n" + " dist += u;\n" + " }\n" + " }else{\n" + " dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness);\n" + " dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);\n" + " if (dist2 < dist){\n" + " dist = dist2;\n" + " if (witness)\n" + " b3MprVec3Copy(witness, &witness2);\n" + " }\n" + " dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);\n" + " if (dist2 < dist){\n" + " dist = dist2;\n" + " if (witness)\n" + " b3MprVec3Copy(witness, &witness2);\n" + " }\n" + " }\n" 
+ " return dist;\n" + "}\n" + "B3_STATIC void b3FindPenetr(int pairIndex,int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, \n" + " b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" + " b3ConstArray(b3Collidable_t) cpuCollidables,\n" + " b3ConstArray(b3Float4) cpuVertices,\n" + " __global b3Float4* sepAxis,\n" + " b3MprSimplex_t *portal,\n" + " float *depth, b3Float4 *pdir, b3Float4 *pos)\n" + "{\n" + " b3Float4 dir;\n" + " b3MprSupport_t v4;\n" + " unsigned long iterations;\n" + " b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" + " b3Float4* b3mpr_vec3_origin = &zero;\n" + " iterations = 1UL;\n" + " for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)\n" + " //while (1)\n" + " {\n" + " // compute portal direction and obtain next support point\n" + " b3PortalDir(portal, &dir);\n" + " \n" + " b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);\n" + " // reached tolerance -> find penetration info\n" + " if (portalReachTolerance(portal, &v4, &dir)\n" + " || iterations ==B3_MPR_MAX_ITERATIONS)\n" + " {\n" + " *depth = b3MprVec3PointTriDist2(b3mpr_vec3_origin,&b3MprSimplexPoint(portal, 1)->v,&b3MprSimplexPoint(portal, 2)->v,&b3MprSimplexPoint(portal, 3)->v,pdir);\n" + " *depth = B3_MPR_SQRT(*depth);\n" + " \n" + " if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))\n" + " {\n" + " \n" + " *pdir = dir;\n" + " } \n" + " b3MprVec3Normalize(pdir);\n" + " \n" + " // barycentric coordinates:\n" + " b3FindPos(portal, pos);\n" + " return;\n" + " }\n" + " b3ExpandPortal(portal, &v4);\n" + " iterations++;\n" + " }\n" + "}\n" + "B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal,float *depth, b3Float4 *dir, b3Float4 *pos)\n" + "{\n" + " // Touching contact on portal's v1 - so depth is zero and direction\n" + " // is unimportant and pos can be guessed\n" + " *depth = 0.f;\n" + " b3Float4 zero = b3MakeFloat4(0,0,0,0);\n" + " b3Float4* b3mpr_vec3_origin = &zero;\n" + " 
b3MprVec3Copy(dir, b3mpr_vec3_origin);\n" + " b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n" + " b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n" + " b3MprVec3Scale(pos, 0.5);\n" + "}\n" + "B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal,\n" + " float *depth, b3Float4 *dir, b3Float4 *pos)\n" + "{\n" + " \n" + " // Origin lies on v0-v1 segment.\n" + " // Depth is distance to v1, direction also and position must be\n" + " // computed\n" + " b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);\n" + " b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);\n" + " b3MprVec3Scale(pos, 0.5f);\n" + " \n" + " b3MprVec3Copy(dir, &b3MprSimplexPoint(portal, 1)->v);\n" + " *depth = B3_MPR_SQRT(b3MprVec3Len2(dir));\n" + " b3MprVec3Normalize(dir);\n" + "}\n" + "inline int b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB,\n" + " b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,\n" + " b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, \n" + " b3ConstArray(b3Collidable_t) cpuCollidables,\n" + " b3ConstArray(b3Float4) cpuVertices,\n" + " __global b3Float4* sepAxis,\n" + " __global int* hasSepAxis,\n" + " float *depthOut, b3Float4* dirOut, b3Float4* posOut)\n" + "{\n" + " \n" + " b3MprSimplex_t portal;\n" + " \n" + "// if (!hasSepAxis[pairIndex])\n" + " // return -1;\n" + " \n" + " hasSepAxis[pairIndex] = 0;\n" + " int res;\n" + " // Phase 1: Portal discovery\n" + " res = b3DiscoverPortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,hasSepAxis, &portal);\n" + " \n" + " \n" + " //sepAxis[pairIndex] = *pdir;//or -dir?\n" + " switch (res)\n" + " {\n" + " case 0:\n" + " {\n" + " // Phase 2: Portal refinement\n" + " \n" + " res = b3RefinePortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal);\n" + " if (res < 0)\n" + " return -1;\n" + " // Phase 3. 
Penetration info\n" + " b3FindPenetr(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal, depthOut, dirOut, posOut);\n" + " hasSepAxis[pairIndex] = 1;\n" + " sepAxis[pairIndex] = -*dirOut;\n" + " break;\n" + " }\n" + " case 1:\n" + " {\n" + " // Touching contact on portal's v1.\n" + " b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);\n" + " break;\n" + " }\n" + " case 2:\n" + " {\n" + " \n" + " b3FindPenetrSegment( &portal, depthOut, dirOut, posOut);\n" + " break;\n" + " }\n" + " default:\n" + " {\n" + " hasSepAxis[pairIndex]=0;\n" + " //if (res < 0)\n" + " //{\n" + " // Origin isn't inside portal - no collision.\n" + " return -1;\n" + " //}\n" + " }\n" + " };\n" + " \n" + " return 0;\n" + "};\n" + "#endif //B3_MPR_PENETRATION_H\n" + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + "inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" + "#ifdef 
cl_ext_atomic_counters_32\n" + " #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + " #define counter32_t volatile __global int*\n" + "#endif\n" + "__kernel void mprPenetrationKernel( __global int4* pairs,\n" + " __global const b3RigidBodyData_t* rigidBodies, \n" + " __global const b3Collidable_t* collidables,\n" + " __global const b3ConvexPolyhedronData_t* convexShapes, \n" + " __global const float4* vertices,\n" + " __global float4* separatingNormals,\n" + " __global int* hasSeparatingAxis,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int contactCapacity,\n" + " int numPairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " if (i<numPairs)\n" + " {\n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " \n" + " //once the broadphase avoids static-static pairs, we can remove this test\n" + " if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" + " {\n" + " return;\n" + " }\n" + " \n" + " if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" + " {\n" + " return;\n" + " }\n" + " float depthOut;\n" + " b3Float4 dirOut;\n" + " b3Float4 posOut;\n" + " int res = b3MprPenetration(pairIndex, bodyIndexA, bodyIndexB,rigidBodies,convexShapes,collidables,vertices,separatingNormals,hasSeparatingAxis,&depthOut, &dirOut, &posOut);\n" + " \n" + " \n" + " \n" + " \n" + " if (res==0)\n" + " {\n" + " //add a contact\n" + " int dstIdx;\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " if (dstIdx<contactCapacity)\n" + " {\n" + " 
pairs[pairIndex].z = dstIdx;\n" + " __global struct b3Contact4Data* c = globalContactsOut + dstIdx;\n" + " c->m_worldNormalOnB = -dirOut;//normal;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " int bodyA = pairs[pairIndex].x;\n" + " int bodyB = pairs[pairIndex].y;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " //for (int i=0;i<nContacts;i++)\n" + " posOut.w = -depthOut;\n" + " c->m_worldPosB[0] = posOut;//localPoints[contactIdx[i]];\n" + " GET_NPOINTS(*c) = 1;//nContacts;\n" + " }\n" + " }\n" + " }\n" + "}\n" + "typedef float4 Quaternion;\n" + "#define make_float4 (float4)\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + "}\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" + "{\n" + " return qtRotate( *orientation, *p ) + (*translation);\n" + "}\n" + "__inline\n" + "float4 qtInvRotate(const Quaternion q, float4 
vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "inline void project(__global const b3ConvexPolyhedronData_t* hull, const float4 pos, const float4 orn, \n" + "const float4* dir, __global const float4* vertices, float* min, float* max)\n" + "{\n" + " min[0] = FLT_MAX;\n" + " max[0] = -FLT_MAX;\n" + " int numVerts = hull->m_numVertices;\n" + " const float4 localDir = qtInvRotate(orn,*dir);\n" + " float offset = dot(pos,*dir);\n" + " for(int i=0;i<numVerts;i++)\n" + " {\n" + " float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" + " if(dp < min[0]) \n" + " min[0] = dp;\n" + " if(dp > max[0]) \n" + " max[0] = dp;\n" + " }\n" + " if(min[0]>max[0])\n" + " {\n" + " float tmp = min[0];\n" + " min[0] = max[0];\n" + " max[0] = tmp;\n" + " }\n" + " min[0] += offset;\n" + " max[0] += offset;\n" + "}\n" + "bool findSeparatingAxisUnitSphere( __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " __global const float4* vertices,\n" + " __global const float4* unitSphereDirections,\n" + " int numUnitSphereDirections,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " \n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " int curEdgeEdge = 0;\n" + " // Test unit sphere directions\n" + " for (int i=0;i<numUnitSphereDirections;i++)\n" + " {\n" + " float4 crossje;\n" + " crossje = unitSphereDirections[i]; \n" + " if (dot3F4(DeltaC2,crossje)>0)\n" + " crossje *= -1.f;\n" + " {\n" + " float dist;\n" + " bool result = true;\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" + " project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" + " \n" + " if(Max0<Min1 || Max1<Min0)\n" + " return false;\n" + " \n" + " float d0 = Max0 - 
Min1;\n" + " float d1 = Max1 - Min0;\n" + " dist = d0<d1 ? d0:d1;\n" + " result = true;\n" + " \n" + " if(dist<*dmin)\n" + " {\n" + " *dmin = dist;\n" + " *sep = crossje;\n" + " }\n" + " }\n" + " }\n" + " \n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "__kernel void findSeparatingAxisUnitSphereKernel( __global const int4* pairs, \n" + " __global const b3RigidBodyData_t* rigidBodies, \n" + " __global const b3Collidable_t* collidables,\n" + " __global const b3ConvexPolyhedronData_t* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* unitSphereDirections,\n" + " __global float4* separatingNormals,\n" + " __global int* hasSeparatingAxis,\n" + " __global float* dmins,\n" + " int numUnitSphereDirections,\n" + " int numPairs\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " if (hasSeparatingAxis[i])\n" + " {\n" + " \n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " \n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " \n" + " float dmin = dmins[i];\n" + " \n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " 
float4 sepNormal = separatingNormals[i];\n" + " \n" + " int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n" + " if (numEdgeEdgeDirections>numUnitSphereDirections)\n" + " {\n" + " bool sepEE = findSeparatingAxisUnitSphere( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " vertices,unitSphereDirections,numUnitSphereDirections,&sepNormal,&dmin);\n" + " if (!sepEE)\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " } else\n" + " {\n" + " hasSeparatingAxis[i] = 1;\n" + " separatingNormals[i] = sepNormal;\n" + " }\n" + " }\n" + " } //if (hasSeparatingAxis[i])\n" + " }//(i<numPairs)\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h index b0103fe674..b2e0a2dd47 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h @@ -1,1289 +1,1288 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* primitiveContactsKernelsCL= \ -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef 
__cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return 
(int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#define SHAPE_CONVEX_HULL 3\n" -"#define SHAPE_PLANE 4\n" -"#define SHAPE_CONCAVE_TRIMESH 5\n" -"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" -"#define SHAPE_SPHERE 7\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile __global int*\n" -"#endif\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define max2 max\n" -"#define min2 min\n" -"typedef unsigned int u32;\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} btAabbCL;\n" -"///keep this in sync with btCollidable.h\n" -"typedef struct\n" -"{\n" -" 
int m_numChildShapes;\n" -" float m_radius;\n" -" int m_shapeType;\n" -" int m_shapeIndex;\n" -" \n" -"} btCollidableGpu;\n" -"typedef struct\n" -"{\n" -" float4 m_childPosition;\n" -" float4 m_childOrientation;\n" -" int m_shapeIndex;\n" -" int m_unused0;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"} btGpuChildShape;\n" -"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" float4 m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_collidableIdx; \n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} BodyData;\n" -"typedef struct \n" -"{\n" -" float4 m_localCenter;\n" -" float4 m_extents;\n" -" float4 mC;\n" -" float4 mE;\n" -" \n" -" float m_radius;\n" -" int m_faceOffset;\n" -" int m_numFaces;\n" -" int m_numVertices;\n" -" \n" -" int m_vertexOffset;\n" -" int m_uniqueEdgesOffset;\n" -" int m_numUniqueEdges;\n" -" int m_unused;\n" -"} ConvexPolyhedronCL;\n" -"typedef struct\n" -"{\n" -" float4 m_plane;\n" -" int m_indexOffset;\n" -" int m_numIndices;\n" -"} btGpuFace;\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_float4 (float4)\n" -"#define make_float2 (float2)\n" -"#define make_uint4 (uint4)\n" -"#define make_int4 (int4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"__inline\n" -"float fastDiv(float numerator, float denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"// return numerator/denominator; \n" -"}\n" -"__inline\n" -"float4 fastDiv4(float4 numerator, float4 denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"}\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"//#define dot3F4 dot\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float4 
fastNormalize4(float4 v)\n" -"{\n" -" return fast_normalize(v);\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec);\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in)\n" -"{\n" -" return fastNormalize4(in);\n" -"// in /= length( in );\n" -"// return in;\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -"float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"__inline\n" -"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" -"{\n" -" return qtRotate( *orientation, *p ) + (*translation);\n" -"}\n" -"void trInverse(float4 translationIn, Quaternion orientationIn,\n" -" float4* translationOut, Quaternion* orientationOut)\n" -"{\n" -" *orientationOut = qtInvert(orientationIn);\n" -" *translationOut = qtRotate(*orientationOut, -translationIn);\n" -"}\n" -"void trMul(float4 translationA, Quaternion orientationA,\n" -" float4 translationB, Quaternion orientationB,\n" -" float4* translationOut, Quaternion* orientationOut)\n" -"{\n" -" *orientationOut = 
qtMul(orientationA,orientationB);\n" -" *translationOut = transform(&translationB,&translationA,&orientationA);\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"}\n" -"__inline float4 lerp3(const float4 a,const float4 b, float t)\n" -"{\n" -" return make_float4( a.x + (b.x - a.x) * t,\n" -" a.y + (b.y - a.y) * t,\n" -" a.z + (b.z - a.z) * t,\n" -" 0.f);\n" -"}\n" -"float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)\n" -"{\n" -" float4 n = (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0);\n" -" float dist = dot3F4(n, point) + planeEqn.w;\n" -" *closestPointOnFace = point - dist * n;\n" -" return dist;\n" -"}\n" -"inline bool IsPointInPolygon(float4 p, \n" -" const btGpuFace* face,\n" -" __global const float4* baseVertex,\n" -" __global const int* convexIndices,\n" -" float4* out)\n" -"{\n" -" float4 a;\n" -" float4 b;\n" -" float4 ab;\n" -" float4 ap;\n" -" float4 v;\n" -" float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);\n" -" \n" -" if (face->m_numIndices<2)\n" -" return false;\n" -" \n" -" float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];\n" -" \n" -" b = v0;\n" -" for(unsigned i=0; i != face->m_numIndices; ++i)\n" -" {\n" -" a = b;\n" -" float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]];\n" -" b = vi;\n" -" ab = b-a;\n" -" ap = p-a;\n" -" v = cross3(ab,plane);\n" -" if (dot(ap, v) > 0.f)\n" -" {\n" -" float ab_m2 = dot(ab, ab);\n" -" float rt = ab_m2 != 0.f ? 
dot(ab, ap) / ab_m2 : 0.f;\n" -" if (rt <= 0.f)\n" -" {\n" -" *out = a;\n" -" }\n" -" else if (rt >= 1.f) \n" -" {\n" -" *out = b;\n" -" }\n" -" else\n" -" {\n" -" float s = 1.f - rt;\n" -" out[0].x = s * a.x + rt * b.x;\n" -" out[0].y = s * a.y + rt * b.y;\n" -" out[0].z = s * a.z + rt * b.z;\n" -" }\n" -" return false;\n" -" }\n" -" }\n" -" return true;\n" -"}\n" -"void computeContactSphereConvex(int pairIndex,\n" -" int bodyIndexA, int bodyIndexB, \n" -" int collidableIndexA, int collidableIndexB, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes,\n" -" __global const float4* convexVertices,\n" -" __global const int* convexIndices,\n" -" __global const btGpuFace* faces,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int maxContactCapacity,\n" -" float4 spherePos2,\n" -" float radius,\n" -" float4 pos,\n" -" float4 quat\n" -" )\n" -"{\n" -" float4 invPos;\n" -" float4 invOrn;\n" -" trInverse(pos,quat, &invPos,&invOrn);\n" -" float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" -" int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n" -" int numFaces = convexShapes[shapeIndex].m_numFaces;\n" -" float4 closestPnt = (float4)(0, 0, 0, 0);\n" -" float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n" -" float minDist = -1000000.f;\n" -" bool bCollide = true;\n" -" for ( int f = 0; f < numFaces; f++ )\n" -" {\n" -" btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n" -" // set up a plane equation \n" -" float4 planeEqn;\n" -" float4 n1 = face.m_plane;\n" -" n1.w = 0.f;\n" -" planeEqn = n1;\n" -" planeEqn.w = face.m_plane.w;\n" -" \n" -" \n" -" // compute a signed distance from the vertex in cloth to the face of rigidbody.\n" -" float4 pntReturn;\n" -" float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);\n" -" // If the distance is positive, the plane is a 
separating plane. \n" -" if ( dist > radius )\n" -" {\n" -" bCollide = false;\n" -" break;\n" -" }\n" -" if (dist>0)\n" -" {\n" -" //might hit an edge or vertex\n" -" float4 out;\n" -" float4 zeroPos = make_float4(0,0,0,0);\n" -" bool isInPoly = IsPointInPolygon(spherePos,\n" -" &face,\n" -" &convexVertices[convexShapes[shapeIndex].m_vertexOffset],\n" -" convexIndices,\n" -" &out);\n" -" if (isInPoly)\n" -" {\n" -" if (dist>minDist)\n" -" {\n" -" minDist = dist;\n" -" closestPnt = pntReturn;\n" -" hitNormalWorld = planeEqn;\n" -" \n" -" }\n" -" } else\n" -" {\n" -" float4 tmp = spherePos-out;\n" -" float l2 = dot(tmp,tmp);\n" -" if (l2<radius*radius)\n" -" {\n" -" dist = sqrt(l2);\n" -" if (dist>minDist)\n" -" {\n" -" minDist = dist;\n" -" closestPnt = out;\n" -" hitNormalWorld = tmp/dist;\n" -" \n" -" }\n" -" \n" -" } else\n" -" {\n" -" bCollide = false;\n" -" break;\n" -" }\n" -" }\n" -" } else\n" -" {\n" -" if ( dist > minDist )\n" -" {\n" -" minDist = dist;\n" -" closestPnt = pntReturn;\n" -" hitNormalWorld.xyz = planeEqn.xyz;\n" -" }\n" -" }\n" -" \n" -" }\n" -" \n" -" if (bCollide && minDist > -10000)\n" -" {\n" -" float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n" -" float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n" -" \n" -" float actualDepth = minDist-radius;\n" -" if (actualDepth<=0.f)\n" -" {\n" -" \n" -" pOnB1.w = actualDepth;\n" -" int dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" \n" -" \n" -" if (1)//dstIdx < maxContactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" -" c->m_worldNormalOnB = -normalOnSurfaceB1;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" -" c->m_worldPosB[0] = pOnB1;\n" -" c->m_childIndexA = -1;\n" -" 
c->m_childIndexB = -1;\n" -" GET_NPOINTS(*c) = 1;\n" -" } \n" -" }\n" -" }//if (hasCollision)\n" -"}\n" -" \n" -"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" -"{\n" -" if( nPoints == 0 )\n" -" return 0;\n" -" \n" -" if (nPoints <=4)\n" -" return nPoints;\n" -" \n" -" \n" -" if (nPoints >64)\n" -" nPoints = 64;\n" -" \n" -" float4 center = make_float4(0.f);\n" -" {\n" -" \n" -" for (int i=0;i<nPoints;i++)\n" -" center += p[i];\n" -" center /= (float)nPoints;\n" -" }\n" -" \n" -" \n" -" \n" -" // sample 4 directions\n" -" \n" -" float4 aVector = p[0] - center;\n" -" float4 u = cross3( nearNormal, aVector );\n" -" float4 v = cross3( nearNormal, u );\n" -" u = normalize3( u );\n" -" v = normalize3( v );\n" -" \n" -" \n" -" //keep point with deepest penetration\n" -" float minW= FLT_MAX;\n" -" \n" -" int minIndex=-1;\n" -" \n" -" float4 maxDots;\n" -" maxDots.x = FLT_MIN;\n" -" maxDots.y = FLT_MIN;\n" -" maxDots.z = FLT_MIN;\n" -" maxDots.w = FLT_MIN;\n" -" \n" -" // idx, distance\n" -" for(int ie = 0; ie<nPoints; ie++ )\n" -" {\n" -" if (p[ie].w<minW)\n" -" {\n" -" minW = p[ie].w;\n" -" minIndex=ie;\n" -" }\n" -" float f;\n" -" float4 r = p[ie]-center;\n" -" f = dot3F4( u, r );\n" -" if (f<maxDots.x)\n" -" {\n" -" maxDots.x = f;\n" -" contactIdx[0].x = ie;\n" -" }\n" -" \n" -" f = dot3F4( -u, r );\n" -" if (f<maxDots.y)\n" -" {\n" -" maxDots.y = f;\n" -" contactIdx[0].y = ie;\n" -" }\n" -" \n" -" \n" -" f = dot3F4( v, r );\n" -" if (f<maxDots.z)\n" -" {\n" -" maxDots.z = f;\n" -" contactIdx[0].z = ie;\n" -" }\n" -" \n" -" f = dot3F4( -v, r );\n" -" if (f<maxDots.w)\n" -" {\n" -" maxDots.w = f;\n" -" contactIdx[0].w = ie;\n" -" }\n" -" \n" -" }\n" -" \n" -" if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" -" {\n" -" //replace the first contact with minimum (todo: replace contact with least penetration)\n" -" contactIdx[0].x = 
minIndex;\n" -" }\n" -" \n" -" return 4;\n" -" \n" -"}\n" -"#define MAX_PLANE_CONVEX_POINTS 64\n" -"int computeContactPlaneConvex(int pairIndex,\n" -" int bodyIndexA, int bodyIndexB, \n" -" int collidableIndexA, int collidableIndexB, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu*collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes,\n" -" __global const float4* convexVertices,\n" -" __global const int* convexIndices,\n" -" __global const btGpuFace* faces,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int maxContactCapacity,\n" -" float4 posB,\n" -" Quaternion ornB\n" -" )\n" -"{\n" -" int resultIndex=-1;\n" -" int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n" -" __global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];\n" -" \n" -" float4 posA;\n" -" posA = rigidBodies[bodyIndexA].m_pos;\n" -" Quaternion ornA;\n" -" ornA = rigidBodies[bodyIndexA].m_quat;\n" -" int numContactsOut = 0;\n" -" int numWorldVertsB1= 0;\n" -" float4 planeEq;\n" -" planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" -" float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n" -" float4 planeNormalWorld;\n" -" planeNormalWorld = qtRotate(ornA,planeNormal);\n" -" float planeConstant = planeEq.w;\n" -" \n" -" float4 invPosA;Quaternion invOrnA;\n" -" float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n" -" {\n" -" \n" -" trInverse(posA,ornA,&invPosA,&invOrnA);\n" -" trMul(invPosA,invOrnA,posB,ornB,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" -" }\n" -" float4 invPosB;Quaternion invOrnB;\n" -" float4 planeInConvexPos1; Quaternion planeInConvexOrn1;\n" -" {\n" -" \n" -" trInverse(posB,ornB,&invPosB,&invOrnB);\n" -" trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1); \n" -" }\n" -" \n" -" float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);\n" -" float maxDot = 
-1e30;\n" -" int hitVertex=-1;\n" -" float4 hitVtx;\n" -" float4 contactPoints[MAX_PLANE_CONVEX_POINTS];\n" -" int numPoints = 0;\n" -" int4 contactIdx;\n" -" contactIdx=make_int4(0,1,2,3);\n" -" \n" -" \n" -" for (int i=0;i<hullB->m_numVertices;i++)\n" -" {\n" -" float4 vtx = convexVertices[hullB->m_vertexOffset+i];\n" -" float curDot = dot(vtx,planeNormalInConvex);\n" -" if (curDot>maxDot)\n" -" {\n" -" hitVertex=i;\n" -" maxDot=curDot;\n" -" hitVtx = vtx;\n" -" //make sure the deepest points is always included\n" -" if (numPoints==MAX_PLANE_CONVEX_POINTS)\n" -" numPoints--;\n" -" }\n" -" if (numPoints<MAX_PLANE_CONVEX_POINTS)\n" -" {\n" -" float4 vtxWorld = transform(&vtx, &posB, &ornB);\n" -" float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld;\n" -" float dist = dot(planeNormal,vtxInPlane)-planeConstant;\n" -" if (dist<0.f)\n" -" {\n" -" vtxWorld.w = dist;\n" -" contactPoints[numPoints] = vtxWorld;\n" -" numPoints++;\n" -" }\n" -" }\n" -" }\n" -" int numReducedPoints = numPoints;\n" -" if (numPoints>4)\n" -" {\n" -" numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);\n" -" }\n" -" if (numReducedPoints>0)\n" -" {\n" -" int dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" if (dstIdx < maxContactCapacity)\n" -" {\n" -" resultIndex = dstIdx;\n" -" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" -" c->m_worldNormalOnB = -planeNormalWorld;\n" -" //c->setFrictionCoeff(0.7);\n" -" //c->setRestituitionCoeff(0.f);\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = -1;\n" -" switch (numReducedPoints)\n" -" {\n" -" case 4:\n" -" 
c->m_worldPosB[3] = contactPoints[contactIdx.w];\n" -" case 3:\n" -" c->m_worldPosB[2] = contactPoints[contactIdx.z];\n" -" case 2:\n" -" c->m_worldPosB[1] = contactPoints[contactIdx.y];\n" -" case 1:\n" -" c->m_worldPosB[0] = contactPoints[contactIdx.x];\n" -" default:\n" -" {\n" -" }\n" -" };\n" -" \n" -" GET_NPOINTS(*c) = numReducedPoints;\n" -" }//if (dstIdx < numPairs)\n" -" } \n" -" return resultIndex;\n" -"}\n" -"void computeContactPlaneSphere(int pairIndex,\n" -" int bodyIndexA, int bodyIndexB, \n" -" int collidableIndexA, int collidableIndexB, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const btGpuFace* faces,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int maxContactCapacity)\n" -"{\n" -" float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" -" float radius = collidables[collidableIndexB].m_radius;\n" -" float4 posA1 = rigidBodies[bodyIndexA].m_pos;\n" -" float4 ornA1 = rigidBodies[bodyIndexA].m_quat;\n" -" float4 posB1 = rigidBodies[bodyIndexB].m_pos;\n" -" float4 ornB1 = rigidBodies[bodyIndexB].m_quat;\n" -" \n" -" bool hasCollision = false;\n" -" float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n" -" float planeConstant = planeEq.w;\n" -" float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n" -" {\n" -" float4 invPosA;Quaternion invOrnA;\n" -" trInverse(posA1,ornA1,&invPosA,&invOrnA);\n" -" trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" -" }\n" -" float4 planeInConvexPos1; Quaternion planeInConvexOrn1;\n" -" {\n" -" float4 invPosB;Quaternion invOrnB;\n" -" trInverse(posB1,ornB1,&invPosB,&invOrnB);\n" -" trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1); \n" -" }\n" -" float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius;\n" -" float4 vtxInPlane1 = 
transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" -" float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant;\n" -" hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();\n" -" if (hasCollision)\n" -" {\n" -" float4 vtxInPlaneProjected1 = vtxInPlane1 - distance*planeNormal1;\n" -" float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1);\n" -" float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);\n" -" float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;\n" -" pOnB1.w = distance;\n" -" int dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" \n" -" if (dstIdx < maxContactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" -" c->m_worldNormalOnB = -normalOnSurfaceB1;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" -" c->m_worldPosB[0] = pOnB1;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = -1;\n" -" GET_NPOINTS(*c) = 1;\n" -" }//if (dstIdx < numPairs)\n" -" }//if (hasCollision)\n" -"}\n" -"__kernel void primitiveContactsKernel( __global int4* pairs, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int numPairs, int maxContactCapacity)\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" float4 worldVertsB1[64];\n" -" float4 worldVertsB2[64];\n" -" int capacityWorldVerts = 64; \n" -" float4 
localContactsOut[64];\n" -" int localContactCapacity=64;\n" -" \n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" if (i<numPairs)\n" -" {\n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n" -" {\n" -" float4 posB;\n" -" posB = rigidBodies[bodyIndexB].m_pos;\n" -" Quaternion ornB;\n" -" ornB = rigidBodies[bodyIndexB].m_quat;\n" -" int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,\n" -" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB);\n" -" if (contactIndex>=0)\n" -" pairs[pairIndex].z = contactIndex;\n" -" return;\n" -" }\n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n" -" {\n" -" float4 posA;\n" -" posA = rigidBodies[bodyIndexA].m_pos;\n" -" Quaternion ornA;\n" -" ornA = rigidBodies[bodyIndexA].m_quat;\n" -" int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,\n" -" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n" -" if (contactIndex>=0)\n" -" pairs[pairIndex].z = contactIndex;\n" -" return;\n" -" }\n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" -" {\n" -" computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" -" rigidBodies,collidables,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n" -" 
return;\n" -" }\n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n" -" {\n" -" computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" -" rigidBodies,collidables,\n" -" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n" -" return;\n" -" }\n" -" \n" -" \n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n" -" {\n" -" \n" -" float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n" -" float sphereRadius = collidables[collidableIndexA].m_radius;\n" -" float4 convexPos = rigidBodies[bodyIndexB].m_pos;\n" -" float4 convexOrn = rigidBodies[bodyIndexB].m_quat;\n" -" computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" -" spherePos,sphereRadius,convexPos,convexOrn);\n" -" return;\n" -" }\n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" -" {\n" -" \n" -" float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" -" float sphereRadius = collidables[collidableIndexB].m_radius;\n" -" float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n" -" float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n" -" computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" -" spherePos,sphereRadius,convexPos,convexOrn);\n" -" return;\n" -" }\n" -" \n" -" \n" -" \n" -" \n" -" \n" -" \n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" -" {\n" -" 
//sphere-sphere\n" -" float radiusA = collidables[collidableIndexA].m_radius;\n" -" float radiusB = collidables[collidableIndexB].m_radius;\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" float4 diff = posA-posB;\n" -" float len = length(diff);\n" -" \n" -" ///iff distance positive, don't generate a new contact\n" -" if ( len <= (radiusA+radiusB))\n" -" {\n" -" ///distance (negative means penetration)\n" -" float dist = len - (radiusA+radiusB);\n" -" float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" -" if (len > 0.00001)\n" -" {\n" -" normalOnSurfaceB = diff / len;\n" -" }\n" -" float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n" -" contactPosB.w = dist;\n" -" \n" -" int dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" \n" -" if (dstIdx < maxContactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" -" c->m_worldNormalOnB = normalOnSurfaceB;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" int bodyA = pairs[pairIndex].x;\n" -" int bodyB = pairs[pairIndex].y;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" -" c->m_worldPosB[0] = contactPosB;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = -1;\n" -" GET_NPOINTS(*c) = 1;\n" -" }//if (dstIdx < numPairs)\n" -" }//if ( len <= (radiusA+radiusB))\n" -" return;\n" -" }//SHAPE_SPHERE SHAPE_SPHERE\n" -" }// if (i<numPairs)\n" -"}\n" -"// work-in-progress\n" -"__kernel void processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* 
faces,\n" -" __global const int* indices,\n" -" __global btAabbCL* aabbs,\n" -" __global const btGpuChildShape* gpuChildShapes,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int numCompoundPairs, int maxContactCapacity\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i<numCompoundPairs)\n" -" {\n" -" int bodyIndexA = gpuCompoundPairs[i].x;\n" -" int bodyIndexB = gpuCompoundPairs[i].y;\n" -" int childShapeIndexA = gpuCompoundPairs[i].z;\n" -" int childShapeIndexB = gpuCompoundPairs[i].w;\n" -" \n" -" int collidableIndexA = -1;\n" -" int collidableIndexB = -1;\n" -" \n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" \n" -" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" \n" -" if (childShapeIndexA >= 0)\n" -" {\n" -" collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" -" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" -" float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" -" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" -" float4 newOrnA = qtMul(ornA,childOrnA);\n" -" posA = newPosA;\n" -" ornA = newOrnA;\n" -" } else\n" -" {\n" -" collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" }\n" -" \n" -" if (childShapeIndexB>=0)\n" -" {\n" -" collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" posB = newPosB;\n" -" ornB = newOrnB;\n" -" } else\n" -" {\n" -" collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" -" }\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = 
collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n" -" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" -" int pairIndex = i;\n" -" if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL))\n" -" {\n" -" computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB, collidableIndexA,collidableIndexB, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,\n" -" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);\n" -" return;\n" -" }\n" -" if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE))\n" -" {\n" -" computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,\n" -" faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n" -" return;\n" -" }\n" -" if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE))\n" -" {\n" -" float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" -" float sphereRadius = collidables[collidableIndexB].m_radius;\n" -" float4 convexPos = posA;\n" -" float4 convexOrn = ornA;\n" -" \n" -" computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" -" spherePos,sphereRadius,convexPos,convexOrn);\n" -" \n" -" return;\n" -" }\n" -" if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL))\n" -" {\n" -" float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n" -" float sphereRadius = collidables[collidableIndexA].m_radius;\n" -" float4 convexPos = posB;\n" -" float4 convexOrn = ornB;\n" -" \n" -" computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" -" rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, 
nGlobalContactsOut,maxContactCapacity,\n" -" spherePos,sphereRadius,convexPos,convexOrn);\n" -" \n" -" return;\n" -" }\n" -" }// if (i<numCompoundPairs)\n" -"}\n" -"bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p )\n" -"{\n" -" const float4* p1 = &vertices[0];\n" -" const float4* p2 = &vertices[1];\n" -" const float4* p3 = &vertices[2];\n" -" float4 edge1; edge1 = (*p2 - *p1);\n" -" float4 edge2; edge2 = ( *p3 - *p2 );\n" -" float4 edge3; edge3 = ( *p1 - *p3 );\n" -" \n" -" float4 p1_to_p; p1_to_p = ( *p - *p1 );\n" -" float4 p2_to_p; p2_to_p = ( *p - *p2 );\n" -" float4 p3_to_p; p3_to_p = ( *p - *p3 );\n" -" float4 edge1_normal; edge1_normal = ( cross(edge1,*normal));\n" -" float4 edge2_normal; edge2_normal = ( cross(edge2,*normal));\n" -" float4 edge3_normal; edge3_normal = ( cross(edge3,*normal));\n" -" \n" -" \n" -" float r1, r2, r3;\n" -" r1 = dot(edge1_normal,p1_to_p );\n" -" r2 = dot(edge2_normal,p2_to_p );\n" -" r3 = dot(edge3_normal,p3_to_p );\n" -" \n" -" if ( r1 > 0 && r2 > 0 && r3 > 0 )\n" -" return true;\n" -" if ( r1 <= 0 && r2 <= 0 && r3 <= 0 ) \n" -" return true;\n" -" return false;\n" -"}\n" -"float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) \n" -"{\n" -" float4 diff = p - from;\n" -" float4 v = to - from;\n" -" float t = dot(v,diff);\n" -" \n" -" if (t > 0) \n" -" {\n" -" float dotVV = dot(v,v);\n" -" if (t < dotVV) \n" -" {\n" -" t /= dotVV;\n" -" diff -= t*v;\n" -" } else \n" -" {\n" -" t = 1;\n" -" diff -= v;\n" -" }\n" -" } else\n" -" {\n" -" t = 0;\n" -" }\n" -" *nearest = from + t*v;\n" -" return dot(diff,diff); \n" -"}\n" -"void computeContactSphereTriangle(int pairIndex,\n" -" int bodyIndexA, int bodyIndexB,\n" -" int collidableIndexA, int collidableIndexB, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" const float4* triangleVertices,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t 
nGlobalContactsOut,\n" -" int maxContactCapacity,\n" -" float4 spherePos2,\n" -" float radius,\n" -" float4 pos,\n" -" float4 quat,\n" -" int faceIndex\n" -" )\n" -"{\n" -" float4 invPos;\n" -" float4 invOrn;\n" -" trInverse(pos,quat, &invPos,&invOrn);\n" -" float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" -" int numFaces = 3;\n" -" float4 closestPnt = (float4)(0, 0, 0, 0);\n" -" float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n" -" float minDist = -1000000.f;\n" -" bool bCollide = false;\n" -" \n" -" //////////////////////////////////////\n" -" float4 sphereCenter;\n" -" sphereCenter = spherePos;\n" -" const float4* vertices = triangleVertices;\n" -" float contactBreakingThreshold = 0.f;//todo?\n" -" float radiusWithThreshold = radius + contactBreakingThreshold;\n" -" float4 edge10;\n" -" edge10 = vertices[1]-vertices[0];\n" -" edge10.w = 0.f;//is this needed?\n" -" float4 edge20;\n" -" edge20 = vertices[2]-vertices[0];\n" -" edge20.w = 0.f;//is this needed?\n" -" float4 normal = cross3(edge10,edge20);\n" -" normal = normalize(normal);\n" -" float4 p1ToCenter;\n" -" p1ToCenter = sphereCenter - vertices[0];\n" -" \n" -" float distanceFromPlane = dot(p1ToCenter,normal);\n" -" if (distanceFromPlane < 0.f)\n" -" {\n" -" //triangle facing the other way\n" -" distanceFromPlane *= -1.f;\n" -" normal *= -1.f;\n" -" }\n" -" hitNormalWorld = normal;\n" -" bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold;\n" -" \n" -" // Check for contact / intersection\n" -" bool hasContact = false;\n" -" float4 contactPoint;\n" -" if (isInsideContactPlane) \n" -" {\n" -" \n" -" if (pointInTriangle(vertices,&normal, &sphereCenter)) \n" -" {\n" -" // Inside the contact wedge - touches a point on the shell plane\n" -" hasContact = true;\n" -" contactPoint = sphereCenter - normal*distanceFromPlane;\n" -" \n" -" } else {\n" -" // Could be inside one of the contact capsules\n" -" float contactCapsuleRadiusSqr = radiusWithThreshold*radiusWithThreshold;\n" -" float4 
nearestOnEdge;\n" -" int numEdges = 3;\n" -" for (int i = 0; i < numEdges; i++) \n" -" {\n" -" float4 pa =vertices[i];\n" -" float4 pb = vertices[(i+1)%3];\n" -" float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge);\n" -" if (distanceSqr < contactCapsuleRadiusSqr) \n" -" {\n" -" // Yep, we're inside a capsule\n" -" hasContact = true;\n" -" contactPoint = nearestOnEdge;\n" -" \n" -" }\n" -" \n" -" }\n" -" }\n" -" }\n" -" if (hasContact) \n" -" {\n" -" closestPnt = contactPoint;\n" -" float4 contactToCenter = sphereCenter - contactPoint;\n" -" minDist = length(contactToCenter);\n" -" if (minDist>FLT_EPSILON)\n" -" {\n" -" hitNormalWorld = normalize(contactToCenter);//*(1./minDist);\n" -" bCollide = true;\n" -" }\n" -" \n" -" }\n" -" /////////////////////////////////////\n" -" if (bCollide && minDist > -10000)\n" -" {\n" -" \n" -" float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n" -" float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n" -" float actualDepth = minDist-radius;\n" -" \n" -" if (actualDepth<=0.f)\n" -" {\n" -" pOnB1.w = actualDepth;\n" -" int dstIdx;\n" -" \n" -" float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);\n" -" if (lenSqr>FLT_EPSILON)\n" -" {\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" \n" -" if (dstIdx < maxContactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" -" c->m_worldNormalOnB = -normalOnSurfaceB1;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" -" c->m_worldPosB[0] = pOnB1;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = faceIndex;\n" -" GET_NPOINTS(*c) = 1;\n" -" } \n" -" }\n" -" }\n" -" }//if (hasCollision)\n" -"}\n" -"// work-in-progress\n" -"__kernel void 
findConcaveSphereContactsKernel( __global int4* concavePairs,\n" -" __global const BodyData* rigidBodies,\n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global btAabbCL* aabbs,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int numConcavePairs, int maxContactCapacity\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numConcavePairs)\n" -" return;\n" -" int pairIdx = i;\n" -" int bodyIndexA = concavePairs[i].x;\n" -" int bodyIndexB = concavePairs[i].y;\n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n" -" {\n" -" int f = concavePairs[i].z;\n" -" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" -" \n" -" float4 verticesA[3];\n" -" for (int i=0;i<3;i++)\n" -" {\n" -" int index = indices[face.m_indexOffset+i];\n" -" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" -" verticesA[i] = vert;\n" -" }\n" -" float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" -" float sphereRadius = collidables[collidableIndexB].m_radius;\n" -" float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n" -" float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n" -" computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" -" rigidBodies,collidables,\n" -" verticesA,\n" -" globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" -" spherePos,sphereRadius,convexPos,convexOrn, f);\n" -" return;\n" -" }\n" -"}\n" -; +static const char* 
primitiveContactsKernelsCL = + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" 
+ " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + "inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#define SHAPE_CONVEX_HULL 3\n" + "#define SHAPE_PLANE 4\n" + "#define SHAPE_CONCAVE_TRIMESH 5\n" + "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" + "#define SHAPE_SPHERE 7\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile __global int*\n" + "#endif\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GET_NUM_GROUPS 
get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define max2 max\n" + "#define min2 min\n" + "typedef unsigned int u32;\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " };\n" + "} btAabbCL;\n" + "///keep this in sync with btCollidable.h\n" + "typedef struct\n" + "{\n" + " int m_numChildShapes;\n" + " float m_radius;\n" + " int m_shapeType;\n" + " int m_shapeIndex;\n" + " \n" + "} btCollidableGpu;\n" + "typedef struct\n" + "{\n" + " float4 m_childPosition;\n" + " float4 m_childOrientation;\n" + " int m_shapeIndex;\n" + " int m_unused0;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "} btGpuChildShape;\n" + "#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " float4 m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_collidableIdx; \n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} BodyData;\n" + "typedef struct \n" + "{\n" + " float4 m_localCenter;\n" + " float4 m_extents;\n" + " float4 mC;\n" + " float4 mE;\n" + " \n" + " float m_radius;\n" + " int m_faceOffset;\n" + " int m_numFaces;\n" + " int m_numVertices;\n" + " \n" + " int m_vertexOffset;\n" + " int m_uniqueEdgesOffset;\n" + " int m_numUniqueEdges;\n" + " int m_unused;\n" + "} ConvexPolyhedronCL;\n" + "typedef struct\n" + "{\n" + " float4 m_plane;\n" + " int m_indexOffset;\n" + " int 
m_numIndices;\n" + "} btGpuFace;\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_float4 (float4)\n" + "#define make_float2 (float2)\n" + "#define make_uint4 (uint4)\n" + "#define make_int4 (int4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "__inline\n" + "float fastDiv(float numerator, float denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "// return numerator/denominator; \n" + "}\n" + "__inline\n" + "float4 fastDiv4(float4 numerator, float4 denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "}\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + "}\n" + "//#define dot3F4 dot\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " return fast_normalize(v);\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b);\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec);\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fastNormalize4(in);\n" + "// in /= length( in );\n" + "// return in;\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" 
+ " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + "float4 qtInvRotate(const Quaternion q, float4 vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "__inline\n" + "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" + "{\n" + " return qtRotate( *orientation, *p ) + (*translation);\n" + "}\n" + "void trInverse(float4 translationIn, Quaternion orientationIn,\n" + " float4* translationOut, Quaternion* orientationOut)\n" + "{\n" + " *orientationOut = qtInvert(orientationIn);\n" + " *translationOut = qtRotate(*orientationOut, -translationIn);\n" + "}\n" + "void trMul(float4 translationA, Quaternion orientationA,\n" + " float4 translationB, Quaternion orientationB,\n" + " float4* translationOut, Quaternion* orientationOut)\n" + "{\n" + " *orientationOut = qtMul(orientationA,orientationB);\n" + " *translationOut = transform(&translationB,&translationA,&orientationA);\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "}\n" + "__inline float4 lerp3(const float4 a,const float4 b, float t)\n" + "{\n" + " return make_float4( a.x + (b.x - a.x) * t,\n" + " a.y + (b.y - a.y) * t,\n" + " a.z + (b.z - a.z) * t,\n" + " 0.f);\n" + "}\n" + "float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)\n" + "{\n" + " float4 n = (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0);\n" + " float dist = dot3F4(n, point) + planeEqn.w;\n" + " *closestPointOnFace = point - dist * n;\n" + " return dist;\n" + "}\n" + "inline bool IsPointInPolygon(float4 p, \n" + " const btGpuFace* face,\n" + " __global const float4* baseVertex,\n" + " __global const int* convexIndices,\n" + " float4* out)\n" + "{\n" + " float4 a;\n" + " 
float4 b;\n" + " float4 ab;\n" + " float4 ap;\n" + " float4 v;\n" + " float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);\n" + " \n" + " if (face->m_numIndices<2)\n" + " return false;\n" + " \n" + " float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];\n" + " \n" + " b = v0;\n" + " for(unsigned i=0; i != face->m_numIndices; ++i)\n" + " {\n" + " a = b;\n" + " float4 vi = baseVertex[convexIndices[face->m_indexOffset + i]];\n" + " b = vi;\n" + " ab = b-a;\n" + " ap = p-a;\n" + " v = cross3(ab,plane);\n" + " if (dot(ap, v) > 0.f)\n" + " {\n" + " float ab_m2 = dot(ab, ab);\n" + " float rt = ab_m2 != 0.f ? dot(ab, ap) / ab_m2 : 0.f;\n" + " if (rt <= 0.f)\n" + " {\n" + " *out = a;\n" + " }\n" + " else if (rt >= 1.f) \n" + " {\n" + " *out = b;\n" + " }\n" + " else\n" + " {\n" + " float s = 1.f - rt;\n" + " out[0].x = s * a.x + rt * b.x;\n" + " out[0].y = s * a.y + rt * b.y;\n" + " out[0].z = s * a.z + rt * b.z;\n" + " }\n" + " return false;\n" + " }\n" + " }\n" + " return true;\n" + "}\n" + "void computeContactSphereConvex(int pairIndex,\n" + " int bodyIndexA, int bodyIndexB, \n" + " int collidableIndexA, int collidableIndexB, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes,\n" + " __global const float4* convexVertices,\n" + " __global const int* convexIndices,\n" + " __global const btGpuFace* faces,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int maxContactCapacity,\n" + " float4 spherePos2,\n" + " float radius,\n" + " float4 pos,\n" + " float4 quat\n" + " )\n" + "{\n" + " float4 invPos;\n" + " float4 invOrn;\n" + " trInverse(pos,quat, &invPos,&invOrn);\n" + " float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" + " int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n" + " int numFaces = 
convexShapes[shapeIndex].m_numFaces;\n" + " float4 closestPnt = (float4)(0, 0, 0, 0);\n" + " float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n" + " float minDist = -1000000.f;\n" + " bool bCollide = true;\n" + " for ( int f = 0; f < numFaces; f++ )\n" + " {\n" + " btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n" + " // set up a plane equation \n" + " float4 planeEqn;\n" + " float4 n1 = face.m_plane;\n" + " n1.w = 0.f;\n" + " planeEqn = n1;\n" + " planeEqn.w = face.m_plane.w;\n" + " \n" + " \n" + " // compute a signed distance from the vertex in cloth to the face of rigidbody.\n" + " float4 pntReturn;\n" + " float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);\n" + " // If the distance is positive, the plane is a separating plane. \n" + " if ( dist > radius )\n" + " {\n" + " bCollide = false;\n" + " break;\n" + " }\n" + " if (dist>0)\n" + " {\n" + " //might hit an edge or vertex\n" + " float4 out;\n" + " float4 zeroPos = make_float4(0,0,0,0);\n" + " bool isInPoly = IsPointInPolygon(spherePos,\n" + " &face,\n" + " &convexVertices[convexShapes[shapeIndex].m_vertexOffset],\n" + " convexIndices,\n" + " &out);\n" + " if (isInPoly)\n" + " {\n" + " if (dist>minDist)\n" + " {\n" + " minDist = dist;\n" + " closestPnt = pntReturn;\n" + " hitNormalWorld = planeEqn;\n" + " \n" + " }\n" + " } else\n" + " {\n" + " float4 tmp = spherePos-out;\n" + " float l2 = dot(tmp,tmp);\n" + " if (l2<radius*radius)\n" + " {\n" + " dist = sqrt(l2);\n" + " if (dist>minDist)\n" + " {\n" + " minDist = dist;\n" + " closestPnt = out;\n" + " hitNormalWorld = tmp/dist;\n" + " \n" + " }\n" + " \n" + " } else\n" + " {\n" + " bCollide = false;\n" + " break;\n" + " }\n" + " }\n" + " } else\n" + " {\n" + " if ( dist > minDist )\n" + " {\n" + " minDist = dist;\n" + " closestPnt = pntReturn;\n" + " hitNormalWorld.xyz = planeEqn.xyz;\n" + " }\n" + " }\n" + " \n" + " }\n" + " \n" + " if (bCollide && minDist > -10000)\n" + " {\n" + " float4 normalOnSurfaceB1 = 
qtRotate(quat,-hitNormalWorld);\n" + " float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n" + " \n" + " float actualDepth = minDist-radius;\n" + " if (actualDepth<=0.f)\n" + " {\n" + " \n" + " pOnB1.w = actualDepth;\n" + " int dstIdx;\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " \n" + " \n" + " if (1)//dstIdx < maxContactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" + " c->m_worldNormalOnB = -normalOnSurfaceB1;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" + " c->m_worldPosB[0] = pOnB1;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " GET_NPOINTS(*c) = 1;\n" + " } \n" + " }\n" + " }//if (hasCollision)\n" + "}\n" + " \n" + "int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" + "{\n" + " if( nPoints == 0 )\n" + " return 0;\n" + " \n" + " if (nPoints <=4)\n" + " return nPoints;\n" + " \n" + " \n" + " if (nPoints >64)\n" + " nPoints = 64;\n" + " \n" + " float4 center = make_float4(0.f);\n" + " {\n" + " \n" + " for (int i=0;i<nPoints;i++)\n" + " center += p[i];\n" + " center /= (float)nPoints;\n" + " }\n" + " \n" + " \n" + " \n" + " // sample 4 directions\n" + " \n" + " float4 aVector = p[0] - center;\n" + " float4 u = cross3( nearNormal, aVector );\n" + " float4 v = cross3( nearNormal, u );\n" + " u = normalize3( u );\n" + " v = normalize3( v );\n" + " \n" + " \n" + " //keep point with deepest penetration\n" + " float minW= FLT_MAX;\n" + " \n" + " int minIndex=-1;\n" + " \n" + " float4 maxDots;\n" + " maxDots.x = FLT_MIN;\n" + " maxDots.y = FLT_MIN;\n" + " maxDots.z = FLT_MIN;\n" + " maxDots.w = FLT_MIN;\n" + " \n" + " // idx, distance\n" + " for(int ie = 0; ie<nPoints; ie++ )\n" + " 
{\n" + " if (p[ie].w<minW)\n" + " {\n" + " minW = p[ie].w;\n" + " minIndex=ie;\n" + " }\n" + " float f;\n" + " float4 r = p[ie]-center;\n" + " f = dot3F4( u, r );\n" + " if (f<maxDots.x)\n" + " {\n" + " maxDots.x = f;\n" + " contactIdx[0].x = ie;\n" + " }\n" + " \n" + " f = dot3F4( -u, r );\n" + " if (f<maxDots.y)\n" + " {\n" + " maxDots.y = f;\n" + " contactIdx[0].y = ie;\n" + " }\n" + " \n" + " \n" + " f = dot3F4( v, r );\n" + " if (f<maxDots.z)\n" + " {\n" + " maxDots.z = f;\n" + " contactIdx[0].z = ie;\n" + " }\n" + " \n" + " f = dot3F4( -v, r );\n" + " if (f<maxDots.w)\n" + " {\n" + " maxDots.w = f;\n" + " contactIdx[0].w = ie;\n" + " }\n" + " \n" + " }\n" + " \n" + " if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" + " {\n" + " //replace the first contact with minimum (todo: replace contact with least penetration)\n" + " contactIdx[0].x = minIndex;\n" + " }\n" + " \n" + " return 4;\n" + " \n" + "}\n" + "#define MAX_PLANE_CONVEX_POINTS 64\n" + "int computeContactPlaneConvex(int pairIndex,\n" + " int bodyIndexA, int bodyIndexB, \n" + " int collidableIndexA, int collidableIndexB, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu*collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes,\n" + " __global const float4* convexVertices,\n" + " __global const int* convexIndices,\n" + " __global const btGpuFace* faces,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int maxContactCapacity,\n" + " float4 posB,\n" + " Quaternion ornB\n" + " )\n" + "{\n" + " int resultIndex=-1;\n" + " int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n" + " __global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];\n" + " \n" + " float4 posA;\n" + " posA = rigidBodies[bodyIndexA].m_pos;\n" + " Quaternion ornA;\n" + " ornA = rigidBodies[bodyIndexA].m_quat;\n" + " int numContactsOut 
= 0;\n" + " int numWorldVertsB1= 0;\n" + " float4 planeEq;\n" + " planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" + " float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n" + " float4 planeNormalWorld;\n" + " planeNormalWorld = qtRotate(ornA,planeNormal);\n" + " float planeConstant = planeEq.w;\n" + " \n" + " float4 invPosA;Quaternion invOrnA;\n" + " float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n" + " {\n" + " \n" + " trInverse(posA,ornA,&invPosA,&invOrnA);\n" + " trMul(invPosA,invOrnA,posB,ornB,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" + " }\n" + " float4 invPosB;Quaternion invOrnB;\n" + " float4 planeInConvexPos1; Quaternion planeInConvexOrn1;\n" + " {\n" + " \n" + " trInverse(posB,ornB,&invPosB,&invOrnB);\n" + " trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1); \n" + " }\n" + " \n" + " float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);\n" + " float maxDot = -1e30;\n" + " int hitVertex=-1;\n" + " float4 hitVtx;\n" + " float4 contactPoints[MAX_PLANE_CONVEX_POINTS];\n" + " int numPoints = 0;\n" + " int4 contactIdx;\n" + " contactIdx=make_int4(0,1,2,3);\n" + " \n" + " \n" + " for (int i=0;i<hullB->m_numVertices;i++)\n" + " {\n" + " float4 vtx = convexVertices[hullB->m_vertexOffset+i];\n" + " float curDot = dot(vtx,planeNormalInConvex);\n" + " if (curDot>maxDot)\n" + " {\n" + " hitVertex=i;\n" + " maxDot=curDot;\n" + " hitVtx = vtx;\n" + " //make sure the deepest points is always included\n" + " if (numPoints==MAX_PLANE_CONVEX_POINTS)\n" + " numPoints--;\n" + " }\n" + " if (numPoints<MAX_PLANE_CONVEX_POINTS)\n" + " {\n" + " float4 vtxWorld = transform(&vtx, &posB, &ornB);\n" + " float4 vtxInPlane = transform(&vtxWorld, &invPosA, &invOrnA);//oplaneTransform.inverse()*vtxWorld;\n" + " float dist = dot(planeNormal,vtxInPlane)-planeConstant;\n" + " if (dist<0.f)\n" + " {\n" + " vtxWorld.w = dist;\n" + " contactPoints[numPoints] = vtxWorld;\n" + " 
numPoints++;\n" + " }\n" + " }\n" + " }\n" + " int numReducedPoints = numPoints;\n" + " if (numPoints>4)\n" + " {\n" + " numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);\n" + " }\n" + " if (numReducedPoints>0)\n" + " {\n" + " int dstIdx;\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " if (dstIdx < maxContactCapacity)\n" + " {\n" + " resultIndex = dstIdx;\n" + " __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" + " c->m_worldNormalOnB = -planeNormalWorld;\n" + " //c->setFrictionCoeff(0.7);\n" + " //c->setRestituitionCoeff(0.f);\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " switch (numReducedPoints)\n" + " {\n" + " case 4:\n" + " c->m_worldPosB[3] = contactPoints[contactIdx.w];\n" + " case 3:\n" + " c->m_worldPosB[2] = contactPoints[contactIdx.z];\n" + " case 2:\n" + " c->m_worldPosB[1] = contactPoints[contactIdx.y];\n" + " case 1:\n" + " c->m_worldPosB[0] = contactPoints[contactIdx.x];\n" + " default:\n" + " {\n" + " }\n" + " };\n" + " \n" + " GET_NPOINTS(*c) = numReducedPoints;\n" + " }//if (dstIdx < numPairs)\n" + " } \n" + " return resultIndex;\n" + "}\n" + "void computeContactPlaneSphere(int pairIndex,\n" + " int bodyIndexA, int bodyIndexB, \n" + " int collidableIndexA, int collidableIndexB, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global const btGpuFace* faces,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int maxContactCapacity)\n" + "{\n" + " float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n" + " 
float radius = collidables[collidableIndexB].m_radius;\n" + " float4 posA1 = rigidBodies[bodyIndexA].m_pos;\n" + " float4 ornA1 = rigidBodies[bodyIndexA].m_quat;\n" + " float4 posB1 = rigidBodies[bodyIndexB].m_pos;\n" + " float4 ornB1 = rigidBodies[bodyIndexB].m_quat;\n" + " \n" + " bool hasCollision = false;\n" + " float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n" + " float planeConstant = planeEq.w;\n" + " float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n" + " {\n" + " float4 invPosA;Quaternion invOrnA;\n" + " trInverse(posA1,ornA1,&invPosA,&invOrnA);\n" + " trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" + " }\n" + " float4 planeInConvexPos1; Quaternion planeInConvexOrn1;\n" + " {\n" + " float4 invPosB;Quaternion invOrnB;\n" + " trInverse(posB1,ornB1,&invPosB,&invOrnB);\n" + " trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1); \n" + " }\n" + " float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius;\n" + " float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n" + " float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant;\n" + " hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();\n" + " if (hasCollision)\n" + " {\n" + " float4 vtxInPlaneProjected1 = vtxInPlane1 - distance*planeNormal1;\n" + " float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1);\n" + " float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);\n" + " float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;\n" + " pOnB1.w = distance;\n" + " int dstIdx;\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " \n" + " if (dstIdx < maxContactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" + " c->m_worldNormalOnB = -normalOnSurfaceB1;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " 
c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" + " c->m_worldPosB[0] = pOnB1;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " GET_NPOINTS(*c) = 1;\n" + " }//if (dstIdx < numPairs)\n" + " }//if (hasCollision)\n" + "}\n" + "__kernel void primitiveContactsKernel( __global int4* pairs, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int numPairs, int maxContactCapacity)\n" + "{\n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " float4 worldVertsB1[64];\n" + " float4 worldVertsB2[64];\n" + " int capacityWorldVerts = 64; \n" + " float4 localContactsOut[64];\n" + " int localContactCapacity=64;\n" + " \n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " if (i<numPairs)\n" + " {\n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n" + " {\n" + " float4 posB;\n" + " posB = rigidBodies[bodyIndexB].m_pos;\n" + " Quaternion ornB;\n" + " ornB = rigidBodies[bodyIndexB].m_quat;\n" + " int contactIndex = computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,\n" + " faces, globalContactsOut, 
nGlobalContactsOut,maxContactCapacity, posB,ornB);\n" + " if (contactIndex>=0)\n" + " pairs[pairIndex].z = contactIndex;\n" + " return;\n" + " }\n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n" + " {\n" + " float4 posA;\n" + " posA = rigidBodies[bodyIndexA].m_pos;\n" + " Quaternion ornA;\n" + " ornA = rigidBodies[bodyIndexA].m_quat;\n" + " int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,\n" + " faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n" + " if (contactIndex>=0)\n" + " pairs[pairIndex].z = contactIndex;\n" + " return;\n" + " }\n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" + " {\n" + " computeContactPlaneSphere(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" + " rigidBodies,collidables,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n" + " return;\n" + " }\n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n" + " {\n" + " computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" + " rigidBodies,collidables,\n" + " faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n" + " return;\n" + " }\n" + " \n" + " \n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n" + " {\n" + " \n" + " float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n" + " float sphereRadius = collidables[collidableIndexA].m_radius;\n" + " float4 convexPos = rigidBodies[bodyIndexB].m_pos;\n" + " float4 convexOrn = rigidBodies[bodyIndexB].m_quat;\n" + " 
computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" + " spherePos,sphereRadius,convexPos,convexOrn);\n" + " return;\n" + " }\n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" + " {\n" + " \n" + " float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" + " float sphereRadius = collidables[collidableIndexB].m_radius;\n" + " float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n" + " float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n" + " computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" + " spherePos,sphereRadius,convexPos,convexOrn);\n" + " return;\n" + " }\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" + " {\n" + " //sphere-sphere\n" + " float radiusA = collidables[collidableIndexA].m_radius;\n" + " float radiusB = collidables[collidableIndexB].m_radius;\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " float4 diff = posA-posB;\n" + " float len = length(diff);\n" + " \n" + " ///iff distance positive, don't generate a new contact\n" + " if ( len <= (radiusA+radiusB))\n" + " {\n" + " ///distance (negative means penetration)\n" + " float dist = len - (radiusA+radiusB);\n" + " float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" + " if (len > 0.00001)\n" + " {\n" + " normalOnSurfaceB = diff / len;\n" + " }\n" + " float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n" + " contactPosB.w = dist;\n" + " \n" + " int dstIdx;\n" + 
" AppendInc( nGlobalContactsOut, dstIdx );\n" + " \n" + " if (dstIdx < maxContactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" + " c->m_worldNormalOnB = normalOnSurfaceB;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " int bodyA = pairs[pairIndex].x;\n" + " int bodyB = pairs[pairIndex].y;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" + " c->m_worldPosB[0] = contactPosB;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " GET_NPOINTS(*c) = 1;\n" + " }//if (dstIdx < numPairs)\n" + " }//if ( len <= (radiusA+radiusB))\n" + " return;\n" + " }//SHAPE_SPHERE SHAPE_SPHERE\n" + " }// if (i<numPairs)\n" + "}\n" + "// work-in-progress\n" + "__kernel void processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global btAabbCL* aabbs,\n" + " __global const btGpuChildShape* gpuChildShapes,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int numCompoundPairs, int maxContactCapacity\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i<numCompoundPairs)\n" + " {\n" + " int bodyIndexA = gpuCompoundPairs[i].x;\n" + " int bodyIndexB = gpuCompoundPairs[i].y;\n" + " int childShapeIndexA = gpuCompoundPairs[i].z;\n" + " int childShapeIndexB = gpuCompoundPairs[i].w;\n" + " \n" + " int collidableIndexA = -1;\n" + " int collidableIndexB = -1;\n" + " \n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 posA = 
rigidBodies[bodyIndexA].m_pos;\n" + " \n" + " float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " \n" + " if (childShapeIndexA >= 0)\n" + " {\n" + " collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" + " float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" + " float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" + " float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" + " float4 newOrnA = qtMul(ornA,childOrnA);\n" + " posA = newPosA;\n" + " ornA = newOrnA;\n" + " } else\n" + " {\n" + " collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " }\n" + " \n" + " if (childShapeIndexB>=0)\n" + " {\n" + " collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " posB = newPosB;\n" + " ornB = newOrnB;\n" + " } else\n" + " {\n" + " collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" + " }\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n" + " int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" + " int pairIndex = i;\n" + " if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL))\n" + " {\n" + " computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB, collidableIndexA,collidableIndexB, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,\n" + " faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);\n" + " return;\n" + " }\n" + " if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE))\n" + " {\n" + " computeContactPlaneConvex( pairIndex, 
bodyIndexB,bodyIndexA, collidableIndexB,collidableIndexA, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,\n" + " faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n" + " return;\n" + " }\n" + " if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE))\n" + " {\n" + " float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" + " float sphereRadius = collidables[collidableIndexB].m_radius;\n" + " float4 convexPos = posA;\n" + " float4 convexOrn = ornA;\n" + " \n" + " computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA , collidableIndexB,collidableIndexA, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" + " spherePos,sphereRadius,convexPos,convexOrn);\n" + " \n" + " return;\n" + " }\n" + " if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL))\n" + " {\n" + " float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n" + " float sphereRadius = collidables[collidableIndexA].m_radius;\n" + " float4 convexPos = posB;\n" + " float4 convexOrn = ornB;\n" + " \n" + " computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n" + " rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" + " spherePos,sphereRadius,convexPos,convexOrn);\n" + " \n" + " return;\n" + " }\n" + " }// if (i<numCompoundPairs)\n" + "}\n" + "bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p )\n" + "{\n" + " const float4* p1 = &vertices[0];\n" + " const float4* p2 = &vertices[1];\n" + " const float4* p3 = &vertices[2];\n" + " float4 edge1; edge1 = (*p2 - *p1);\n" + " float4 edge2; edge2 = ( *p3 - *p2 );\n" + " float4 edge3; edge3 = ( *p1 - *p3 );\n" + " \n" + " float4 p1_to_p; p1_to_p = ( *p - *p1 );\n" + " float4 p2_to_p; p2_to_p = ( *p - *p2 );\n" + " float4 p3_to_p; p3_to_p = ( *p - *p3 );\n" + " float4 edge1_normal; 
edge1_normal = ( cross(edge1,*normal));\n" + " float4 edge2_normal; edge2_normal = ( cross(edge2,*normal));\n" + " float4 edge3_normal; edge3_normal = ( cross(edge3,*normal));\n" + " \n" + " \n" + " float r1, r2, r3;\n" + " r1 = dot(edge1_normal,p1_to_p );\n" + " r2 = dot(edge2_normal,p2_to_p );\n" + " r3 = dot(edge3_normal,p3_to_p );\n" + " \n" + " if ( r1 > 0 && r2 > 0 && r3 > 0 )\n" + " return true;\n" + " if ( r1 <= 0 && r2 <= 0 && r3 <= 0 ) \n" + " return true;\n" + " return false;\n" + "}\n" + "float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) \n" + "{\n" + " float4 diff = p - from;\n" + " float4 v = to - from;\n" + " float t = dot(v,diff);\n" + " \n" + " if (t > 0) \n" + " {\n" + " float dotVV = dot(v,v);\n" + " if (t < dotVV) \n" + " {\n" + " t /= dotVV;\n" + " diff -= t*v;\n" + " } else \n" + " {\n" + " t = 1;\n" + " diff -= v;\n" + " }\n" + " } else\n" + " {\n" + " t = 0;\n" + " }\n" + " *nearest = from + t*v;\n" + " return dot(diff,diff); \n" + "}\n" + "void computeContactSphereTriangle(int pairIndex,\n" + " int bodyIndexA, int bodyIndexB,\n" + " int collidableIndexA, int collidableIndexB, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " const float4* triangleVertices,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int maxContactCapacity,\n" + " float4 spherePos2,\n" + " float radius,\n" + " float4 pos,\n" + " float4 quat,\n" + " int faceIndex\n" + " )\n" + "{\n" + " float4 invPos;\n" + " float4 invOrn;\n" + " trInverse(pos,quat, &invPos,&invOrn);\n" + " float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n" + " int numFaces = 3;\n" + " float4 closestPnt = (float4)(0, 0, 0, 0);\n" + " float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n" + " float minDist = -1000000.f;\n" + " bool bCollide = false;\n" + " \n" + " //////////////////////////////////////\n" + " float4 sphereCenter;\n" + " sphereCenter = 
spherePos;\n" + " const float4* vertices = triangleVertices;\n" + " float contactBreakingThreshold = 0.f;//todo?\n" + " float radiusWithThreshold = radius + contactBreakingThreshold;\n" + " float4 edge10;\n" + " edge10 = vertices[1]-vertices[0];\n" + " edge10.w = 0.f;//is this needed?\n" + " float4 edge20;\n" + " edge20 = vertices[2]-vertices[0];\n" + " edge20.w = 0.f;//is this needed?\n" + " float4 normal = cross3(edge10,edge20);\n" + " normal = normalize(normal);\n" + " float4 p1ToCenter;\n" + " p1ToCenter = sphereCenter - vertices[0];\n" + " \n" + " float distanceFromPlane = dot(p1ToCenter,normal);\n" + " if (distanceFromPlane < 0.f)\n" + " {\n" + " //triangle facing the other way\n" + " distanceFromPlane *= -1.f;\n" + " normal *= -1.f;\n" + " }\n" + " hitNormalWorld = normal;\n" + " bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold;\n" + " \n" + " // Check for contact / intersection\n" + " bool hasContact = false;\n" + " float4 contactPoint;\n" + " if (isInsideContactPlane) \n" + " {\n" + " \n" + " if (pointInTriangle(vertices,&normal, &sphereCenter)) \n" + " {\n" + " // Inside the contact wedge - touches a point on the shell plane\n" + " hasContact = true;\n" + " contactPoint = sphereCenter - normal*distanceFromPlane;\n" + " \n" + " } else {\n" + " // Could be inside one of the contact capsules\n" + " float contactCapsuleRadiusSqr = radiusWithThreshold*radiusWithThreshold;\n" + " float4 nearestOnEdge;\n" + " int numEdges = 3;\n" + " for (int i = 0; i < numEdges; i++) \n" + " {\n" + " float4 pa =vertices[i];\n" + " float4 pb = vertices[(i+1)%3];\n" + " float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge);\n" + " if (distanceSqr < contactCapsuleRadiusSqr) \n" + " {\n" + " // Yep, we're inside a capsule\n" + " hasContact = true;\n" + " contactPoint = nearestOnEdge;\n" + " \n" + " }\n" + " \n" + " }\n" + " }\n" + " }\n" + " if (hasContact) \n" + " {\n" + " closestPnt = contactPoint;\n" + " float4 contactToCenter = 
sphereCenter - contactPoint;\n" + " minDist = length(contactToCenter);\n" + " if (minDist>FLT_EPSILON)\n" + " {\n" + " hitNormalWorld = normalize(contactToCenter);//*(1./minDist);\n" + " bCollide = true;\n" + " }\n" + " \n" + " }\n" + " /////////////////////////////////////\n" + " if (bCollide && minDist > -10000)\n" + " {\n" + " \n" + " float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n" + " float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n" + " float actualDepth = minDist-radius;\n" + " \n" + " if (actualDepth<=0.f)\n" + " {\n" + " pOnB1.w = actualDepth;\n" + " int dstIdx;\n" + " \n" + " float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);\n" + " if (lenSqr>FLT_EPSILON)\n" + " {\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " \n" + " if (dstIdx < maxContactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" + " c->m_worldNormalOnB = -normalOnSurfaceB1;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n" + " c->m_worldPosB[0] = pOnB1;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = faceIndex;\n" + " GET_NPOINTS(*c) = 1;\n" + " } \n" + " }\n" + " }\n" + " }//if (hasCollision)\n" + "}\n" + "// work-in-progress\n" + "__kernel void findConcaveSphereContactsKernel( __global int4* concavePairs,\n" + " __global const BodyData* rigidBodies,\n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global btAabbCL* aabbs,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" 
+ " int numConcavePairs, int maxContactCapacity\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numConcavePairs)\n" + " return;\n" + " int pairIdx = i;\n" + " int bodyIndexA = concavePairs[i].x;\n" + " int bodyIndexB = concavePairs[i].y;\n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n" + " {\n" + " int f = concavePairs[i].z;\n" + " btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" + " \n" + " float4 verticesA[3];\n" + " for (int i=0;i<3;i++)\n" + " {\n" + " int index = indices[face.m_indexOffset+i];\n" + " float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" + " verticesA[i] = vert;\n" + " }\n" + " float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n" + " float sphereRadius = collidables[collidableIndexB].m_radius;\n" + " float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n" + " float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n" + " computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n" + " rigidBodies,collidables,\n" + " verticesA,\n" + " globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n" + " spherePos,sphereRadius,convexPos,convexOrn, f);\n" + " return;\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h index f0ecfc7851..907809d8bd 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h @@ -1,2099 +1,2098 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the 
build folder of this project -static const char* satClipKernelsCL= \ -"#define TRIANGLE_NUM_CONVEX_FACES 5\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile __global int*\n" -"#endif\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define max2 max\n" -"#define min2 min\n" -"typedef unsigned int u32;\n" -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" 
-"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int 
b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return (int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n" -"#define B3_CONVEX_POLYHEDRON_DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#define B3_QUAT_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Quat;\n" -" #define b3QuatConstArg const b3Quat\n" -" \n" -" \n" -"inline float4 b3FastNormalize4(float4 v)\n" -"{\n" -" v = (float4)(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -" \n" -"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" -"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" -"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" -"{\n" -" b3Quat ans;\n" -" ans = b3Cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" -"{\n" -" b3Quat q;\n" -" q=in;\n" -" //return b3FastNormalize4(in);\n" -" float len = native_sqrt(dot(q, q));\n" -" if(len > 0.f)\n" -" {\n" -" q *= 1.f / len;\n" -" }\n" -" else\n" -" {\n" -" q.x = q.y = q.z = 0.f;\n" -" q.w = 1.f;\n" -" }\n" -" return q;\n" -"}\n" -"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" b3Quat qInv = b3QuatInvert( q );\n" -" float4 vcpy = 
vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" return b3QuatRotate( b3QuatInvert( q ), vec );\n" -"}\n" -"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" -"{\n" -" return b3QuatRotate( orientation, point ) + (translation);\n" -"}\n" -" \n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"typedef struct b3GpuFace b3GpuFace_t;\n" -"struct b3GpuFace\n" -"{\n" -" b3Float4 m_plane;\n" -" int m_indexOffset;\n" -" int m_numIndices;\n" -" int m_unusedPadding1;\n" -" int m_unusedPadding2;\n" -"};\n" -"typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n" -"struct b3ConvexPolyhedronData\n" -"{\n" -" b3Float4 m_localCenter;\n" -" b3Float4 m_extents;\n" -" b3Float4 mC;\n" -" b3Float4 mE;\n" -" float m_radius;\n" -" int m_faceOffset;\n" -" int m_numFaces;\n" -" int m_numVertices;\n" -" int m_vertexOffset;\n" -" int m_uniqueEdgesOffset;\n" -" int m_numUniqueEdges;\n" -" int m_unused;\n" -"};\n" -"#endif //B3_CONVEX_POLYHEDRON_DATA_H\n" -"#ifndef B3_COLLIDABLE_H\n" -"#define B3_COLLIDABLE_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"enum b3ShapeTypes\n" -"{\n" -" SHAPE_HEIGHT_FIELD=1,\n" -" SHAPE_CONVEX_HULL=3,\n" -" SHAPE_PLANE=4,\n" -" SHAPE_CONCAVE_TRIMESH=5,\n" -" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" -" SHAPE_SPHERE=7,\n" -" MAX_NUM_SHAPE_TYPES,\n" -"};\n" -"typedef struct b3Collidable b3Collidable_t;\n" -"struct b3Collidable\n" -"{\n" -" union {\n" -" int m_numChildShapes;\n" -" int m_bvhIndex;\n" -" };\n" -" 
union\n" -" {\n" -" float m_radius;\n" -" int m_compoundBvhIndex;\n" -" };\n" -" int m_shapeType;\n" -" int m_shapeIndex;\n" -"};\n" -"typedef struct b3GpuChildShape b3GpuChildShape_t;\n" -"struct b3GpuChildShape\n" -"{\n" -" b3Float4 m_childPosition;\n" -" b3Quat m_childOrientation;\n" -" int m_shapeIndex;\n" -" int m_unused0;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"struct b3CompoundOverlappingPair\n" -"{\n" -" int m_bodyIndexA;\n" -" int m_bodyIndexB;\n" -"// int m_pairType;\n" -" int m_childShapeIndexA;\n" -" int m_childShapeIndexB;\n" -"};\n" -"#endif //B3_COLLIDABLE_H\n" -"#ifndef B3_RIGIDBODY_DATA_H\n" -"#define B3_RIGIDBODY_DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifndef B3_MAT3x3_H\n" -"#define B3_MAT3x3_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"typedef struct\n" -"{\n" -" b3Float4 m_row[3];\n" -"}b3Mat3x3;\n" -"#define b3Mat3x3ConstArg const b3Mat3x3\n" -"#define b3GetRow(m,row) (m.m_row[row])\n" -"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" -"{\n" -" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" -" b3Mat3x3 out;\n" -" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" -" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" -" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" -" out.m_row[0].w = 0.f;\n" -" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" -" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" -" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" -" out.m_row[1].w = 0.f;\n" -" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" -" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" -" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" -" out.m_row[2].w = 0.f;\n" -" return out;\n" -"}\n" -"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" -"{\n" 
-" b3Mat3x3 out;\n" -" out.m_row[0] = fabs(matIn.m_row[0]);\n" -" out.m_row[1] = fabs(matIn.m_row[1]);\n" -" out.m_row[2] = fabs(matIn.m_row[2]);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtZero();\n" -"__inline\n" -"b3Mat3x3 mtIdentity();\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Mat3x3 mtZero()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(0.f);\n" -" m.m_row[1] = (b3Float4)(0.f);\n" -" m.m_row[2] = (b3Float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtIdentity()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(1,0,0,0);\n" -" m.m_row[1] = (b3Float4)(0,1,0,0);\n" -" m.m_row[2] = (b3Float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" -"{\n" -" b3Mat3x3 transB;\n" -" transB = mtTranspose( b );\n" -" b3Mat3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" -"{\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a.m_row[0], b );\n" -" ans.y = b3Dot3F4( a.m_row[1], b );\n" -" ans.z = 
b3Dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" -"{\n" -" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a, colx );\n" -" ans.y = b3Dot3F4( a, coly );\n" -" ans.z = b3Dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" -"struct b3RigidBodyData\n" -"{\n" -" b3Float4 m_pos;\n" -" b3Quat m_quat;\n" -" b3Float4 m_linVel;\n" -" b3Float4 m_angVel;\n" -" int m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"};\n" -"typedef struct b3InertiaData b3InertiaData_t;\n" -"struct b3InertiaData\n" -"{\n" -" b3Mat3x3 m_invInertiaWorld;\n" -" b3Mat3x3 m_initInvInertia;\n" -"};\n" -"#endif //B3_RIGIDBODY_DATA_H\n" -" \n" -"#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_float4 (float4)\n" -"#define make_float2 (float2)\n" -"#define make_uint4 (uint4)\n" -"#define make_int4 (int4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"__inline\n" -"float fastDiv(float numerator, float denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"// return numerator/denominator; \n" -"}\n" -"__inline\n" -"float4 fastDiv4(float4 numerator, float4 denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"}\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"//#define dot3F4 dot\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" 
-"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" return fast_normalize(v);\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec);\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in)\n" -"{\n" -" return fastNormalize4(in);\n" -"// in /= length( in );\n" -"// return in;\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -"float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"__inline\n" -"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" -"{\n" -" return qtRotate( *orientation, *p ) + (*translation);\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"}\n" -"__inline float4 lerp3(const float4 a,const float4 b, float t)\n" -"{\n" -" return make_float4( a.x + (b.x - a.x) * t,\n" -" a.y + (b.y - a.y) * t,\n" -" a.z + (b.z - a.z) * t,\n" -" 0.f);\n" -"}\n" -"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n" -"int 
clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)\n" -"{\n" -" \n" -" int ve;\n" -" float ds, de;\n" -" int numVertsOut = 0;\n" -" //double-check next test\n" -" if (numVertsIn < 2)\n" -" return 0;\n" -" \n" -" float4 firstVertex=pVtxIn[numVertsIn-1];\n" -" float4 endVertex = pVtxIn[0];\n" -" \n" -" ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n" -" \n" -" for (ve = 0; ve < numVertsIn; ve++)\n" -" {\n" -" endVertex=pVtxIn[ve];\n" -" de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n" -" if (ds<0)\n" -" {\n" -" if (de<0)\n" -" {\n" -" // Start < 0, end < 0, so output endVertex\n" -" ppVtxOut[numVertsOut++] = endVertex;\n" -" }\n" -" else\n" -" {\n" -" // Start < 0, end >= 0, so output intersection\n" -" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" -" }\n" -" }\n" -" else\n" -" {\n" -" if (de<0)\n" -" {\n" -" // Start >= 0, end < 0 so output intersection and end\n" -" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" -" ppVtxOut[numVertsOut++] = endVertex;\n" -" }\n" -" }\n" -" firstVertex = endVertex;\n" -" ds = de;\n" -" }\n" -" return numVertsOut;\n" -"}\n" -"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n" -"int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)\n" -"{\n" -" \n" -" int ve;\n" -" float ds, de;\n" -" int numVertsOut = 0;\n" -"//double-check next test\n" -" if (numVertsIn < 2)\n" -" return 0;\n" -" float4 firstVertex=pVtxIn[numVertsIn-1];\n" -" float4 endVertex = pVtxIn[0];\n" -" \n" -" ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n" -" for (ve = 0; ve < numVertsIn; ve++)\n" -" {\n" -" endVertex=pVtxIn[ve];\n" -" de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n" -" if (ds<0)\n" -" {\n" -" if (de<0)\n" -" {\n" -" // Start < 0, end < 0, so output endVertex\n" -" ppVtxOut[numVertsOut++] = 
endVertex;\n" -" }\n" -" else\n" -" {\n" -" // Start < 0, end >= 0, so output intersection\n" -" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" -" }\n" -" }\n" -" else\n" -" {\n" -" if (de<0)\n" -" {\n" -" // Start >= 0, end < 0 so output intersection and end\n" -" ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" -" ppVtxOut[numVertsOut++] = endVertex;\n" -" }\n" -" }\n" -" firstVertex = endVertex;\n" -" ds = de;\n" -" }\n" -" return numVertsOut;\n" -"}\n" -"int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA, \n" -" const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n" -" float4* worldVertsB2, int capacityWorldVertsB2,\n" -" const float minDist, float maxDist,\n" -" __global const float4* vertices,\n" -" __global const b3GpuFace_t* faces,\n" -" __global const int* indices,\n" -" float4* contactsOut,\n" -" int contactCapacity)\n" -"{\n" -" int numContactsOut = 0;\n" -" float4* pVtxIn = worldVertsB1;\n" -" float4* pVtxOut = worldVertsB2;\n" -" \n" -" int numVertsIn = numWorldVertsB1;\n" -" int numVertsOut = 0;\n" -" int closestFaceA=-1;\n" -" {\n" -" float dmin = FLT_MAX;\n" -" for(int face=0;face<hullA->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(\n" -" faces[hullA->m_faceOffset+face].m_plane.x, \n" -" faces[hullA->m_faceOffset+face].m_plane.y, \n" -" faces[hullA->m_faceOffset+face].m_plane.z,0.f);\n" -" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" -" \n" -" float d = dot3F4(faceANormalWS,separatingNormal);\n" -" if (d < dmin)\n" -" {\n" -" dmin = d;\n" -" closestFaceA = face;\n" -" }\n" -" }\n" -" }\n" -" if (closestFaceA<0)\n" -" return numContactsOut;\n" -" b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];\n" -" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" -" int numVerticesA = polyA.m_numIndices;\n" -" for(int 
e0=0;e0<numVerticesA;e0++)\n" -" {\n" -" const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];\n" -" const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n" -" const float4 edge0 = a - b;\n" -" const float4 WorldEdge0 = qtRotate(ornA,edge0);\n" -" float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" -" float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n" -" float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" -" float4 worldA1 = transform(&a,&posA,&ornA);\n" -" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" -" \n" -" float4 planeNormalWS = planeNormalWS1;\n" -" float planeEqWS=planeEqWS1;\n" -" \n" -" //clip face\n" -" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" -" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" -" //btSwap(pVtxIn,pVtxOut);\n" -" float4* tmp = pVtxOut;\n" -" pVtxOut = pVtxIn;\n" -" pVtxIn = tmp;\n" -" numVertsIn = numVertsOut;\n" -" numVertsOut = 0;\n" -" }\n" -" \n" -" // only keep points that are behind the witness face\n" -" {\n" -" float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" -" float localPlaneEq = polyA.m_plane.w;\n" -" float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n" -" float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n" -" for (int i=0;i<numVertsIn;i++)\n" -" {\n" -" float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" -" if (depth <=minDist)\n" -" {\n" -" depth = minDist;\n" -" }\n" -" if (depth <=maxDist)\n" -" {\n" -" float4 pointInWorld = pVtxIn[i];\n" -" //resultOut.addContactPoint(separatingNormal,point,depth);\n" -" contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" -" }\n" -" }\n" -" }\n" -" return numContactsOut;\n" -"}\n" -"int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA, 
\n" -" const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n" -" float4* worldVertsB2, int capacityWorldVertsB2,\n" -" const float minDist, float maxDist,\n" -" const float4* verticesA,\n" -" const b3GpuFace_t* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB,\n" -" __global const b3GpuFace_t* facesB,\n" -" __global const int* indicesB,\n" -" float4* contactsOut,\n" -" int contactCapacity)\n" -"{\n" -" int numContactsOut = 0;\n" -" float4* pVtxIn = worldVertsB1;\n" -" float4* pVtxOut = worldVertsB2;\n" -" \n" -" int numVertsIn = numWorldVertsB1;\n" -" int numVertsOut = 0;\n" -" int closestFaceA=-1;\n" -" {\n" -" float dmin = FLT_MAX;\n" -" for(int face=0;face<hullA->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(\n" -" facesA[hullA->m_faceOffset+face].m_plane.x, \n" -" facesA[hullA->m_faceOffset+face].m_plane.y, \n" -" facesA[hullA->m_faceOffset+face].m_plane.z,0.f);\n" -" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" -" \n" -" float d = dot3F4(faceANormalWS,separatingNormal);\n" -" if (d < dmin)\n" -" {\n" -" dmin = d;\n" -" closestFaceA = face;\n" -" }\n" -" }\n" -" }\n" -" if (closestFaceA<0)\n" -" return numContactsOut;\n" -" b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA];\n" -" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" -" int numVerticesA = polyA.m_numIndices;\n" -" for(int e0=0;e0<numVerticesA;e0++)\n" -" {\n" -" const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];\n" -" const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n" -" const float4 edge0 = a - b;\n" -" const float4 WorldEdge0 = qtRotate(ornA,edge0);\n" -" float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" -" float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n" -" float4 planeNormalWS1 = 
-cross3(WorldEdge0,worldPlaneAnormal1);\n" -" float4 worldA1 = transform(&a,&posA,&ornA);\n" -" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" -" \n" -" float4 planeNormalWS = planeNormalWS1;\n" -" float planeEqWS=planeEqWS1;\n" -" \n" -" //clip face\n" -" //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" -" numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" -" //btSwap(pVtxIn,pVtxOut);\n" -" float4* tmp = pVtxOut;\n" -" pVtxOut = pVtxIn;\n" -" pVtxIn = tmp;\n" -" numVertsIn = numVertsOut;\n" -" numVertsOut = 0;\n" -" }\n" -" \n" -" // only keep points that are behind the witness face\n" -" {\n" -" float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" -" float localPlaneEq = polyA.m_plane.w;\n" -" float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n" -" float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n" -" for (int i=0;i<numVertsIn;i++)\n" -" {\n" -" float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" -" if (depth <=minDist)\n" -" {\n" -" depth = minDist;\n" -" }\n" -" if (depth <=maxDist)\n" -" {\n" -" float4 pointInWorld = pVtxIn[i];\n" -" //resultOut.addContactPoint(separatingNormal,point,depth);\n" -" contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" -" }\n" -" }\n" -" }\n" -" return numContactsOut;\n" -"}\n" -"int clipHullAgainstHull(const float4 separatingNormal,\n" -" __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" -" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n" -" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n" -" const float minDist, float maxDist,\n" -" __global const float4* vertices,\n" -" __global const b3GpuFace_t* faces,\n" -" __global const int* indices,\n" -" float4* localContactsOut,\n" -" int localContactCapacity)\n" -"{\n" -" int numContactsOut = 0;\n" -" int 
numWorldVertsB1= 0;\n" -" int closestFaceB=-1;\n" -" float dmax = -FLT_MAX;\n" -" {\n" -" for(int face=0;face<hullB->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, \n" -" faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n" -" const float4 WorldNormal = qtRotate(ornB, Normal);\n" -" float d = dot3F4(WorldNormal,separatingNormal);\n" -" if (d > dmax)\n" -" {\n" -" dmax = d;\n" -" closestFaceB = face;\n" -" }\n" -" }\n" -" }\n" -" {\n" -" const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" -" const int numVertices = polyB.m_numIndices;\n" -" for(int e0=0;e0<numVertices;e0++)\n" -" {\n" -" const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n" -" worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" -" }\n" -" }\n" -" if (closestFaceB>=0)\n" -" {\n" -" numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, \n" -" posA,ornA,\n" -" worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,\n" -" faces,\n" -" indices,localContactsOut,localContactCapacity);\n" -" }\n" -" return numContactsOut;\n" -"}\n" -"int clipHullAgainstHullLocalA(const float4 separatingNormal,\n" -" const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" -" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n" -" float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n" -" const float minDist, float maxDist,\n" -" const float4* verticesA,\n" -" const b3GpuFace_t* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB,\n" -" __global const b3GpuFace_t* facesB,\n" -" __global const int* indicesB,\n" -" float4* localContactsOut,\n" -" int localContactCapacity)\n" -"{\n" -" int numContactsOut = 0;\n" -" int numWorldVertsB1= 0;\n" -" int closestFaceB=-1;\n" -" float dmax = -FLT_MAX;\n" -" {\n" -" for(int 
face=0;face<hullB->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, \n" -" facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" -" const float4 WorldNormal = qtRotate(ornB, Normal);\n" -" float d = dot3F4(WorldNormal,separatingNormal);\n" -" if (d > dmax)\n" -" {\n" -" dmax = d;\n" -" closestFaceB = face;\n" -" }\n" -" }\n" -" }\n" -" {\n" -" const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" -" const int numVertices = polyB.m_numIndices;\n" -" for(int e0=0;e0<numVertices;e0++)\n" -" {\n" -" const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" -" worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" -" }\n" -" }\n" -" if (closestFaceB>=0)\n" -" {\n" -" numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, \n" -" posA,ornA,\n" -" worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,\n" -" verticesA,facesA,indicesA,\n" -" verticesB,facesB,indicesB,\n" -" localContactsOut,localContactCapacity);\n" -" }\n" -" return numContactsOut;\n" -"}\n" -"#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];\n" -"#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n" -"#define REDUCE_MAX(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }\n" -"#define REDUCE_MIN(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? 
v[i]: v[i+offset]; }\n" -"int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" -"{\n" -" if( nPoints == 0 )\n" -" return 0;\n" -" \n" -" if (nPoints <=4)\n" -" return nPoints;\n" -" \n" -" \n" -" if (nPoints >64)\n" -" nPoints = 64;\n" -" \n" -" float4 center = make_float4(0.f);\n" -" {\n" -" \n" -" for (int i=0;i<nPoints;i++)\n" -" center += p[i];\n" -" center /= (float)nPoints;\n" -" }\n" -" \n" -" \n" -" \n" -" // sample 4 directions\n" -" \n" -" float4 aVector = p[0] - center;\n" -" float4 u = cross3( nearNormal, aVector );\n" -" float4 v = cross3( nearNormal, u );\n" -" u = normalize3( u );\n" -" v = normalize3( v );\n" -" \n" -" \n" -" //keep point with deepest penetration\n" -" float minW= FLT_MAX;\n" -" \n" -" int minIndex=-1;\n" -" \n" -" float4 maxDots;\n" -" maxDots.x = FLT_MIN;\n" -" maxDots.y = FLT_MIN;\n" -" maxDots.z = FLT_MIN;\n" -" maxDots.w = FLT_MIN;\n" -" \n" -" // idx, distance\n" -" for(int ie = 0; ie<nPoints; ie++ )\n" -" {\n" -" if (p[ie].w<minW)\n" -" {\n" -" minW = p[ie].w;\n" -" minIndex=ie;\n" -" }\n" -" float f;\n" -" float4 r = p[ie]-center;\n" -" f = dot3F4( u, r );\n" -" if (f<maxDots.x)\n" -" {\n" -" maxDots.x = f;\n" -" contactIdx[0].x = ie;\n" -" }\n" -" \n" -" f = dot3F4( -u, r );\n" -" if (f<maxDots.y)\n" -" {\n" -" maxDots.y = f;\n" -" contactIdx[0].y = ie;\n" -" }\n" -" \n" -" \n" -" f = dot3F4( v, r );\n" -" if (f<maxDots.z)\n" -" {\n" -" maxDots.z = f;\n" -" contactIdx[0].z = ie;\n" -" }\n" -" \n" -" f = dot3F4( -v, r );\n" -" if (f<maxDots.w)\n" -" {\n" -" maxDots.w = f;\n" -" contactIdx[0].w = ie;\n" -" }\n" -" \n" -" }\n" -" \n" -" if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" -" {\n" -" //replace the first contact with minimum (todo: replace contact with least penetration)\n" -" contactIdx[0].x = minIndex;\n" -" }\n" -" \n" -" return 4;\n" -" \n" -"}\n" -"int 
extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" -"{\n" -" contactIdx[0] = 0;\n" -" contactIdx[1] = 1;\n" -" contactIdx[2] = 2;\n" -" contactIdx[3] = 3;\n" -" \n" -" if( nPoints == 0 ) return 0;\n" -" \n" -" nPoints = min2( nPoints, 4 );\n" -" return nPoints;\n" -" \n" -"}\n" -"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" -"{\n" -" if( nPoints == 0 ) return 0;\n" -" nPoints = min2( nPoints, 64 );\n" -" float4 center = make_float4(0.f);\n" -" {\n" -" float4 v[64];\n" -" for (int i=0;i<nPoints;i++)\n" -" v[i] = p[i];\n" -" //memcpy( v, p, nPoints*sizeof(float4) );\n" -" PARALLEL_SUM( v, nPoints );\n" -" center = v[0]/(float)nPoints;\n" -" }\n" -" \n" -" { // sample 4 directions\n" -" if( nPoints < 4 )\n" -" {\n" -" for(int i=0; i<nPoints; i++) \n" -" contactIdx[i] = i;\n" -" return nPoints;\n" -" }\n" -" float4 aVector = p[0] - center;\n" -" float4 u = cross3( nearNormal, aVector );\n" -" float4 v = cross3( nearNormal, u );\n" -" u = normalize3( u );\n" -" v = normalize3( v );\n" -" int idx[4];\n" -" float2 max00 = make_float2(0,FLT_MAX);\n" -" {\n" -" // idx, distance\n" -" {\n" -" {\n" -" int4 a[64];\n" -" for(int ie = 0; ie<nPoints; ie++ )\n" -" {\n" -" \n" -" \n" -" float f;\n" -" float4 r = p[ie]-center;\n" -" f = dot3F4( u, r );\n" -" a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" -" f = dot3F4( -u, r );\n" -" a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" -" f = dot3F4( v, r );\n" -" a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" -" f = dot3F4( -v, r );\n" -" a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" -" }\n" -" for(int ie=0; ie<nPoints; ie++)\n" -" {\n" -" a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;\n" -" a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;\n" -" a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;\n" -" a[0].w = (a[0].w > a[ie].w )? 
a[0].w: a[ie].w;\n" -" }\n" -" idx[0] = (int)a[0].x & 0xff;\n" -" idx[1] = (int)a[0].y & 0xff;\n" -" idx[2] = (int)a[0].z & 0xff;\n" -" idx[3] = (int)a[0].w & 0xff;\n" -" }\n" -" }\n" -" {\n" -" float2 h[64];\n" -" PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints );\n" -" REDUCE_MIN( h, nPoints );\n" -" max00 = h[0];\n" -" }\n" -" }\n" -" contactIdx[0] = idx[0];\n" -" contactIdx[1] = idx[1];\n" -" contactIdx[2] = idx[2];\n" -" contactIdx[3] = idx[3];\n" -" return 4;\n" -" }\n" -"}\n" -"__kernel void extractManifoldAndAddContactKernel(__global const int4* pairs, \n" -" __global const b3RigidBodyData_t* rigidBodies, \n" -" __global const float4* closestPointsWorld,\n" -" __global const float4* separatingNormalsWorld,\n" -" __global const int* contactCounts,\n" -" __global const int* contactOffsets,\n" -" __global struct b3Contact4Data* restrict contactsOut,\n" -" counter32_t nContactsOut,\n" -" int contactCapacity,\n" -" int numPairs,\n" -" int pairIndex\n" -" )\n" -"{\n" -" int idx = get_global_id(0);\n" -" \n" -" if (idx<numPairs)\n" -" {\n" -" float4 normal = separatingNormalsWorld[idx];\n" -" int nPoints = contactCounts[idx];\n" -" __global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];\n" -" float4 localPoints[64];\n" -" for (int i=0;i<nPoints;i++)\n" -" {\n" -" localPoints[i] = pointsIn[i];\n" -" }\n" -" int contactIdx[4];// = {-1,-1,-1,-1};\n" -" contactIdx[0] = -1;\n" -" contactIdx[1] = -1;\n" -" contactIdx[2] = -1;\n" -" contactIdx[3] = -1;\n" -" int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);\n" -" int dstIdx;\n" -" AppendInc( nContactsOut, dstIdx );\n" -" if (dstIdx<contactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = contactsOut + dstIdx;\n" -" c->m_worldNormalOnB = -normal;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = idx;\n" -" int bodyA = pairs[pairIndex].x;\n" -" int bodyB = pairs[pairIndex].y;\n" -" 
c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = -1;\n" -" for (int i=0;i<nContacts;i++)\n" -" {\n" -" c->m_worldPosB[i] = localPoints[contactIdx[i]];\n" -" }\n" -" GET_NPOINTS(*c) = nContacts;\n" -" }\n" -" }\n" -"}\n" -"void trInverse(float4 translationIn, Quaternion orientationIn,\n" -" float4* translationOut, Quaternion* orientationOut)\n" -"{\n" -" *orientationOut = qtInvert(orientationIn);\n" -" *translationOut = qtRotate(*orientationOut, -translationIn);\n" -"}\n" -"void trMul(float4 translationA, Quaternion orientationA,\n" -" float4 translationB, Quaternion orientationB,\n" -" float4* translationOut, Quaternion* orientationOut)\n" -"{\n" -" *orientationOut = qtMul(orientationA,orientationB);\n" -" *translationOut = transform(&translationB,&translationA,&orientationA);\n" -"}\n" -"__kernel void clipHullHullKernel( __global int4* pairs, \n" -" __global const b3RigidBodyData_t* rigidBodies, \n" -" __global const b3Collidable_t* collidables,\n" -" __global const b3ConvexPolyhedronData_t* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const b3GpuFace_t* faces,\n" -" __global const int* indices,\n" -" __global const float4* separatingNormals,\n" -" __global const int* hasSeparatingAxis,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int numPairs,\n" -" int contactCapacity)\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" float4 worldVertsB1[64];\n" -" float4 worldVertsB2[64];\n" -" int capacityWorldVerts = 64; \n" -" float4 localContactsOut[64];\n" -" int localContactCapacity=64;\n" -" \n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" if (i<numPairs)\n" -" {\n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" \n" 
-" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" if (hasSeparatingAxis[i])\n" -" {\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" \n" -" int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],\n" -" &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" -" rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n" -" rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n" -" worldVertsB1,worldVertsB2,capacityWorldVerts,\n" -" minDist, maxDist,\n" -" vertices,faces,indices,\n" -" localContactsOut,localContactCapacity);\n" -" \n" -" if (numLocalContactsOut>0)\n" -" {\n" -" float4 normal = -separatingNormals[i];\n" -" int nPoints = numLocalContactsOut;\n" -" float4* pointsIn = localContactsOut;\n" -" int contactIdx[4];// = {-1,-1,-1,-1};\n" -" contactIdx[0] = -1;\n" -" contactIdx[1] = -1;\n" -" contactIdx[2] = -1;\n" -" contactIdx[3] = -1;\n" -" \n" -" int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" -" \n" -" \n" -" int mprContactIndex = pairs[pairIndex].z;\n" -" int dstIdx = mprContactIndex;\n" -" if (dstIdx<0)\n" -" {\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" }\n" -" if (dstIdx<contactCapacity)\n" -" {\n" -" pairs[pairIndex].z = dstIdx;\n" -" __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" -" c->m_worldNormalOnB = -normal;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" int bodyA = pairs[pairIndex].x;\n" -" int bodyB = pairs[pairIndex].y;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = -1;\n" -" for (int 
i=0;i<nReducedContacts;i++)\n" -" {\n" -" //this condition means: overwrite contact point, unless at index i==0 we have a valid 'mpr' contact\n" -" if (i>0||(mprContactIndex<0))\n" -" {\n" -" c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" -" }\n" -" }\n" -" GET_NPOINTS(*c) = nReducedContacts;\n" -" }\n" -" \n" -" }// if (numContactsOut>0)\n" -" }// if (hasSeparatingAxis[i])\n" -" }// if (i<numPairs)\n" -"}\n" -"__kernel void clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n" -" __global const b3RigidBodyData_t* rigidBodies, \n" -" __global const b3Collidable_t* collidables,\n" -" __global const b3ConvexPolyhedronData_t* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const b3GpuFace_t* faces,\n" -" __global const int* indices,\n" -" __global const b3GpuChildShape_t* gpuChildShapes,\n" -" __global const float4* gpuCompoundSepNormalsOut,\n" -" __global const int* gpuHasCompoundSepNormalsOut,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int numCompoundPairs, int maxContactCapacity)\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" float4 worldVertsB1[64];\n" -" float4 worldVertsB2[64];\n" -" int capacityWorldVerts = 64; \n" -" float4 localContactsOut[64];\n" -" int localContactCapacity=64;\n" -" \n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" if (i<numCompoundPairs)\n" -" {\n" -" if (gpuHasCompoundSepNormalsOut[i])\n" -" {\n" -" int bodyIndexA = gpuCompoundPairs[i].x;\n" -" int bodyIndexB = gpuCompoundPairs[i].y;\n" -" \n" -" int childShapeIndexA = gpuCompoundPairs[i].z;\n" -" int childShapeIndexB = gpuCompoundPairs[i].w;\n" -" \n" -" int collidableIndexA = -1;\n" -" int collidableIndexB = -1;\n" -" \n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" \n" -" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" -" float4 posB 
= rigidBodies[bodyIndexB].m_pos;\n" -" \n" -" if (childShapeIndexA >= 0)\n" -" {\n" -" collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" -" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" -" float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" -" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" -" float4 newOrnA = qtMul(ornA,childOrnA);\n" -" posA = newPosA;\n" -" ornA = newOrnA;\n" -" } else\n" -" {\n" -" collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" }\n" -" \n" -" if (childShapeIndexB>=0)\n" -" {\n" -" collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" posB = newPosB;\n" -" ornB = newOrnB;\n" -" } else\n" -" {\n" -" collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" -" }\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],\n" -" &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" posB,ornB,\n" -" worldVertsB1,worldVertsB2,capacityWorldVerts,\n" -" minDist, maxDist,\n" -" vertices,faces,indices,\n" -" localContactsOut,localContactCapacity);\n" -" \n" -" if (numLocalContactsOut>0)\n" -" {\n" -" float4 normal = -gpuCompoundSepNormalsOut[i];\n" -" int nPoints = numLocalContactsOut;\n" -" float4* pointsIn = localContactsOut;\n" -" int contactIdx[4];// = {-1,-1,-1,-1};\n" -" contactIdx[0] = -1;\n" -" contactIdx[1] = -1;\n" -" contactIdx[2] = -1;\n" -" contactIdx[3] = -1;\n" -" \n" -" int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" -" \n" -" int 
dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" if ((dstIdx+nReducedContacts) < maxContactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" -" c->m_worldNormalOnB = -normal;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" int bodyA = gpuCompoundPairs[pairIndex].x;\n" -" int bodyB = gpuCompoundPairs[pairIndex].y;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" -" c->m_childIndexA = childShapeIndexA;\n" -" c->m_childIndexB = childShapeIndexB;\n" -" for (int i=0;i<nReducedContacts;i++)\n" -" {\n" -" c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" -" }\n" -" GET_NPOINTS(*c) = nReducedContacts;\n" -" }\n" -" \n" -" }// if (numContactsOut>0)\n" -" }// if (gpuHasCompoundSepNormalsOut[i])\n" -" }// if (i<numCompoundPairs)\n" -"}\n" -"__kernel void sphereSphereCollisionKernel( __global const int4* pairs, \n" -" __global const b3RigidBodyData_t* rigidBodies, \n" -" __global const b3Collidable_t* collidables,\n" -" __global const float4* separatingNormals,\n" -" __global const int* hasSeparatingAxis,\n" -" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int contactCapacity,\n" -" int numPairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" if (i<numPairs)\n" -" {\n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" -" collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" -" {\n" -" //sphere-sphere\n" -" float radiusA = collidables[collidableIndexA].m_radius;\n" -" float radiusB = 
collidables[collidableIndexB].m_radius;\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" float4 diff = posA-posB;\n" -" float len = length(diff);\n" -" \n" -" ///iff distance positive, don't generate a new contact\n" -" if ( len <= (radiusA+radiusB))\n" -" {\n" -" ///distance (negative means penetration)\n" -" float dist = len - (radiusA+radiusB);\n" -" float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" -" if (len > 0.00001)\n" -" {\n" -" normalOnSurfaceB = diff / len;\n" -" }\n" -" float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n" -" contactPosB.w = dist;\n" -" \n" -" int dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" if (dstIdx < contactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" -" c->m_worldNormalOnB = -normalOnSurfaceB;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" int bodyA = pairs[pairIndex].x;\n" -" int bodyB = pairs[pairIndex].y;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" -" c->m_worldPosB[0] = contactPosB;\n" -" c->m_childIndexA = -1;\n" -" c->m_childIndexB = -1;\n" -" GET_NPOINTS(*c) = 1;\n" -" }//if (dstIdx < numPairs)\n" -" }//if ( len <= (radiusA+radiusB))\n" -" }//SHAPE_SPHERE SHAPE_SPHERE\n" -" }//if (i<numPairs)\n" -"} \n" -"__kernel void clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n" -" __global const b3RigidBodyData_t* rigidBodies, \n" -" __global const b3Collidable_t* collidables,\n" -" __global const b3ConvexPolyhedronData_t* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const b3GpuFace_t* faces,\n" -" __global const int* indices,\n" -" __global const b3GpuChildShape_t* gpuChildShapes,\n" -" __global const float4* separatingNormals,\n" 
-" __global struct b3Contact4Data* restrict globalContactsOut,\n" -" counter32_t nGlobalContactsOut,\n" -" int contactCapacity,\n" -" int numConcavePairs)\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" float4 worldVertsB1[64];\n" -" float4 worldVertsB2[64];\n" -" int capacityWorldVerts = 64; \n" -" float4 localContactsOut[64];\n" -" int localContactCapacity=64;\n" -" \n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" if (i<numConcavePairs)\n" -" {\n" -" //negative value means that the pair is invalid\n" -" if (concavePairsIn[i].w<0)\n" -" return;\n" -" int bodyIndexA = concavePairsIn[i].x;\n" -" int bodyIndexB = concavePairsIn[i].y;\n" -" int f = concavePairsIn[i].z;\n" -" int childShapeIndexA = f;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" ///////////////////////////////////////////////////////////////\n" -" \n" -" \n" -" bool overlap = false;\n" -" \n" -" b3ConvexPolyhedronData_t convexPolyhedronA;\n" -" //add 3 vertices of the triangle\n" -" convexPolyhedronA.m_numVertices = 3;\n" -" convexPolyhedronA.m_vertexOffset = 0;\n" -" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" -" b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" -" \n" -" float4 verticesA[3];\n" -" for (int i=0;i<3;i++)\n" -" {\n" -" int index = indices[face.m_indexOffset+i];\n" -" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" -" verticesA[i] = vert;\n" -" localCenter += vert;\n" -" }\n" -" float dmin = FLT_MAX;\n" -" int localCC=0;\n" -" //a triangle has 3 unique edges\n" -" convexPolyhedronA.m_numUniqueEdges = 3;\n" -" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" -" float4 uniqueEdgesA[3];\n" -" \n" -" uniqueEdgesA[0] = 
(verticesA[1]-verticesA[0]);\n" -" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" -" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" -" convexPolyhedronA.m_faceOffset = 0;\n" -" \n" -" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" -" \n" -" b3GpuFace_t facesA[TRIANGLE_NUM_CONVEX_FACES];\n" -" int indicesA[3+3+2+2+2];\n" -" int curUsedIndices=0;\n" -" int fidx=0;\n" -" //front size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[0] = 0;\n" -" indicesA[1] = 1;\n" -" indicesA[2] = 2;\n" -" curUsedIndices+=3;\n" -" float c = face.m_plane.w;\n" -" facesA[fidx].m_plane.x = normal.x;\n" -" facesA[fidx].m_plane.y = normal.y;\n" -" facesA[fidx].m_plane.z = normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" //back size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[3]=2;\n" -" indicesA[4]=1;\n" -" indicesA[5]=0;\n" -" curUsedIndices+=3;\n" -" float c = dot3F4(normal,verticesA[0]);\n" -" float c1 = -face.m_plane.w;\n" -" facesA[fidx].m_plane.x = -normal.x;\n" -" facesA[fidx].m_plane.y = -normal.y;\n" -" facesA[fidx].m_plane.z = -normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" bool addEdgePlanes = true;\n" -" if (addEdgePlanes)\n" -" {\n" -" int numVertices=3;\n" -" int prevVertex = numVertices-1;\n" -" for (int i=0;i<numVertices;i++)\n" -" {\n" -" float4 v0 = verticesA[i];\n" -" float4 v1 = verticesA[prevVertex];\n" -" \n" -" float4 edgeNormal = normalize(cross(normal,v1-v0));\n" -" float c = -dot3F4(edgeNormal,v0);\n" -" facesA[fidx].m_numIndices = 2;\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[curUsedIndices++]=i;\n" -" indicesA[curUsedIndices++]=prevVertex;\n" -" \n" -" facesA[fidx].m_plane.x = edgeNormal.x;\n" -" facesA[fidx].m_plane.y = edgeNormal.y;\n" -" facesA[fidx].m_plane.z = edgeNormal.z;\n" -" facesA[fidx].m_plane.w = 
c;\n" -" fidx++;\n" -" prevVertex = i;\n" -" }\n" -" }\n" -" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" -" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" -" float4 sepAxis = separatingNormals[i];\n" -" \n" -" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" -" int childShapeIndexB =-1;\n" -" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" ///////////////////\n" -" ///compound shape support\n" -" \n" -" childShapeIndexB = concavePairsIn[pairIndex].w;\n" -" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" posB = newPosB;\n" -" ornB = newOrnB;\n" -" \n" -" }\n" -" \n" -" ////////////////////////////////////////\n" -" \n" -" \n" -" \n" -" int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,\n" -" &convexPolyhedronA, &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" posB,ornB,\n" -" worldVertsB1,worldVertsB2,capacityWorldVerts,\n" -" minDist, maxDist,\n" -" &verticesA,&facesA,&indicesA,\n" -" vertices,faces,indices,\n" -" localContactsOut,localContactCapacity);\n" -" \n" -" if (numLocalContactsOut>0)\n" -" {\n" -" float4 normal = -separatingNormals[i];\n" -" int nPoints = numLocalContactsOut;\n" -" float4* pointsIn = localContactsOut;\n" -" int contactIdx[4];// = {-1,-1,-1,-1};\n" -" contactIdx[0] = -1;\n" -" contactIdx[1] = -1;\n" -" contactIdx[2] = -1;\n" -" contactIdx[3] = -1;\n" -" \n" -" int nReducedContacts = 
extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" -" \n" -" int dstIdx;\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" if (dstIdx<contactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" -" c->m_worldNormalOnB = -normal;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" int bodyA = concavePairsIn[pairIndex].x;\n" -" int bodyB = concavePairsIn[pairIndex].y;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" -" c->m_childIndexA = childShapeIndexA;\n" -" c->m_childIndexB = childShapeIndexB;\n" -" for (int i=0;i<nReducedContacts;i++)\n" -" {\n" -" c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" -" }\n" -" GET_NPOINTS(*c) = nReducedContacts;\n" -" }\n" -" \n" -" }// if (numContactsOut>0)\n" -" }// if (i<numPairs)\n" -"}\n" -"int findClippingFaces(const float4 separatingNormal,\n" -" __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,\n" -" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" -" __global float4* worldVertsA1,\n" -" __global float4* worldNormalsA1,\n" -" __global float4* worldVertsB1,\n" -" int capacityWorldVerts,\n" -" const float minDist, float maxDist,\n" -" __global const float4* vertices,\n" -" __global const b3GpuFace_t* faces,\n" -" __global const int* indices,\n" -" __global int4* clippingFaces, int pairIndex)\n" -"{\n" -" int numContactsOut = 0;\n" -" int numWorldVertsB1= 0;\n" -" \n" -" \n" -" int closestFaceB=-1;\n" -" float dmax = -FLT_MAX;\n" -" \n" -" {\n" -" for(int face=0;face<hullB->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,\n" -" faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n" -" const float4 WorldNormal = 
qtRotate(ornB, Normal);\n" -" float d = dot3F4(WorldNormal,separatingNormal);\n" -" if (d > dmax)\n" -" {\n" -" dmax = d;\n" -" closestFaceB = face;\n" -" }\n" -" }\n" -" }\n" -" \n" -" {\n" -" const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" -" const int numVertices = polyB.m_numIndices;\n" -" for(int e0=0;e0<numVertices;e0++)\n" -" {\n" -" const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n" -" worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" -" }\n" -" }\n" -" \n" -" int closestFaceA=-1;\n" -" {\n" -" float dmin = FLT_MAX;\n" -" for(int face=0;face<hullA->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(\n" -" faces[hullA->m_faceOffset+face].m_plane.x,\n" -" faces[hullA->m_faceOffset+face].m_plane.y,\n" -" faces[hullA->m_faceOffset+face].m_plane.z,\n" -" 0.f);\n" -" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" -" \n" -" float d = dot3F4(faceANormalWS,separatingNormal);\n" -" if (d < dmin)\n" -" {\n" -" dmin = d;\n" -" closestFaceA = face;\n" -" worldNormalsA1[pairIndex] = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" \n" -" int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" -" for(int e0=0;e0<numVerticesA;e0++)\n" -" {\n" -" const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" -" worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" -" }\n" -" \n" -" clippingFaces[pairIndex].x = closestFaceA;\n" -" clippingFaces[pairIndex].y = closestFaceB;\n" -" clippingFaces[pairIndex].z = numVerticesA;\n" -" clippingFaces[pairIndex].w = numWorldVertsB1;\n" -" \n" -" \n" -" return numContactsOut;\n" -"}\n" -"int clipFaces(__global float4* worldVertsA1,\n" -" __global float4* worldNormalsA1,\n" -" __global float4* worldVertsB1,\n" -" __global float4* worldVertsB2, \n" -" int capacityWorldVertsB2,\n" -" const float minDist, float maxDist,\n" 
-" __global int4* clippingFaces,\n" -" int pairIndex)\n" -"{\n" -" int numContactsOut = 0;\n" -" \n" -" int closestFaceA = clippingFaces[pairIndex].x;\n" -" int closestFaceB = clippingFaces[pairIndex].y;\n" -" int numVertsInA = clippingFaces[pairIndex].z;\n" -" int numVertsInB = clippingFaces[pairIndex].w;\n" -" \n" -" int numVertsOut = 0;\n" -" \n" -" if (closestFaceA<0)\n" -" return numContactsOut;\n" -" \n" -" __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" -" __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n" -" \n" -" \n" -" \n" -" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" -" \n" -" for(int e0=0;e0<numVertsInA;e0++)\n" -" {\n" -" const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n" -" const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" -" const float4 WorldEdge0 = aw - bw;\n" -" float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n" -" float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" -" float4 worldA1 = aw;\n" -" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" -" float4 planeNormalWS = planeNormalWS1;\n" -" float planeEqWS=planeEqWS1;\n" -" numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n" -" __global float4* tmp = pVtxOut;\n" -" pVtxOut = pVtxIn;\n" -" pVtxIn = tmp;\n" -" numVertsInB = numVertsOut;\n" -" numVertsOut = 0;\n" -" }\n" -" \n" -" //float4 planeNormalWS = worldNormalsA1[pairIndex];\n" -" //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" -" \n" -" /*for (int i=0;i<numVertsInB;i++)\n" -" {\n" -" pVtxOut[i] = pVtxIn[i];\n" -" }*/\n" -" \n" -" \n" -" \n" -" \n" -" //numVertsInB=0;\n" -" \n" -" float4 planeNormalWS = worldNormalsA1[pairIndex];\n" -" float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" -" for (int i=0;i<numVertsInB;i++)\n" -" {\n" -" float 
depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" -" if (depth <=minDist)\n" -" {\n" -" depth = minDist;\n" -" }\n" -" \n" -" if (depth <=maxDist)\n" -" {\n" -" float4 pointInWorld = pVtxIn[i];\n" -" pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" -" }\n" -" }\n" -" \n" -" clippingFaces[pairIndex].w =numContactsOut;\n" -" \n" -" \n" -" return numContactsOut;\n" -"}\n" -"__kernel void findClippingFacesKernel( __global const int4* pairs,\n" -" __global const b3RigidBodyData_t* rigidBodies,\n" -" __global const b3Collidable_t* collidables,\n" -" __global const b3ConvexPolyhedronData_t* convexShapes,\n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const b3GpuFace_t* faces,\n" -" __global const int* indices,\n" -" __global const float4* separatingNormals,\n" -" __global const int* hasSeparatingAxis,\n" -" __global int4* clippingFacesOut,\n" -" __global float4* worldVertsA1,\n" -" __global float4* worldNormalsA1,\n" -" __global float4* worldVertsB1,\n" -" int capacityWorldVerts,\n" -" int numPairs\n" -" )\n" -"{\n" -" \n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" \n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" \n" -" if (i<numPairs)\n" -" {\n" -" \n" -" if (hasSeparatingAxis[i])\n" -" {\n" -" \n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" \n" -" \n" -" int numLocalContactsOut = findClippingFaces(separatingNormals[i],\n" -" &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" -" rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n" -" 
rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n" -" worldVertsA1,\n" -" worldNormalsA1,\n" -" worldVertsB1,capacityWorldVerts,\n" -" minDist, maxDist,\n" -" vertices,faces,indices,\n" -" clippingFacesOut,i);\n" -" \n" -" \n" -" }// if (hasSeparatingAxis[i])\n" -" }// if (i<numPairs)\n" -" \n" -"}\n" -"__kernel void clipFacesAndFindContactsKernel( __global const float4* separatingNormals,\n" -" __global const int* hasSeparatingAxis,\n" -" __global int4* clippingFacesOut,\n" -" __global float4* worldVertsA1,\n" -" __global float4* worldNormalsA1,\n" -" __global float4* worldVertsB1,\n" -" __global float4* worldVertsB2,\n" -" int vertexFaceCapacity,\n" -" int numPairs,\n" -" int debugMode\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" \n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" \n" -" if (i<numPairs)\n" -" {\n" -" \n" -" if (hasSeparatingAxis[i])\n" -" {\n" -" \n" -"// int bodyIndexA = pairs[i].x;\n" -" // int bodyIndexB = pairs[i].y;\n" -" \n" -" int numLocalContactsOut = 0;\n" -" int capacityWorldVertsB2 = vertexFaceCapacity;\n" -" \n" -" __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" -" __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n" -" \n" -" {\n" -" __global int4* clippingFaces = clippingFacesOut;\n" -" \n" -" \n" -" int closestFaceA = clippingFaces[pairIndex].x;\n" -" int closestFaceB = clippingFaces[pairIndex].y;\n" -" int numVertsInA = clippingFaces[pairIndex].z;\n" -" int numVertsInB = clippingFaces[pairIndex].w;\n" -" \n" -" int numVertsOut = 0;\n" -" \n" -" if (closestFaceA>=0)\n" -" {\n" -" \n" -" \n" -" \n" -" // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" -" \n" -" for(int e0=0;e0<numVertsInA;e0++)\n" -" {\n" -" const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n" -" const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" -" 
const float4 WorldEdge0 = aw - bw;\n" -" float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n" -" float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" -" float4 worldA1 = aw;\n" -" float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" -" float4 planeNormalWS = planeNormalWS1;\n" -" float planeEqWS=planeEqWS1;\n" -" numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n" -" __global float4* tmp = pVtxOut;\n" -" pVtxOut = pVtxIn;\n" -" pVtxIn = tmp;\n" -" numVertsInB = numVertsOut;\n" -" numVertsOut = 0;\n" -" }\n" -" \n" -" float4 planeNormalWS = worldNormalsA1[pairIndex];\n" -" float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" -" \n" -" for (int i=0;i<numVertsInB;i++)\n" -" {\n" -" float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" -" if (depth <=minDist)\n" -" {\n" -" depth = minDist;\n" -" }\n" -" \n" -" if (depth <=maxDist)\n" -" {\n" -" float4 pointInWorld = pVtxIn[i];\n" -" pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" -" }\n" -" }\n" -" \n" -" }\n" -" clippingFaces[pairIndex].w =numLocalContactsOut;\n" -" \n" -" }\n" -" \n" -" for (int i=0;i<numLocalContactsOut;i++)\n" -" pVtxIn[i] = pVtxOut[i];\n" -" \n" -" }// if (hasSeparatingAxis[i])\n" -" }// if (i<numPairs)\n" -" \n" -"}\n" -"__kernel void newContactReductionKernel( __global int4* pairs,\n" -" __global const b3RigidBodyData_t* rigidBodies,\n" -" __global const float4* separatingNormals,\n" -" __global const int* hasSeparatingAxis,\n" -" __global struct b3Contact4Data* globalContactsOut,\n" -" __global int4* clippingFaces,\n" -" __global float4* worldVertsB2,\n" -" volatile __global int* nGlobalContactsOut,\n" -" int vertexFaceCapacity,\n" -" int contactCapacity,\n" -" int numPairs\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" int pairIndex = i;\n" -" \n" -" int4 contactIdx;\n" -" contactIdx=make_int4(0,1,2,3);\n" -" \n" -" if (i<numPairs)\n" -" 
{\n" -" \n" -" if (hasSeparatingAxis[i])\n" -" {\n" -" \n" -" \n" -" \n" -" \n" -" int nPoints = clippingFaces[pairIndex].w;\n" -" \n" -" if (nPoints>0)\n" -" {\n" -" __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];\n" -" float4 normal = -separatingNormals[i];\n" -" \n" -" int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);\n" -" \n" -" int mprContactIndex = pairs[pairIndex].z;\n" -" int dstIdx = mprContactIndex;\n" -" if (dstIdx<0)\n" -" {\n" -" AppendInc( nGlobalContactsOut, dstIdx );\n" -" }\n" -"//#if 0\n" -" \n" -" if (dstIdx < contactCapacity)\n" -" {\n" -" __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" -" c->m_worldNormalOnB = -normal;\n" -" c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" -" c->m_batchIdx = pairIndex;\n" -" int bodyA = pairs[pairIndex].x;\n" -" int bodyB = pairs[pairIndex].y;\n" -" pairs[pairIndex].w = dstIdx;\n" -" c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" -" c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" -" c->m_childIndexA =-1;\n" -" c->m_childIndexB =-1;\n" -" switch (nReducedContacts)\n" -" {\n" -" case 4:\n" -" c->m_worldPosB[3] = pointsIn[contactIdx.w];\n" -" case 3:\n" -" c->m_worldPosB[2] = pointsIn[contactIdx.z];\n" -" case 2:\n" -" c->m_worldPosB[1] = pointsIn[contactIdx.y];\n" -" case 1:\n" -" if (mprContactIndex<0)//test\n" -" c->m_worldPosB[0] = pointsIn[contactIdx.x];\n" -" default:\n" -" {\n" -" }\n" -" };\n" -" \n" -" GET_NPOINTS(*c) = nReducedContacts;\n" -" \n" -" }\n" -" \n" -" \n" -"//#endif\n" -" \n" -" }// if (numContactsOut>0)\n" -" }// if (hasSeparatingAxis[i])\n" -" }// if (i<numPairs)\n" -" \n" -" \n" -"}\n" -; +static const char* satClipKernelsCL = + "#define TRIANGLE_NUM_CONVEX_FACES 5\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL 
EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile __global int*\n" + "#endif\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define max2 max\n" + "#define min2 min\n" + "typedef unsigned int u32;\n" + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define 
b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + 
"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#ifndef B3_CONVEX_POLYHEDRON_DATA_H\n" + "#define B3_CONVEX_POLYHEDRON_DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#define B3_QUAT_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Quat;\n" + " #define b3QuatConstArg const b3Quat\n" + " \n" + " \n" + "inline float4 b3FastNormalize4(float4 v)\n" + "{\n" + " v = (float4)(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + " \n" + "inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" + "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" + "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" + "{\n" + " b3Quat ans;\n" + " ans = b3Cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - b3Dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" + "{\n" + " b3Quat q;\n" + " q=in;\n" + " //return b3FastNormalize4(in);\n" + " float len = native_sqrt(dot(q, q));\n" + " if(len > 0.f)\n" + " {\n" + " q *= 1.f / len;\n" + " }\n" + " else\n" + " {\n" + " q.x = q.y = q.z = 0.f;\n" + " q.w = 1.f;\n" + " }\n" + " return q;\n" + "}\n" + "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " b3Quat qInv = b3QuatInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = 
b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " return b3QuatRotate( b3QuatInvert( q ), vec );\n" + "}\n" + "inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" + "{\n" + " return b3QuatRotate( orientation, point ) + (translation);\n" + "}\n" + " \n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "typedef struct b3GpuFace b3GpuFace_t;\n" + "struct b3GpuFace\n" + "{\n" + " b3Float4 m_plane;\n" + " int m_indexOffset;\n" + " int m_numIndices;\n" + " int m_unusedPadding1;\n" + " int m_unusedPadding2;\n" + "};\n" + "typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;\n" + "struct b3ConvexPolyhedronData\n" + "{\n" + " b3Float4 m_localCenter;\n" + " b3Float4 m_extents;\n" + " b3Float4 mC;\n" + " b3Float4 mE;\n" + " float m_radius;\n" + " int m_faceOffset;\n" + " int m_numFaces;\n" + " int m_numVertices;\n" + " int m_vertexOffset;\n" + " int m_uniqueEdgesOffset;\n" + " int m_numUniqueEdges;\n" + " int m_unused;\n" + "};\n" + "#endif //B3_CONVEX_POLYHEDRON_DATA_H\n" + "#ifndef B3_COLLIDABLE_H\n" + "#define B3_COLLIDABLE_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "enum b3ShapeTypes\n" + "{\n" + " SHAPE_HEIGHT_FIELD=1,\n" + " SHAPE_CONVEX_HULL=3,\n" + " SHAPE_PLANE=4,\n" + " SHAPE_CONCAVE_TRIMESH=5,\n" + " SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" + " SHAPE_SPHERE=7,\n" + " MAX_NUM_SHAPE_TYPES,\n" + "};\n" + "typedef struct b3Collidable b3Collidable_t;\n" + "struct b3Collidable\n" + "{\n" + " union {\n" + " int m_numChildShapes;\n" + " int 
m_bvhIndex;\n" + " };\n" + " union\n" + " {\n" + " float m_radius;\n" + " int m_compoundBvhIndex;\n" + " };\n" + " int m_shapeType;\n" + " int m_shapeIndex;\n" + "};\n" + "typedef struct b3GpuChildShape b3GpuChildShape_t;\n" + "struct b3GpuChildShape\n" + "{\n" + " b3Float4 m_childPosition;\n" + " b3Quat m_childOrientation;\n" + " int m_shapeIndex;\n" + " int m_unused0;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "struct b3CompoundOverlappingPair\n" + "{\n" + " int m_bodyIndexA;\n" + " int m_bodyIndexB;\n" + "// int m_pairType;\n" + " int m_childShapeIndexA;\n" + " int m_childShapeIndexB;\n" + "};\n" + "#endif //B3_COLLIDABLE_H\n" + "#ifndef B3_RIGIDBODY_DATA_H\n" + "#define B3_RIGIDBODY_DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifndef B3_MAT3x3_H\n" + "#define B3_MAT3x3_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "typedef struct\n" + "{\n" + " b3Float4 m_row[3];\n" + "}b3Mat3x3;\n" + "#define b3Mat3x3ConstArg const b3Mat3x3\n" + "#define b3GetRow(m,row) (m.m_row[row])\n" + "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" + "{\n" + " b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" + " b3Mat3x3 out;\n" + " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" + " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" + " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" + " out.m_row[0].w = 0.f;\n" + " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" + " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" + " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" + " out.m_row[1].w = 0.f;\n" + " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" + " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" + " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" + " out.m_row[2].w = 
0.f;\n" + " return out;\n" + "}\n" + "inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = fabs(matIn.m_row[0]);\n" + " out.m_row[1] = fabs(matIn.m_row[1]);\n" + " out.m_row[2] = fabs(matIn.m_row[2]);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtZero();\n" + "__inline\n" + "b3Mat3x3 mtIdentity();\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Mat3x3 mtZero()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(0.f);\n" + " m.m_row[1] = (b3Float4)(0.f);\n" + " m.m_row[2] = (b3Float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtIdentity()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(1,0,0,0);\n" + " m.m_row[1] = (b3Float4)(0,1,0,0);\n" + " m.m_row[2] = (b3Float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" + "{\n" + " b3Mat3x3 transB;\n" + " transB = mtTranspose( b );\n" + " b3Mat3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" 
+ "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" + "{\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a.m_row[0], b );\n" + " ans.y = b3Dot3F4( a.m_row[1], b );\n" + " ans.z = b3Dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" + "{\n" + " b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a, colx );\n" + " ans.y = b3Dot3F4( a, coly );\n" + " ans.z = b3Dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3RigidBodyData b3RigidBodyData_t;\n" + "struct b3RigidBodyData\n" + "{\n" + " b3Float4 m_pos;\n" + " b3Quat m_quat;\n" + " b3Float4 m_linVel;\n" + " b3Float4 m_angVel;\n" + " int m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "};\n" + "typedef struct b3InertiaData b3InertiaData_t;\n" + "struct b3InertiaData\n" + "{\n" + " b3Mat3x3 m_invInertiaWorld;\n" + " b3Mat3x3 m_initInvInertia;\n" + "};\n" + "#endif //B3_RIGIDBODY_DATA_H\n" + " \n" + "#define GET_NPOINTS(x) (x).m_worldNormalOnB.w\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_float4 (float4)\n" + "#define make_float2 (float2)\n" + "#define make_uint4 (uint4)\n" + "#define make_int4 (int4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "__inline\n" + "float fastDiv(float numerator, float denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "// return numerator/denominator; \n" + "}\n" + "__inline\n" + "float4 fastDiv4(float4 numerator, float4 denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "}\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " 
return cross(a,b);\n" + "}\n" + "//#define dot3F4 dot\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " return fast_normalize(v);\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b);\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec);\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fastNormalize4(in);\n" + "// in /= length( in );\n" + "// return in;\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + "float4 qtInvRotate(const Quaternion q, float4 vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "__inline\n" + "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" + "{\n" + " return qtRotate( *orientation, *p ) + (*translation);\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "}\n" + 
"__inline float4 lerp3(const float4 a,const float4 b, float t)\n" + "{\n" + " return make_float4( a.x + (b.x - a.x) * t,\n" + " a.y + (b.y - a.y) * t,\n" + " a.z + (b.z - a.z) * t,\n" + " 0.f);\n" + "}\n" + "// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n" + "int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)\n" + "{\n" + " \n" + " int ve;\n" + " float ds, de;\n" + " int numVertsOut = 0;\n" + " //double-check next test\n" + " if (numVertsIn < 2)\n" + " return 0;\n" + " \n" + " float4 firstVertex=pVtxIn[numVertsIn-1];\n" + " float4 endVertex = pVtxIn[0];\n" + " \n" + " ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n" + " \n" + " for (ve = 0; ve < numVertsIn; ve++)\n" + " {\n" + " endVertex=pVtxIn[ve];\n" + " de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n" + " if (ds<0)\n" + " {\n" + " if (de<0)\n" + " {\n" + " // Start < 0, end < 0, so output endVertex\n" + " ppVtxOut[numVertsOut++] = endVertex;\n" + " }\n" + " else\n" + " {\n" + " // Start < 0, end >= 0, so output intersection\n" + " ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" + " }\n" + " }\n" + " else\n" + " {\n" + " if (de<0)\n" + " {\n" + " // Start >= 0, end < 0 so output intersection and end\n" + " ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" + " ppVtxOut[numVertsOut++] = endVertex;\n" + " }\n" + " }\n" + " firstVertex = endVertex;\n" + " ds = de;\n" + " }\n" + " return numVertsOut;\n" + "}\n" + "// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n" + "int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)\n" + "{\n" + " \n" + " int ve;\n" + " float ds, de;\n" + " int numVertsOut = 0;\n" + "//double-check next test\n" + " if (numVertsIn < 2)\n" + " return 0;\n" + " float4 firstVertex=pVtxIn[numVertsIn-1];\n" + 
" float4 endVertex = pVtxIn[0];\n" + " \n" + " ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n" + " for (ve = 0; ve < numVertsIn; ve++)\n" + " {\n" + " endVertex=pVtxIn[ve];\n" + " de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n" + " if (ds<0)\n" + " {\n" + " if (de<0)\n" + " {\n" + " // Start < 0, end < 0, so output endVertex\n" + " ppVtxOut[numVertsOut++] = endVertex;\n" + " }\n" + " else\n" + " {\n" + " // Start < 0, end >= 0, so output intersection\n" + " ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" + " }\n" + " }\n" + " else\n" + " {\n" + " if (de<0)\n" + " {\n" + " // Start >= 0, end < 0 so output intersection and end\n" + " ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n" + " ppVtxOut[numVertsOut++] = endVertex;\n" + " }\n" + " }\n" + " firstVertex = endVertex;\n" + " ds = de;\n" + " }\n" + " return numVertsOut;\n" + "}\n" + "int clipFaceAgainstHull(const float4 separatingNormal, __global const b3ConvexPolyhedronData_t* hullA, \n" + " const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n" + " float4* worldVertsB2, int capacityWorldVertsB2,\n" + " const float minDist, float maxDist,\n" + " __global const float4* vertices,\n" + " __global const b3GpuFace_t* faces,\n" + " __global const int* indices,\n" + " float4* contactsOut,\n" + " int contactCapacity)\n" + "{\n" + " int numContactsOut = 0;\n" + " float4* pVtxIn = worldVertsB1;\n" + " float4* pVtxOut = worldVertsB2;\n" + " \n" + " int numVertsIn = numWorldVertsB1;\n" + " int numVertsOut = 0;\n" + " int closestFaceA=-1;\n" + " {\n" + " float dmin = FLT_MAX;\n" + " for(int face=0;face<hullA->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(\n" + " faces[hullA->m_faceOffset+face].m_plane.x, \n" + " faces[hullA->m_faceOffset+face].m_plane.y, \n" + " faces[hullA->m_faceOffset+face].m_plane.z,0.f);\n" + " const float4 faceANormalWS = qtRotate(ornA,Normal);\n" + " \n" + " float d = 
dot3F4(faceANormalWS,separatingNormal);\n" + " if (d < dmin)\n" + " {\n" + " dmin = d;\n" + " closestFaceA = face;\n" + " }\n" + " }\n" + " }\n" + " if (closestFaceA<0)\n" + " return numContactsOut;\n" + " b3GpuFace_t polyA = faces[hullA->m_faceOffset+closestFaceA];\n" + " // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" + " int numVerticesA = polyA.m_numIndices;\n" + " for(int e0=0;e0<numVerticesA;e0++)\n" + " {\n" + " const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];\n" + " const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n" + " const float4 edge0 = a - b;\n" + " const float4 WorldEdge0 = qtRotate(ornA,edge0);\n" + " float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" + " float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n" + " float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" + " float4 worldA1 = transform(&a,&posA,&ornA);\n" + " float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" + " \n" + " float4 planeNormalWS = planeNormalWS1;\n" + " float planeEqWS=planeEqWS1;\n" + " \n" + " //clip face\n" + " //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" + " numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" + " //btSwap(pVtxIn,pVtxOut);\n" + " float4* tmp = pVtxOut;\n" + " pVtxOut = pVtxIn;\n" + " pVtxIn = tmp;\n" + " numVertsIn = numVertsOut;\n" + " numVertsOut = 0;\n" + " }\n" + " \n" + " // only keep points that are behind the witness face\n" + " {\n" + " float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" + " float localPlaneEq = polyA.m_plane.w;\n" + " float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n" + " float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n" + " for (int i=0;i<numVertsIn;i++)\n" + " {\n" + " float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" + " if 
(depth <=minDist)\n" + " {\n" + " depth = minDist;\n" + " }\n" + " if (depth <=maxDist)\n" + " {\n" + " float4 pointInWorld = pVtxIn[i];\n" + " //resultOut.addContactPoint(separatingNormal,point,depth);\n" + " contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" + " }\n" + " }\n" + " }\n" + " return numContactsOut;\n" + "}\n" + "int clipFaceAgainstHullLocalA(const float4 separatingNormal, const b3ConvexPolyhedronData_t* hullA, \n" + " const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n" + " float4* worldVertsB2, int capacityWorldVertsB2,\n" + " const float minDist, float maxDist,\n" + " const float4* verticesA,\n" + " const b3GpuFace_t* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB,\n" + " __global const b3GpuFace_t* facesB,\n" + " __global const int* indicesB,\n" + " float4* contactsOut,\n" + " int contactCapacity)\n" + "{\n" + " int numContactsOut = 0;\n" + " float4* pVtxIn = worldVertsB1;\n" + " float4* pVtxOut = worldVertsB2;\n" + " \n" + " int numVertsIn = numWorldVertsB1;\n" + " int numVertsOut = 0;\n" + " int closestFaceA=-1;\n" + " {\n" + " float dmin = FLT_MAX;\n" + " for(int face=0;face<hullA->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(\n" + " facesA[hullA->m_faceOffset+face].m_plane.x, \n" + " facesA[hullA->m_faceOffset+face].m_plane.y, \n" + " facesA[hullA->m_faceOffset+face].m_plane.z,0.f);\n" + " const float4 faceANormalWS = qtRotate(ornA,Normal);\n" + " \n" + " float d = dot3F4(faceANormalWS,separatingNormal);\n" + " if (d < dmin)\n" + " {\n" + " dmin = d;\n" + " closestFaceA = face;\n" + " }\n" + " }\n" + " }\n" + " if (closestFaceA<0)\n" + " return numContactsOut;\n" + " b3GpuFace_t polyA = facesA[hullA->m_faceOffset+closestFaceA];\n" + " // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" + " int numVerticesA = polyA.m_numIndices;\n" + " for(int 
e0=0;e0<numVerticesA;e0++)\n" + " {\n" + " const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];\n" + " const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n" + " const float4 edge0 = a - b;\n" + " const float4 WorldEdge0 = qtRotate(ornA,edge0);\n" + " float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" + " float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n" + " float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" + " float4 worldA1 = transform(&a,&posA,&ornA);\n" + " float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" + " \n" + " float4 planeNormalWS = planeNormalWS1;\n" + " float planeEqWS=planeEqWS1;\n" + " \n" + " //clip face\n" + " //clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n" + " numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n" + " //btSwap(pVtxIn,pVtxOut);\n" + " float4* tmp = pVtxOut;\n" + " pVtxOut = pVtxIn;\n" + " pVtxIn = tmp;\n" + " numVertsIn = numVertsOut;\n" + " numVertsOut = 0;\n" + " }\n" + " \n" + " // only keep points that are behind the witness face\n" + " {\n" + " float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n" + " float localPlaneEq = polyA.m_plane.w;\n" + " float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n" + " float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n" + " for (int i=0;i<numVertsIn;i++)\n" + " {\n" + " float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" + " if (depth <=minDist)\n" + " {\n" + " depth = minDist;\n" + " }\n" + " if (depth <=maxDist)\n" + " {\n" + " float4 pointInWorld = pVtxIn[i];\n" + " //resultOut.addContactPoint(separatingNormal,point,depth);\n" + " contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" + " }\n" + " }\n" + " }\n" + " return numContactsOut;\n" + "}\n" + "int clipHullAgainstHull(const float4 
separatingNormal,\n" + " __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" + " const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n" + " float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n" + " const float minDist, float maxDist,\n" + " __global const float4* vertices,\n" + " __global const b3GpuFace_t* faces,\n" + " __global const int* indices,\n" + " float4* localContactsOut,\n" + " int localContactCapacity)\n" + "{\n" + " // Picks the face of hull B whose world-space normal has the largest dot product with separatingNormal,\n" + " // transforms that face's vertices into world space (worldVertsB1) and clips them against hull A.\n" + " // Returns the number of contacts written to localContactsOut.\n" + " int numContactsOut = 0;\n" + " int numWorldVertsB1= 0;\n" + " int closestFaceB=-1;\n" + " float dmax = -FLT_MAX;\n" + " {\n" + " for(int face=0;face<hullB->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, \n" + " faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n" + " const float4 WorldNormal = qtRotate(ornB, Normal);\n" + " float d = dot3F4(WorldNormal,separatingNormal);\n" + " if (d > dmax)\n" + " {\n" + " dmax = d;\n" + " closestFaceB = face;\n" + " }\n" + " }\n" + " }\n" + " // closestFaceB is still -1 when no face was selected (e.g. hullB->m_numFaces is 0); return before it is used to index faces[]\n" + " if (closestFaceB<0)\n" + " return numContactsOut;\n" + " {\n" + " const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" + " const int numVertices = polyB.m_numIndices;\n" + " // stop at capacityWorldVerts so worldVertsB1 is never written past its capacity\n" + " for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)\n" + " {\n" + " const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n" + " worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" + " }\n" + " }\n" + " if (closestFaceB>=0)\n" + " {\n" + " numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, \n" + " posA,ornA,\n" + " worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,\n" + " faces,\n" + " indices,localContactsOut,localContactCapacity);\n" + " }\n" + " return numContactsOut;\n" + "}\n" + "// Variant of clipHullAgainstHull where hull A and its vertex/face/index arrays are passed through non-__global pointers;\n" + "// hull B is still read from the __global verticesB/facesB/indicesB arrays. Returns the number of contacts written to localContactsOut.\n" + "int clipHullAgainstHullLocalA(const float4 separatingNormal,\n" + " const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB, \n" + " const float4 posA, 
const Quaternion ornA,const float4 posB, const Quaternion ornB, \n" + " float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n" + " const float minDist, float maxDist,\n" + " const float4* verticesA,\n" + " const b3GpuFace_t* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB,\n" + " __global const b3GpuFace_t* facesB,\n" + " __global const int* indicesB,\n" + " float4* localContactsOut,\n" + " int localContactCapacity)\n" + "{\n" + " int numContactsOut = 0;\n" + " int numWorldVertsB1= 0;\n" + " int closestFaceB=-1;\n" + " float dmax = -FLT_MAX;\n" + " {\n" + " for(int face=0;face<hullB->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, \n" + " facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" + " const float4 WorldNormal = qtRotate(ornB, Normal);\n" + " float d = dot3F4(WorldNormal,separatingNormal);\n" + " if (d > dmax)\n" + " {\n" + " dmax = d;\n" + " closestFaceB = face;\n" + " }\n" + " }\n" + " }\n" + " // closestFaceB is still -1 when no face was selected (e.g. hullB->m_numFaces is 0); return before it is used to index facesB[]\n" + " if (closestFaceB<0)\n" + " return numContactsOut;\n" + " {\n" + " const b3GpuFace_t polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" + " const int numVertices = polyB.m_numIndices;\n" + " // stop at capacityWorldVerts so worldVertsB1 is never written past its capacity\n" + " for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++)\n" + " {\n" + " const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" + " worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" + " }\n" + " }\n" + " if (closestFaceB>=0)\n" + " {\n" + " numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, \n" + " posA,ornA,\n" + " worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,\n" + " verticesA,facesA,indicesA,\n" + " verticesB,facesB,indicesB,\n" + " localContactsOut,localContactCapacity);\n" + " }\n" + " return numContactsOut;\n" + "}\n" + "#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];\n" + "#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n" + "#define REDUCE_MAX(v, n) {int i=0; 
for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }\n" + "#define REDUCE_MIN(v, n) {int i=0; for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }\n" + "int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n" + "{\n" + " if( nPoints == 0 )\n" + " return 0;\n" + " \n" + " if (nPoints <=4)\n" + " return nPoints;\n" + " \n" + " \n" + " if (nPoints >64)\n" + " nPoints = 64;\n" + " \n" + " float4 center = make_float4(0.f);\n" + " {\n" + " \n" + " for (int i=0;i<nPoints;i++)\n" + " center += p[i];\n" + " center /= (float)nPoints;\n" + " }\n" + " \n" + " \n" + " \n" + " // sample 4 directions\n" + " \n" + " float4 aVector = p[0] - center;\n" + " float4 u = cross3( nearNormal, aVector );\n" + " float4 v = cross3( nearNormal, u );\n" + " u = normalize3( u );\n" + " v = normalize3( v );\n" + " \n" + " \n" + " //keep point with deepest penetration\n" + " float minW= FLT_MAX;\n" + " \n" + " int minIndex=-1;\n" + " \n" + " float4 maxDots;\n" + " maxDots.x = FLT_MIN;\n" + " maxDots.y = FLT_MIN;\n" + " maxDots.z = FLT_MIN;\n" + " maxDots.w = FLT_MIN;\n" + " \n" + " // idx, distance\n" + " for(int ie = 0; ie<nPoints; ie++ )\n" + " {\n" + " if (p[ie].w<minW)\n" + " {\n" + " minW = p[ie].w;\n" + " minIndex=ie;\n" + " }\n" + " float f;\n" + " float4 r = p[ie]-center;\n" + " f = dot3F4( u, r );\n" + " if (f<maxDots.x)\n" + " {\n" + " maxDots.x = f;\n" + " contactIdx[0].x = ie;\n" + " }\n" + " \n" + " f = dot3F4( -u, r );\n" + " if (f<maxDots.y)\n" + " {\n" + " maxDots.y = f;\n" + " contactIdx[0].y = ie;\n" + " }\n" + " \n" + " \n" + " f = dot3F4( v, r );\n" + " if (f<maxDots.z)\n" + " {\n" + " maxDots.z = f;\n" + " contactIdx[0].z = ie;\n" + " }\n" + " \n" + " f = dot3F4( -v, r );\n" + " if (f<maxDots.w)\n" + " {\n" + " maxDots.w = f;\n" + " contactIdx[0].w = ie;\n" + " }\n" + " \n" + " }\n" + " \n" + " if (contactIdx[0].x != minIndex && 
contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n" + " {\n" + " //replace the first contact with minimum (todo: replace contact with least penetration)\n" + " contactIdx[0].x = minIndex;\n" + " }\n" + " \n" + " return 4;\n" + " \n" + "}\n" + "int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" + "{\n" + " contactIdx[0] = 0;\n" + " contactIdx[1] = 1;\n" + " contactIdx[2] = 2;\n" + " contactIdx[3] = 3;\n" + " \n" + " if( nPoints == 0 ) return 0;\n" + " \n" + " nPoints = min2( nPoints, 4 );\n" + " return nPoints;\n" + " \n" + "}\n" + "int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n" + "{\n" + " if( nPoints == 0 ) return 0;\n" + " nPoints = min2( nPoints, 64 );\n" + " float4 center = make_float4(0.f);\n" + " {\n" + " float4 v[64];\n" + " for (int i=0;i<nPoints;i++)\n" + " v[i] = p[i];\n" + " //memcpy( v, p, nPoints*sizeof(float4) );\n" + " PARALLEL_SUM( v, nPoints );\n" + " center = v[0]/(float)nPoints;\n" + " }\n" + " \n" + " { // sample 4 directions\n" + " if( nPoints < 4 )\n" + " {\n" + " for(int i=0; i<nPoints; i++) \n" + " contactIdx[i] = i;\n" + " return nPoints;\n" + " }\n" + " float4 aVector = p[0] - center;\n" + " float4 u = cross3( nearNormal, aVector );\n" + " float4 v = cross3( nearNormal, u );\n" + " u = normalize3( u );\n" + " v = normalize3( v );\n" + " int idx[4];\n" + " float2 max00 = make_float2(0,FLT_MAX);\n" + " {\n" + " // idx, distance\n" + " {\n" + " {\n" + " int4 a[64];\n" + " for(int ie = 0; ie<nPoints; ie++ )\n" + " {\n" + " \n" + " \n" + " float f;\n" + " float4 r = p[ie]-center;\n" + " f = dot3F4( u, r );\n" + " a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" + " f = dot3F4( -u, r );\n" + " a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" + " f = dot3F4( v, r );\n" + " a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" + " f = dot3F4( -v, r );\n" + " a[ie].w = 
((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n" + " }\n" + " for(int ie=0; ie<nPoints; ie++)\n" + " {\n" + " a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;\n" + " a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;\n" + " a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;\n" + " a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w;\n" + " }\n" + " idx[0] = (int)a[0].x & 0xff;\n" + " idx[1] = (int)a[0].y & 0xff;\n" + " idx[2] = (int)a[0].z & 0xff;\n" + " idx[3] = (int)a[0].w & 0xff;\n" + " }\n" + " }\n" + " {\n" + " float2 h[64];\n" + " PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints );\n" + " REDUCE_MIN( h, nPoints );\n" + " max00 = h[0];\n" + " }\n" + " }\n" + " contactIdx[0] = idx[0];\n" + " contactIdx[1] = idx[1];\n" + " contactIdx[2] = idx[2];\n" + " contactIdx[3] = idx[3];\n" + " return 4;\n" + " }\n" + "}\n" + "// For each work item idx < numPairs: copies that pair's contact points into a private array, reduces them to at most 4\n" + "// with extractManifoldSequential and appends one b3Contact4Data to contactsOut (if a slot below contactCapacity is obtained).\n" + "__kernel void extractManifoldAndAddContactKernel(__global const int4* pairs, \n" + " __global const b3RigidBodyData_t* rigidBodies, \n" + " __global const float4* closestPointsWorld,\n" + " __global const float4* separatingNormalsWorld,\n" + " __global const int* contactCounts,\n" + " __global const int* contactOffsets,\n" + " __global struct b3Contact4Data* restrict contactsOut,\n" + " counter32_t nContactsOut,\n" + " int contactCapacity,\n" + " int numPairs,\n" + " int pairIndex\n" + " )\n" + "{\n" + " int idx = get_global_id(0);\n" + " \n" + " if (idx<numPairs)\n" + " {\n" + " float4 normal = separatingNormalsWorld[idx];\n" + " // clamp to the size of localPoints[] below so the copy loop cannot write past the array\n" + " int nPoints = min2( contactCounts[idx], 64 );\n" + " __global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];\n" + " float4 localPoints[64];\n" + " for (int i=0;i<nPoints;i++)\n" + " {\n" + " localPoints[i] = pointsIn[i];\n" + " }\n" + " int contactIdx[4];// = {-1,-1,-1,-1};\n" + " contactIdx[0] = -1;\n" + " contactIdx[1] = -1;\n" + " contactIdx[2] = -1;\n" + " contactIdx[3] = -1;\n" + " int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);\n" + " int dstIdx;\n" + " AppendInc( nContactsOut, 
dstIdx );\n" + " if (dstIdx<contactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = contactsOut + dstIdx;\n" + " c->m_worldNormalOnB = -normal;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = idx;\n" + " // NOTE(review): the body indices are read from pairs[pairIndex] (a kernel argument), not pairs[idx] - confirm against the host-side launch\n" + " int bodyA = pairs[pairIndex].x;\n" + " int bodyB = pairs[pairIndex].y;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " for (int i=0;i<nContacts;i++)\n" + " {\n" + " c->m_worldPosB[i] = localPoints[contactIdx[i]];\n" + " }\n" + " GET_NPOINTS(*c) = nContacts;\n" + " }\n" + " }\n" + "}\n" + "void trInverse(float4 translationIn, Quaternion orientationIn,\n" + " float4* translationOut, Quaternion* orientationOut)\n" + "{\n" + " *orientationOut = qtInvert(orientationIn);\n" + " *translationOut = qtRotate(*orientationOut, -translationIn);\n" + "}\n" + "void trMul(float4 translationA, Quaternion orientationA,\n" + " float4 translationB, Quaternion orientationB,\n" + " float4* translationOut, Quaternion* orientationOut)\n" + "{\n" + " *orientationOut = qtMul(orientationA,orientationB);\n" + " *translationOut = transform(&translationB,&translationA,&orientationA);\n" + "}\n" + "__kernel void clipHullHullKernel( __global int4* pairs, \n" + " __global const b3RigidBodyData_t* rigidBodies, \n" + " __global const b3Collidable_t* collidables,\n" + " __global const b3ConvexPolyhedronData_t* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const b3GpuFace_t* faces,\n" + " __global const int* indices,\n" + " __global const float4* separatingNormals,\n" + " __global const int* hasSeparatingAxis,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int numPairs,\n" + " int contactCapacity)\n" + "{\n" + " int i 
= get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " float4 worldVertsB1[64];\n" + " float4 worldVertsB2[64];\n" + " int capacityWorldVerts = 64; \n" + " float4 localContactsOut[64];\n" + " int localContactCapacity=64;\n" + " \n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " if (i<numPairs)\n" + " {\n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " if (hasSeparatingAxis[i])\n" + " {\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " \n" + " int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],\n" + " &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" + " rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n" + " rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n" + " worldVertsB1,worldVertsB2,capacityWorldVerts,\n" + " minDist, maxDist,\n" + " vertices,faces,indices,\n" + " localContactsOut,localContactCapacity);\n" + " \n" + " if (numLocalContactsOut>0)\n" + " {\n" + " float4 normal = -separatingNormals[i];\n" + " int nPoints = numLocalContactsOut;\n" + " float4* pointsIn = localContactsOut;\n" + " int contactIdx[4];// = {-1,-1,-1,-1};\n" + " contactIdx[0] = -1;\n" + " contactIdx[1] = -1;\n" + " contactIdx[2] = -1;\n" + " contactIdx[3] = -1;\n" + " \n" + " int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" + " \n" + " \n" + " int mprContactIndex = pairs[pairIndex].z;\n" + " int dstIdx = mprContactIndex;\n" + " if (dstIdx<0)\n" + " {\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " }\n" + " if (dstIdx<contactCapacity)\n" + " {\n" + " pairs[pairIndex].z = dstIdx;\n" + " __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" + " c->m_worldNormalOnB = 
-normal;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " int bodyA = pairs[pairIndex].x;\n" + " int bodyB = pairs[pairIndex].y;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " for (int i=0;i<nReducedContacts;i++)\n" + " {\n" + " //this condition means: overwrite contact point, unless at index i==0 we have a valid 'mpr' contact\n" + " if (i>0||(mprContactIndex<0))\n" + " {\n" + " c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" + " }\n" + " }\n" + " GET_NPOINTS(*c) = nReducedContacts;\n" + " }\n" + " \n" + " }// if (numContactsOut>0)\n" + " }// if (hasSeparatingAxis[i])\n" + " }// if (i<numPairs)\n" + "}\n" + "// For each compound pair i with gpuHasCompoundSepNormalsOut[i] set: composes the child-shape transforms (when the child index is >= 0),\n" + "// clips hull B against hull A with clipHullAgainstHull, reduces the result with extractManifoldSequential and appends one b3Contact4Data.\n" + "__kernel void clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n" + " __global const b3RigidBodyData_t* rigidBodies, \n" + " __global const b3Collidable_t* collidables,\n" + " __global const b3ConvexPolyhedronData_t* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const b3GpuFace_t* faces,\n" + " __global const int* indices,\n" + " __global const b3GpuChildShape_t* gpuChildShapes,\n" + " __global const float4* gpuCompoundSepNormalsOut,\n" + " __global const int* gpuHasCompoundSepNormalsOut,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int numCompoundPairs, int maxContactCapacity)\n" + "{\n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " float4 worldVertsB1[64];\n" + " float4 worldVertsB2[64];\n" + " int capacityWorldVerts = 64; \n" + " float4 localContactsOut[64];\n" + " int localContactCapacity=64;\n" + " \n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " if (i<numCompoundPairs)\n" + " {\n" + " if 
(gpuHasCompoundSepNormalsOut[i])\n" + " {\n" + " int bodyIndexA = gpuCompoundPairs[i].x;\n" + " int bodyIndexB = gpuCompoundPairs[i].y;\n" + " \n" + " int childShapeIndexA = gpuCompoundPairs[i].z;\n" + " int childShapeIndexB = gpuCompoundPairs[i].w;\n" + " \n" + " int collidableIndexA = -1;\n" + " int collidableIndexB = -1;\n" + " \n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " \n" + " float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " \n" + " if (childShapeIndexA >= 0)\n" + " {\n" + " collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" + " float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" + " float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" + " float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" + " float4 newOrnA = qtMul(ornA,childOrnA);\n" + " posA = newPosA;\n" + " ornA = newOrnA;\n" + " } else\n" + " {\n" + " collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " }\n" + " \n" + " if (childShapeIndexB>=0)\n" + " {\n" + " collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " posB = newPosB;\n" + " ornB = newOrnB;\n" + " } else\n" + " {\n" + " collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" + " }\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],\n" + " &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " worldVertsB1,worldVertsB2,capacityWorldVerts,\n" + 
" minDist, maxDist,\n" + " vertices,faces,indices,\n" + " localContactsOut,localContactCapacity);\n" + " \n" + " if (numLocalContactsOut>0)\n" + " {\n" + " float4 normal = -gpuCompoundSepNormalsOut[i];\n" + " int nPoints = numLocalContactsOut;\n" + " float4* pointsIn = localContactsOut;\n" + " int contactIdx[4];// = {-1,-1,-1,-1};\n" + " contactIdx[0] = -1;\n" + " contactIdx[1] = -1;\n" + " contactIdx[2] = -1;\n" + " contactIdx[3] = -1;\n" + " \n" + " int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" + " \n" + " int dstIdx;\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " // dstIdx addresses one b3Contact4Data slot (which stores all nReducedContacts points), so only dstIdx itself must be in range\n" + " if (dstIdx < maxContactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" + " c->m_worldNormalOnB = -normal;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " int bodyA = gpuCompoundPairs[pairIndex].x;\n" + " int bodyB = gpuCompoundPairs[pairIndex].y;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" + " c->m_childIndexA = childShapeIndexA;\n" + " c->m_childIndexB = childShapeIndexB;\n" + " for (int i=0;i<nReducedContacts;i++)\n" + " {\n" + " c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" + " }\n" + " GET_NPOINTS(*c) = nReducedContacts;\n" + " }\n" + " \n" + " }// if (numContactsOut>0)\n" + " }// if (gpuHasCompoundSepNormalsOut[i])\n" + " }// if (i<numCompoundPairs)\n" + "}\n" + "__kernel void sphereSphereCollisionKernel( __global const int4* pairs, \n" + " __global const b3RigidBodyData_t* rigidBodies, \n" + " __global const b3Collidable_t* collidables,\n" + " __global const float4* separatingNormals,\n" + " __global const int* hasSeparatingAxis,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int contactCapacity,\n" + " int numPairs)\n" + 
"{\n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n" + " collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n" + " {\n" + " //sphere-sphere\n" + " float radiusA = collidables[collidableIndexA].m_radius;\n" + " float radiusB = collidables[collidableIndexB].m_radius;\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " float4 diff = posA-posB;\n" + " float len = length(diff);\n" + " \n" + " ///iff distance positive, don't generate a new contact\n" + " if ( len <= (radiusA+radiusB))\n" + " {\n" + " ///distance (negative means penetration)\n" + " float dist = len - (radiusA+radiusB);\n" + " float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n" + " if (len > 0.00001)\n" + " {\n" + " normalOnSurfaceB = diff / len;\n" + " }\n" + " float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n" + " contactPosB.w = dist;\n" + " \n" + " int dstIdx;\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " if (dstIdx < contactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" + " c->m_worldNormalOnB = -normalOnSurfaceB;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " int bodyA = pairs[pairIndex].x;\n" + " int bodyB = pairs[pairIndex].y;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" + " c->m_worldPosB[0] = contactPosB;\n" + " c->m_childIndexA = -1;\n" + " c->m_childIndexB = -1;\n" + " GET_NPOINTS(*c) = 1;\n" + " }//if (dstIdx < numPairs)\n" + " 
}//if ( len <= (radiusA+radiusB))\n" + " }//SHAPE_SPHERE SHAPE_SPHERE\n" + " }//if (i<numPairs)\n" + "} \n" + "__kernel void clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n" + " __global const b3RigidBodyData_t* rigidBodies, \n" + " __global const b3Collidable_t* collidables,\n" + " __global const b3ConvexPolyhedronData_t* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const b3GpuFace_t* faces,\n" + " __global const int* indices,\n" + " __global const b3GpuChildShape_t* gpuChildShapes,\n" + " __global const float4* separatingNormals,\n" + " __global struct b3Contact4Data* restrict globalContactsOut,\n" + " counter32_t nGlobalContactsOut,\n" + " int contactCapacity,\n" + " int numConcavePairs)\n" + "{\n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " float4 worldVertsB1[64];\n" + " float4 worldVertsB2[64];\n" + " int capacityWorldVerts = 64; \n" + " float4 localContactsOut[64];\n" + " int localContactCapacity=64;\n" + " \n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " if (i<numConcavePairs)\n" + " {\n" + " //negative value means that the pair is invalid\n" + " if (concavePairsIn[i].w<0)\n" + " return;\n" + " int bodyIndexA = concavePairsIn[i].x;\n" + " int bodyIndexB = concavePairsIn[i].y;\n" + " int f = concavePairsIn[i].z;\n" + " int childShapeIndexA = f;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " ///////////////////////////////////////////////////////////////\n" + " \n" + " \n" + " bool overlap = false;\n" + " \n" + " b3ConvexPolyhedronData_t convexPolyhedronA;\n" + " //add 3 vertices of the triangle\n" + " convexPolyhedronA.m_numVertices = 3;\n" + " 
convexPolyhedronA.m_vertexOffset = 0;\n" + " float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" + " b3GpuFace_t face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" + " \n" + " float4 verticesA[3];\n" + " for (int i=0;i<3;i++)\n" + " {\n" + " int index = indices[face.m_indexOffset+i];\n" + " float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" + " verticesA[i] = vert;\n" + " localCenter += vert;\n" + " }\n" + " float dmin = FLT_MAX;\n" + " int localCC=0;\n" + " //a triangle has 3 unique edges\n" + " convexPolyhedronA.m_numUniqueEdges = 3;\n" + " convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" + " float4 uniqueEdgesA[3];\n" + " \n" + " uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" + " uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" + " uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" + " convexPolyhedronA.m_faceOffset = 0;\n" + " \n" + " float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" + " \n" + " b3GpuFace_t facesA[TRIANGLE_NUM_CONVEX_FACES];\n" + " int indicesA[3+3+2+2+2];\n" + " int curUsedIndices=0;\n" + " int fidx=0;\n" + " //front size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[0] = 0;\n" + " indicesA[1] = 1;\n" + " indicesA[2] = 2;\n" + " curUsedIndices+=3;\n" + " float c = face.m_plane.w;\n" + " facesA[fidx].m_plane.x = normal.x;\n" + " facesA[fidx].m_plane.y = normal.y;\n" + " facesA[fidx].m_plane.z = normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " //back size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[3]=2;\n" + " indicesA[4]=1;\n" + " indicesA[5]=0;\n" + " curUsedIndices+=3;\n" + " float c = dot3F4(normal,verticesA[0]);\n" + " float c1 = -face.m_plane.w;\n" + " facesA[fidx].m_plane.x = -normal.x;\n" + " facesA[fidx].m_plane.y = -normal.y;\n" + " facesA[fidx].m_plane.z = -normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " 
facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " bool addEdgePlanes = true;\n" + " if (addEdgePlanes)\n" + " {\n" + " int numVertices=3;\n" + " int prevVertex = numVertices-1;\n" + " for (int i=0;i<numVertices;i++)\n" + " {\n" + " float4 v0 = verticesA[i];\n" + " float4 v1 = verticesA[prevVertex];\n" + " \n" + " float4 edgeNormal = normalize(cross(normal,v1-v0));\n" + " float c = -dot3F4(edgeNormal,v0);\n" + " facesA[fidx].m_numIndices = 2;\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[curUsedIndices++]=i;\n" + " indicesA[curUsedIndices++]=prevVertex;\n" + " \n" + " facesA[fidx].m_plane.x = edgeNormal.x;\n" + " facesA[fidx].m_plane.y = edgeNormal.y;\n" + " facesA[fidx].m_plane.z = edgeNormal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " fidx++;\n" + " prevVertex = i;\n" + " }\n" + " }\n" + " convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" + " convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " float4 sepAxis = separatingNormals[i];\n" + " \n" + " int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" + " int childShapeIndexB =-1;\n" + " if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " ///////////////////\n" + " ///compound shape support\n" + " \n" + " childShapeIndexB = concavePairsIn[pairIndex].w;\n" + " int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " posB = newPosB;\n" + " ornB = newOrnB;\n" + " 
\n" + " }\n" + " \n" + " ////////////////////////////////////////\n" + " \n" + " \n" + " \n" + " int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,\n" + " &convexPolyhedronA, &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " worldVertsB1,worldVertsB2,capacityWorldVerts,\n" + " minDist, maxDist,\n" + " &verticesA,&facesA,&indicesA,\n" + " vertices,faces,indices,\n" + " localContactsOut,localContactCapacity);\n" + " \n" + " if (numLocalContactsOut>0)\n" + " {\n" + " float4 normal = -separatingNormals[i];\n" + " int nPoints = numLocalContactsOut;\n" + " float4* pointsIn = localContactsOut;\n" + " int contactIdx[4];// = {-1,-1,-1,-1};\n" + " contactIdx[0] = -1;\n" + " contactIdx[1] = -1;\n" + " contactIdx[2] = -1;\n" + " contactIdx[3] = -1;\n" + " \n" + " int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n" + " \n" + " int dstIdx;\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " if (dstIdx<contactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = globalContactsOut+ dstIdx;\n" + " c->m_worldNormalOnB = -normal;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " int bodyA = concavePairsIn[pairIndex].x;\n" + " int bodyB = concavePairsIn[pairIndex].y;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" + " c->m_childIndexA = childShapeIndexA;\n" + " c->m_childIndexB = childShapeIndexB;\n" + " for (int i=0;i<nReducedContacts;i++)\n" + " {\n" + " c->m_worldPosB[i] = pointsIn[contactIdx[i]];\n" + " }\n" + " GET_NPOINTS(*c) = nReducedContacts;\n" + " }\n" + " \n" + " }// if (numContactsOut>0)\n" + " }// if (i<numPairs)\n" + "}\n" + "int findClippingFaces(const float4 separatingNormal,\n" + " __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,\n" + " const float4 
posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" + " __global float4* worldVertsA1,\n" + " __global float4* worldNormalsA1,\n" + " __global float4* worldVertsB1,\n" + " int capacityWorldVerts,\n" + " const float minDist, float maxDist,\n" + " __global const float4* vertices,\n" + " __global const b3GpuFace_t* faces,\n" + " __global const int* indices,\n" + " __global int4* clippingFaces, int pairIndex)\n" + "{\n" + " int numContactsOut = 0;\n" + " int numWorldVertsB1= 0;\n" + " \n" + " \n" + " int closestFaceB=-1;\n" + " float dmax = -FLT_MAX;\n" + " \n" + " {\n" + " for(int face=0;face<hullB->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,\n" + " faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n" + " const float4 WorldNormal = qtRotate(ornB, Normal);\n" + " float d = dot3F4(WorldNormal,separatingNormal);\n" + " if (d > dmax)\n" + " {\n" + " dmax = d;\n" + " closestFaceB = face;\n" + " }\n" + " }\n" + " }\n" + " \n" + " {\n" + " const b3GpuFace_t polyB = faces[hullB->m_faceOffset+closestFaceB];\n" + " const int numVertices = polyB.m_numIndices;\n" + " for(int e0=0;e0<numVertices;e0++)\n" + " {\n" + " const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n" + " worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" + " }\n" + " }\n" + " \n" + " int closestFaceA=-1;\n" + " {\n" + " float dmin = FLT_MAX;\n" + " for(int face=0;face<hullA->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(\n" + " faces[hullA->m_faceOffset+face].m_plane.x,\n" + " faces[hullA->m_faceOffset+face].m_plane.y,\n" + " faces[hullA->m_faceOffset+face].m_plane.z,\n" + " 0.f);\n" + " const float4 faceANormalWS = qtRotate(ornA,Normal);\n" + " \n" + " float d = dot3F4(faceANormalWS,separatingNormal);\n" + " if (d < dmin)\n" + " {\n" + " dmin = d;\n" + " closestFaceA = face;\n" + " 
worldNormalsA1[pairIndex] = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " \n" + " int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" + " for(int e0=0;e0<numVerticesA;e0++)\n" + " {\n" + " const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" + " worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" + " }\n" + " \n" + " clippingFaces[pairIndex].x = closestFaceA;\n" + " clippingFaces[pairIndex].y = closestFaceB;\n" + " clippingFaces[pairIndex].z = numVerticesA;\n" + " clippingFaces[pairIndex].w = numWorldVertsB1;\n" + " \n" + " \n" + " return numContactsOut;\n" + "}\n" + "int clipFaces(__global float4* worldVertsA1,\n" + " __global float4* worldNormalsA1,\n" + " __global float4* worldVertsB1,\n" + " __global float4* worldVertsB2, \n" + " int capacityWorldVertsB2,\n" + " const float minDist, float maxDist,\n" + " __global int4* clippingFaces,\n" + " int pairIndex)\n" + "{\n" + " int numContactsOut = 0;\n" + " \n" + " int closestFaceA = clippingFaces[pairIndex].x;\n" + " int closestFaceB = clippingFaces[pairIndex].y;\n" + " int numVertsInA = clippingFaces[pairIndex].z;\n" + " int numVertsInB = clippingFaces[pairIndex].w;\n" + " \n" + " int numVertsOut = 0;\n" + " \n" + " if (closestFaceA<0)\n" + " return numContactsOut;\n" + " \n" + " __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" + " __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n" + " \n" + " \n" + " \n" + " // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" + " \n" + " for(int e0=0;e0<numVertsInA;e0++)\n" + " {\n" + " const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n" + " const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" + " const float4 WorldEdge0 = aw - bw;\n" + " float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n" + " float4 
planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" + " float4 worldA1 = aw;\n" + " float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" + " float4 planeNormalWS = planeNormalWS1;\n" + " float planeEqWS=planeEqWS1;\n" + " numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n" + " __global float4* tmp = pVtxOut;\n" + " pVtxOut = pVtxIn;\n" + " pVtxIn = tmp;\n" + " numVertsInB = numVertsOut;\n" + " numVertsOut = 0;\n" + " }\n" + " \n" + " //float4 planeNormalWS = worldNormalsA1[pairIndex];\n" + " //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" + " \n" + " /*for (int i=0;i<numVertsInB;i++)\n" + " {\n" + " pVtxOut[i] = pVtxIn[i];\n" + " }*/\n" + " \n" + " \n" + " \n" + " \n" + " //numVertsInB=0;\n" + " \n" + " float4 planeNormalWS = worldNormalsA1[pairIndex];\n" + " float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" + " for (int i=0;i<numVertsInB;i++)\n" + " {\n" + " float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" + " if (depth <=minDist)\n" + " {\n" + " depth = minDist;\n" + " }\n" + " \n" + " if (depth <=maxDist)\n" + " {\n" + " float4 pointInWorld = pVtxIn[i];\n" + " pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" + " }\n" + " }\n" + " \n" + " clippingFaces[pairIndex].w =numContactsOut;\n" + " \n" + " \n" + " return numContactsOut;\n" + "}\n" + "__kernel void findClippingFacesKernel( __global const int4* pairs,\n" + " __global const b3RigidBodyData_t* rigidBodies,\n" + " __global const b3Collidable_t* collidables,\n" + " __global const b3ConvexPolyhedronData_t* convexShapes,\n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const b3GpuFace_t* faces,\n" + " __global const int* indices,\n" + " __global const float4* separatingNormals,\n" + " __global const int* hasSeparatingAxis,\n" + " __global int4* clippingFacesOut,\n" + " __global 
float4* worldVertsA1,\n" + " __global float4* worldNormalsA1,\n" + " __global float4* worldVertsB1,\n" + " int capacityWorldVerts,\n" + " int numPairs\n" + " )\n" + "{\n" + " \n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " \n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " \n" + " if (hasSeparatingAxis[i])\n" + " {\n" + " \n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " \n" + " \n" + " int numLocalContactsOut = findClippingFaces(separatingNormals[i],\n" + " &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n" + " rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n" + " rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n" + " worldVertsA1,\n" + " worldNormalsA1,\n" + " worldVertsB1,capacityWorldVerts,\n" + " minDist, maxDist,\n" + " vertices,faces,indices,\n" + " clippingFacesOut,i);\n" + " \n" + " \n" + " }// if (hasSeparatingAxis[i])\n" + " }// if (i<numPairs)\n" + " \n" + "}\n" + "__kernel void clipFacesAndFindContactsKernel( __global const float4* separatingNormals,\n" + " __global const int* hasSeparatingAxis,\n" + " __global int4* clippingFacesOut,\n" + " __global float4* worldVertsA1,\n" + " __global float4* worldNormalsA1,\n" + " __global float4* worldVertsB1,\n" + " __global float4* worldVertsB2,\n" + " int vertexFaceCapacity,\n" + " int numPairs,\n" + " int debugMode\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " \n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " \n" + " if (hasSeparatingAxis[i])\n" + " {\n" + " 
\n" + "// int bodyIndexA = pairs[i].x;\n" + " // int bodyIndexB = pairs[i].y;\n" + " \n" + " int numLocalContactsOut = 0;\n" + " int capacityWorldVertsB2 = vertexFaceCapacity;\n" + " \n" + " __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n" + " __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n" + " \n" + " {\n" + " __global int4* clippingFaces = clippingFacesOut;\n" + " \n" + " \n" + " int closestFaceA = clippingFaces[pairIndex].x;\n" + " int closestFaceB = clippingFaces[pairIndex].y;\n" + " int numVertsInA = clippingFaces[pairIndex].z;\n" + " int numVertsInB = clippingFaces[pairIndex].w;\n" + " \n" + " int numVertsOut = 0;\n" + " \n" + " if (closestFaceA>=0)\n" + " {\n" + " \n" + " \n" + " \n" + " // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n" + " \n" + " for(int e0=0;e0<numVertsInA;e0++)\n" + " {\n" + " const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n" + " const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n" + " const float4 WorldEdge0 = aw - bw;\n" + " float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n" + " float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n" + " float4 worldA1 = aw;\n" + " float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n" + " float4 planeNormalWS = planeNormalWS1;\n" + " float planeEqWS=planeEqWS1;\n" + " numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n" + " __global float4* tmp = pVtxOut;\n" + " pVtxOut = pVtxIn;\n" + " pVtxIn = tmp;\n" + " numVertsInB = numVertsOut;\n" + " numVertsOut = 0;\n" + " }\n" + " \n" + " float4 planeNormalWS = worldNormalsA1[pairIndex];\n" + " float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n" + " \n" + " for (int i=0;i<numVertsInB;i++)\n" + " {\n" + " float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n" + " if (depth <=minDist)\n" + " {\n" + " depth = 
minDist;\n" + " }\n" + " \n" + " if (depth <=maxDist)\n" + " {\n" + " float4 pointInWorld = pVtxIn[i];\n" + " pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n" + " }\n" + " }\n" + " \n" + " }\n" + " clippingFaces[pairIndex].w =numLocalContactsOut;\n" + " \n" + " }\n" + " \n" + " for (int i=0;i<numLocalContactsOut;i++)\n" + " pVtxIn[i] = pVtxOut[i];\n" + " \n" + " }// if (hasSeparatingAxis[i])\n" + " }// if (i<numPairs)\n" + " \n" + "}\n" + "__kernel void newContactReductionKernel( __global int4* pairs,\n" + " __global const b3RigidBodyData_t* rigidBodies,\n" + " __global const float4* separatingNormals,\n" + " __global const int* hasSeparatingAxis,\n" + " __global struct b3Contact4Data* globalContactsOut,\n" + " __global int4* clippingFaces,\n" + " __global float4* worldVertsB2,\n" + " volatile __global int* nGlobalContactsOut,\n" + " int vertexFaceCapacity,\n" + " int contactCapacity,\n" + " int numPairs\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " int pairIndex = i;\n" + " \n" + " int4 contactIdx;\n" + " contactIdx=make_int4(0,1,2,3);\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " \n" + " if (hasSeparatingAxis[i])\n" + " {\n" + " \n" + " \n" + " \n" + " \n" + " int nPoints = clippingFaces[pairIndex].w;\n" + " \n" + " if (nPoints>0)\n" + " {\n" + " __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];\n" + " float4 normal = -separatingNormals[i];\n" + " \n" + " int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);\n" + " \n" + " int mprContactIndex = pairs[pairIndex].z;\n" + " int dstIdx = mprContactIndex;\n" + " if (dstIdx<0)\n" + " {\n" + " AppendInc( nGlobalContactsOut, dstIdx );\n" + " }\n" + "//#if 0\n" + " \n" + " if (dstIdx < contactCapacity)\n" + " {\n" + " __global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n" + " c->m_worldNormalOnB = -normal;\n" + " c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = 
(0.7f*0xffff);\n" + " c->m_batchIdx = pairIndex;\n" + " int bodyA = pairs[pairIndex].x;\n" + " int bodyB = pairs[pairIndex].y;\n" + " pairs[pairIndex].w = dstIdx;\n" + " c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n" + " c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n" + " c->m_childIndexA =-1;\n" + " c->m_childIndexB =-1;\n" + " switch (nReducedContacts)\n" + " {\n" + " case 4:\n" + " c->m_worldPosB[3] = pointsIn[contactIdx.w];\n" + " case 3:\n" + " c->m_worldPosB[2] = pointsIn[contactIdx.z];\n" + " case 2:\n" + " c->m_worldPosB[1] = pointsIn[contactIdx.y];\n" + " case 1:\n" + " if (mprContactIndex<0)//test\n" + " c->m_worldPosB[0] = pointsIn[contactIdx.x];\n" + " default:\n" + " {\n" + " }\n" + " };\n" + " \n" + " GET_NPOINTS(*c) = nReducedContacts;\n" + " \n" + " }\n" + " \n" + " \n" + "//#endif\n" + " \n" + " }// if (numContactsOut>0)\n" + " }// if (hasSeparatingAxis[i])\n" + " }// if (i<numPairs)\n" + " \n" + " \n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h index 611569cacf..a60702ca62 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h @@ -1,1457 +1,1456 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* satConcaveKernelsCL= \ -"//keep this enum in sync with the CPU version (in btCollidable.h)\n" -"//written by Erwin Coumans\n" -"#define SHAPE_CONVEX_HULL 3\n" -"#define SHAPE_CONCAVE_TRIMESH 5\n" -"#define TRIANGLE_NUM_CONVEX_FACES 5\n" -"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" -"#define B3_MAX_STACK_DEPTH 256\n" -"typedef unsigned int u32;\n" -"///keep this in sync with btCollidable.h\n" -"typedef struct\n" -"{\n" -" union {\n" -" int m_numChildShapes;\n" 
-" int m_bvhIndex;\n" -" };\n" -" union\n" -" {\n" -" float m_radius;\n" -" int m_compoundBvhIndex;\n" -" };\n" -" \n" -" int m_shapeType;\n" -" int m_shapeIndex;\n" -" \n" -"} btCollidableGpu;\n" -"#define MAX_NUM_PARTS_IN_BITS 10\n" -"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n" -"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" -"typedef struct\n" -"{\n" -" //12 bytes\n" -" unsigned short int m_quantizedAabbMin[3];\n" -" unsigned short int m_quantizedAabbMax[3];\n" -" //4 bytes\n" -" int m_escapeIndexOrTriangleIndex;\n" -"} b3QuantizedBvhNode;\n" -"typedef struct\n" -"{\n" -" float4 m_aabbMin;\n" -" float4 m_aabbMax;\n" -" float4 m_quantization;\n" -" int m_numNodes;\n" -" int m_numSubTrees;\n" -" int m_nodeOffset;\n" -" int m_subTreeOffset;\n" -"} b3BvhInfo;\n" -"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" unsigned int x=0;\n" -" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" -" // Get only the lower bits where the triangle index is stored\n" -" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" -"}\n" -"int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" unsigned int x=0;\n" -" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" -" // Get only the lower bits where the triangle index is stored\n" -" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" -"}\n" -"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" -" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" -"}\n" -"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" -" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 
1 : 0;\n" -"}\n" -" \n" -"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" return -rootNode->m_escapeIndexOrTriangleIndex;\n" -"}\n" -"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" return -rootNode->m_escapeIndexOrTriangleIndex;\n" -"}\n" -"typedef struct\n" -"{\n" -" //12 bytes\n" -" unsigned short int m_quantizedAabbMin[3];\n" -" unsigned short int m_quantizedAabbMax[3];\n" -" //4 bytes, points to the root of the subtree\n" -" int m_rootNodeIndex;\n" -" //4 bytes\n" -" int m_subtreeSize;\n" -" int m_padding[3];\n" -"} b3BvhSubtreeInfo;\n" -"typedef struct\n" -"{\n" -" float4 m_childPosition;\n" -" float4 m_childOrientation;\n" -" int m_shapeIndex;\n" -" int m_unused0;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"} btGpuChildShape;\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" float4 m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} BodyData;\n" -"typedef struct \n" -"{\n" -" float4 m_localCenter;\n" -" float4 m_extents;\n" -" float4 mC;\n" -" float4 mE;\n" -" \n" -" float m_radius;\n" -" int m_faceOffset;\n" -" int m_numFaces;\n" -" int m_numVertices;\n" -" int m_vertexOffset;\n" -" int m_uniqueEdgesOffset;\n" -" int m_numUniqueEdges;\n" -" int m_unused;\n" -"} ConvexPolyhedronCL;\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} btAabbCL;\n" -"#ifndef B3_AABB_H\n" -"#define B3_AABB_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define 
B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_MAT3x3_H\n" -"#define B3_MAT3x3_H\n" -"#ifndef B3_QUAT_H\n" -"#define B3_QUAT_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" 
-"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Quat;\n" -" #define b3QuatConstArg const b3Quat\n" -" \n" -" \n" -"inline float4 b3FastNormalize4(float4 v)\n" -"{\n" -" v = (float4)(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -" \n" -"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" -"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" -"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" -"{\n" -" b3Quat ans;\n" -" ans = b3Cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" -"{\n" -" b3Quat q;\n" -" q=in;\n" -" //return b3FastNormalize4(in);\n" -" float len = native_sqrt(dot(q, q));\n" -" if(len > 0.f)\n" -" {\n" -" q *= 1.f / len;\n" -" }\n" -" else\n" -" {\n" -" q.x = q.y = q.z = 0.f;\n" -" q.w = 1.f;\n" -" }\n" -" return q;\n" -"}\n" -"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" b3Quat qInv = b3QuatInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" return b3QuatRotate( b3QuatInvert( q ), vec );\n" -"}\n" -"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" -"{\n" -" return b3QuatRotate( orientation, point ) + (translation);\n" -"}\n" -" \n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"typedef struct\n" 
-"{\n" -" b3Float4 m_row[3];\n" -"}b3Mat3x3;\n" -"#define b3Mat3x3ConstArg const b3Mat3x3\n" -"#define b3GetRow(m,row) (m.m_row[row])\n" -"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" -"{\n" -" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" -" b3Mat3x3 out;\n" -" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" -" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" -" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" -" out.m_row[0].w = 0.f;\n" -" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" -" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" -" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" -" out.m_row[1].w = 0.f;\n" -" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" -" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" -" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" -" out.m_row[2].w = 0.f;\n" -" return out;\n" -"}\n" -"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = fabs(matIn.m_row[0]);\n" -" out.m_row[1] = fabs(matIn.m_row[1]);\n" -" out.m_row[2] = fabs(matIn.m_row[2]);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtZero();\n" -"__inline\n" -"b3Mat3x3 mtIdentity();\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Mat3x3 mtZero()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(0.f);\n" -" m.m_row[1] = (b3Float4)(0.f);\n" -" m.m_row[2] = (b3Float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtIdentity()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(1,0,0,0);\n" -" m.m_row[1] = (b3Float4)(0,1,0,0);\n" -" m.m_row[2] = (b3Float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" 
out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" -"{\n" -" b3Mat3x3 transB;\n" -" transB = mtTranspose( b );\n" -" b3Mat3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" -"{\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a.m_row[0], b );\n" -" ans.y = b3Dot3F4( a.m_row[1], b );\n" -" ans.z = b3Dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" -"{\n" -" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a, colx );\n" -" ans.y = b3Dot3F4( a, coly );\n" -" ans.z = b3Dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3Aabb b3Aabb_t;\n" -"struct b3Aabb\n" -"{\n" -" union\n" -" {\n" -" float m_min[4];\n" -" b3Float4 m_minVec;\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float m_max[4];\n" -" b3Float4 m_maxVec;\n" -" int m_signedMaxIndices[4];\n" -" };\n" -"};\n" -"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" -" b3Float4ConstArg pos,\n" -" b3QuatConstArg orn,\n" -" b3Float4* aabbMinOut,b3Float4* 
aabbMaxOut)\n" -"{\n" -" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" -" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" -" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" -" b3Mat3x3 m;\n" -" m = b3QuatGetRotationMatrix(orn);\n" -" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" -" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" -" \n" -" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" -" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" -" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" -" 0.f);\n" -" *aabbMinOut = center-extent;\n" -" *aabbMaxOut = center+extent;\n" -"}\n" -"/// conservative test for overlap between two aabbs\n" -"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" -" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" -" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" -" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n" -" return overlap;\n" -"}\n" -"#endif //B3_AABB_H\n" -"/*\n" -"Bullet Continuous Collision Detection and Physics Library\n" -"Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose,\n" -"including commercial applications, and to alter it and redistribute it freely,\n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"#ifndef B3_INT2_H\n" -"#define B3_INT2_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#define b3UnsignedInt2 uint2\n" -"#define b3Int2 int2\n" -"#define b3MakeInt2 (int2)\n" -"#endif //__cplusplus\n" -"#endif\n" -"typedef struct\n" -"{\n" -" float4 m_plane;\n" -" int m_indexOffset;\n" -" int m_numIndices;\n" -"} btGpuFace;\n" -"#define make_float4 (float4)\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -" \n" -"// float4 a1 = make_float4(a.xyz,0.f);\n" -"// float4 b1 = make_float4(b.xyz,0.f);\n" -"// return cross(a1,b1);\n" -"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n" -" \n" -" // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n" -" \n" -" //return c;\n" -"}\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" v = make_float4(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec);\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - 
(a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in)\n" -"{\n" -" return fastNormalize4(in);\n" -"// in /= length( in );\n" -"// return in;\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -"float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"__inline\n" -"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" -"{\n" -" return qtRotate( *orientation, *p ) + (*translation);\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"}\n" -"inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" -"const float4* dir, const float4* vertices, float* min, float* max)\n" -"{\n" -" min[0] = FLT_MAX;\n" -" max[0] = -FLT_MAX;\n" -" int numVerts = hull->m_numVertices;\n" -" const float4 localDir = qtInvRotate(orn,*dir);\n" -" float offset = dot(pos,*dir);\n" -" for(int i=0;i<numVerts;i++)\n" -" {\n" -" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" -" if(dp < min[0]) \n" -" min[0] = dp;\n" -" if(dp > max[0]) \n" -" max[0] = dp;\n" -" }\n" -" if(min[0]>max[0])\n" -" {\n" -" float tmp = min[0];\n" -" min[0] = max[0];\n" -" max[0] = tmp;\n" -" }\n" -" min[0] += offset;\n" -" max[0] += offset;\n" -"}\n" -"inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" -"const float4* dir, __global const float4* vertices, float* min, float* max)\n" -"{\n" -" min[0] = FLT_MAX;\n" -" max[0] = -FLT_MAX;\n" 
-" int numVerts = hull->m_numVertices;\n" -" const float4 localDir = qtInvRotate(orn,*dir);\n" -" float offset = dot(pos,*dir);\n" -" for(int i=0;i<numVerts;i++)\n" -" {\n" -" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" -" if(dp < min[0]) \n" -" min[0] = dp;\n" -" if(dp > max[0]) \n" -" max[0] = dp;\n" -" }\n" -" if(min[0]>max[0])\n" -" {\n" -" float tmp = min[0];\n" -" min[0] = max[0];\n" -" max[0] = tmp;\n" -" }\n" -" min[0] += offset;\n" -" max[0] += offset;\n" -"}\n" -"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA,const float4 ornA,\n" -" const float4 posB,const float4 ornB,\n" -" float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n" -"{\n" -" float Min0,Max0;\n" -" float Min1,Max1;\n" -" projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n" -" project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n" -" if(Max0<Min1 || Max1<Min0)\n" -" return false;\n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" *depth = d0<d1 ? 
d0:d1;\n" -" return true;\n" -"}\n" -"inline bool IsAlmostZero(const float4 v)\n" -"{\n" -" if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n" -" return false;\n" -" return true;\n" -"}\n" -"bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" \n" -" const float4* verticesA, \n" -" const float4* uniqueEdgesA, \n" -" const btGpuFace* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB, \n" -" __global const float4* uniqueEdgesB, \n" -" __global const btGpuFace* facesB,\n" -" __global const int* indicesB,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" \n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" {\n" -" int numFacesA = hullA->m_numFaces;\n" -" // Test normals from hullA\n" -" for(int i=0;i<numFacesA;i++)\n" -" {\n" -" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" -" float4 faceANormalWS = qtRotate(ornA,normal);\n" -" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" -" faceANormalWS*=-1.f;\n" -" curPlaneTests++;\n" -" float d;\n" -" if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n" -" return false;\n" -" if(d<*dmin)\n" -" {\n" -" *dmin = d;\n" -" *sep = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" __global const float4* verticesA, \n" -" __global const float4* uniqueEdgesA, \n" -" __global const btGpuFace* facesA,\n" -" __global const int* indicesA,\n" -" const 
float4* verticesB,\n" -" const float4* uniqueEdgesB, \n" -" const btGpuFace* facesB,\n" -" const int* indicesB,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" {\n" -" int numFacesA = hullA->m_numFaces;\n" -" // Test normals from hullA\n" -" for(int i=0;i<numFacesA;i++)\n" -" {\n" -" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" -" float4 faceANormalWS = qtRotate(ornA,normal);\n" -" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" -" faceANormalWS *= -1.f;\n" -" curPlaneTests++;\n" -" float d;\n" -" if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n" -" return false;\n" -" if(d<*dmin)\n" -" {\n" -" *dmin = d;\n" -" *sep = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" const float4* verticesA, \n" -" const float4* uniqueEdgesA, \n" -" const btGpuFace* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB, \n" -" __global const float4* uniqueEdgesB, \n" -" __global const btGpuFace* facesB,\n" -" __global const int* indicesB,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" int curEdgeEdge = 0;\n" -" // Test edges\n" -" for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" -" {\n" -" const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n" -" float4 edge0World = qtRotate(ornA,edge0);\n" -" for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" -" {\n" -" const float4 edge1 = 
uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n" -" float4 edge1World = qtRotate(ornB,edge1);\n" -" float4 crossje = cross3(edge0World,edge1World);\n" -" curEdgeEdge++;\n" -" if(!IsAlmostZero(crossje))\n" -" {\n" -" crossje = normalize3(crossje);\n" -" if (dot3F4(DeltaC2,crossje)<0)\n" -" crossje *= -1.f;\n" -" float dist;\n" -" bool result = true;\n" -" {\n" -" float Min0,Max0;\n" -" float Min1,Max1;\n" -" projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n" -" project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n" -" \n" -" if(Max0<Min1 || Max1<Min0)\n" -" result = false;\n" -" \n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" dist = d0<d1 ? d0:d1;\n" -" result = true;\n" -" }\n" -" \n" -" if(dist<*dmin)\n" -" {\n" -" *dmin = dist;\n" -" *sep = crossje;\n" -" }\n" -" }\n" -" }\n" -" }\n" -" \n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"inline int findClippingFaces(const float4 separatingNormal,\n" -" const ConvexPolyhedronCL* hullA, \n" -" __global const ConvexPolyhedronCL* hullB,\n" -" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" -" __global float4* worldVertsA1,\n" -" __global float4* worldNormalsA1,\n" -" __global float4* worldVertsB1,\n" -" int capacityWorldVerts,\n" -" const float minDist, float maxDist,\n" -" const float4* verticesA,\n" -" const btGpuFace* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB,\n" -" __global const btGpuFace* facesB,\n" -" __global const int* indicesB,\n" -" __global int4* clippingFaces, int pairIndex)\n" -"{\n" -" int numContactsOut = 0;\n" -" int numWorldVertsB1= 0;\n" -" \n" -" \n" -" int closestFaceB=0;\n" -" float dmax = -FLT_MAX;\n" -" \n" -" {\n" -" for(int face=0;face<hullB->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n" -" facesB[hullB->m_faceOffset+face].m_plane.y, 
facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" -" const float4 WorldNormal = qtRotate(ornB, Normal);\n" -" float d = dot3F4(WorldNormal,separatingNormal);\n" -" if (d > dmax)\n" -" {\n" -" dmax = d;\n" -" closestFaceB = face;\n" -" }\n" -" }\n" -" }\n" -" \n" -" {\n" -" const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" -" int numVertices = polyB.m_numIndices;\n" -" if (numVertices>capacityWorldVerts)\n" -" numVertices = capacityWorldVerts;\n" -" if (numVertices<0)\n" -" numVertices = 0;\n" -" \n" -" for(int e0=0;e0<numVertices;e0++)\n" -" {\n" -" if (e0<capacityWorldVerts)\n" -" {\n" -" const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" -" worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" -" }\n" -" }\n" -" }\n" -" \n" -" int closestFaceA=0;\n" -" {\n" -" float dmin = FLT_MAX;\n" -" for(int face=0;face<hullA->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(\n" -" facesA[hullA->m_faceOffset+face].m_plane.x,\n" -" facesA[hullA->m_faceOffset+face].m_plane.y,\n" -" facesA[hullA->m_faceOffset+face].m_plane.z,\n" -" 0.f);\n" -" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" -" \n" -" float d = dot3F4(faceANormalWS,separatingNormal);\n" -" if (d < dmin)\n" -" {\n" -" dmin = d;\n" -" closestFaceA = face;\n" -" worldNormalsA1[pairIndex] = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" \n" -" int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" -" if (numVerticesA>capacityWorldVerts)\n" -" numVerticesA = capacityWorldVerts;\n" -" if (numVerticesA<0)\n" -" numVerticesA=0;\n" -" \n" -" for(int e0=0;e0<numVerticesA;e0++)\n" -" {\n" -" if (e0<capacityWorldVerts)\n" -" {\n" -" const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" -" worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" -" }\n" -" }\n" -" \n" -" clippingFaces[pairIndex].x = 
closestFaceA;\n" -" clippingFaces[pairIndex].y = closestFaceB;\n" -" clippingFaces[pairIndex].z = numVerticesA;\n" -" clippingFaces[pairIndex].w = numWorldVertsB1;\n" -" \n" -" \n" -" return numContactsOut;\n" -"}\n" -"// work-in-progress\n" -"__kernel void findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs,\n" -" __global const BodyData* rigidBodies,\n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes,\n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global const btGpuChildShape* gpuChildShapes,\n" -" __global btAabbCL* aabbs,\n" -" __global float4* concaveSeparatingNormalsOut,\n" -" __global int* concaveHasSeparatingNormals,\n" -" __global int4* clippingFacesOut,\n" -" __global float4* worldVertsA1GPU,\n" -" __global float4* worldNormalsAGPU,\n" -" __global float4* worldVertsB1GPU,\n" -" __global float* dmins,\n" -" int vertexFaceCapacity,\n" -" int numConcavePairs\n" -" )\n" -"{\n" -" \n" -" int i = get_global_id(0);\n" -" if (i>=numConcavePairs)\n" -" return;\n" -" \n" -" concaveHasSeparatingNormals[i] = 0;\n" -" \n" -" int pairIdx = i;\n" -" \n" -" int bodyIndexA = concavePairs[i].x;\n" -" int bodyIndexB = concavePairs[i].y;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" -" collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" concavePairs[pairIdx].w = -1;\n" -" return;\n" -" }\n" -" \n" -" \n" -" \n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" int numActualConcaveConvexTests = 0;\n" -" \n" -" int f = 
concavePairs[i].z;\n" -" \n" -" bool overlap = false;\n" -" \n" -" ConvexPolyhedronCL convexPolyhedronA;\n" -" \n" -" //add 3 vertices of the triangle\n" -" convexPolyhedronA.m_numVertices = 3;\n" -" convexPolyhedronA.m_vertexOffset = 0;\n" -" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" -" \n" -" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" -" float4 triMinAabb, triMaxAabb;\n" -" btAabbCL triAabb;\n" -" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" -" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" -" \n" -" float4 verticesA[3];\n" -" for (int i=0;i<3;i++)\n" -" {\n" -" int index = indices[face.m_indexOffset+i];\n" -" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" -" verticesA[i] = vert;\n" -" localCenter += vert;\n" -" \n" -" triAabb.m_min = min(triAabb.m_min,vert);\n" -" triAabb.m_max = max(triAabb.m_max,vert);\n" -" \n" -" }\n" -" \n" -" overlap = true;\n" -" overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" -" overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" -" overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? 
false : overlap;\n" -" \n" -" if (overlap)\n" -" {\n" -" float dmin = FLT_MAX;\n" -" int hasSeparatingAxis=5;\n" -" float4 sepAxis=make_float4(1,2,3,4);\n" -" \n" -" int localCC=0;\n" -" numActualConcaveConvexTests++;\n" -" \n" -" //a triangle has 3 unique edges\n" -" convexPolyhedronA.m_numUniqueEdges = 3;\n" -" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" -" float4 uniqueEdgesA[3];\n" -" \n" -" uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" -" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" -" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" -" \n" -" \n" -" convexPolyhedronA.m_faceOffset = 0;\n" -" \n" -" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" -" \n" -" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" -" int indicesA[3+3+2+2+2];\n" -" int curUsedIndices=0;\n" -" int fidx=0;\n" -" \n" -" //front size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[0] = 0;\n" -" indicesA[1] = 1;\n" -" indicesA[2] = 2;\n" -" curUsedIndices+=3;\n" -" float c = face.m_plane.w;\n" -" facesA[fidx].m_plane.x = normal.x;\n" -" facesA[fidx].m_plane.y = normal.y;\n" -" facesA[fidx].m_plane.z = normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" //back size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[3]=2;\n" -" indicesA[4]=1;\n" -" indicesA[5]=0;\n" -" curUsedIndices+=3;\n" -" float c = dot(normal,verticesA[0]);\n" -" float c1 = -face.m_plane.w;\n" -" facesA[fidx].m_plane.x = -normal.x;\n" -" facesA[fidx].m_plane.y = -normal.y;\n" -" facesA[fidx].m_plane.z = -normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" \n" -" bool addEdgePlanes = true;\n" -" if (addEdgePlanes)\n" -" {\n" -" int numVertices=3;\n" -" int prevVertex = numVertices-1;\n" -" for (int i=0;i<numVertices;i++)\n" -" {\n" -" float4 v0 = verticesA[i];\n" -" float4 v1 = verticesA[prevVertex];\n" 
-" \n" -" float4 edgeNormal = normalize(cross(normal,v1-v0));\n" -" float c = -dot(edgeNormal,v0);\n" -" \n" -" facesA[fidx].m_numIndices = 2;\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[curUsedIndices++]=i;\n" -" indicesA[curUsedIndices++]=prevVertex;\n" -" \n" -" facesA[fidx].m_plane.x = edgeNormal.x;\n" -" facesA[fidx].m_plane.y = edgeNormal.y;\n" -" facesA[fidx].m_plane.z = edgeNormal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" fidx++;\n" -" prevVertex = i;\n" -" }\n" -" }\n" -" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" -" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" -" \n" -" \n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" \n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" -" \n" -" \n" -" \n" -" \n" -" ///////////////////\n" -" ///compound shape support\n" -" \n" -" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" int compoundChild = concavePairs[pairIdx].w;\n" -" int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" -" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" posB = newPosB;\n" -" ornB = newOrnB;\n" -" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" -" }\n" -" //////////////////\n" -" \n" -" float4 c0local = convexPolyhedronA.m_localCenter;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" \n" -" \n" -" bool 
sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" verticesA,uniqueEdgesA,facesA,indicesA,\n" -" vertices,uniqueEdges,faces,indices,\n" -" &sepAxis,&dmin);\n" -" hasSeparatingAxis = 4;\n" -" if (!sepA)\n" -" {\n" -" hasSeparatingAxis = 0;\n" -" } else\n" -" {\n" -" bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA,\n" -" posB,ornB,\n" -" posA,ornA,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,indices,\n" -" verticesA,uniqueEdgesA,facesA,indicesA,\n" -" &sepAxis,&dmin);\n" -" \n" -" if (!sepB)\n" -" {\n" -" hasSeparatingAxis = 0;\n" -" } else\n" -" {\n" -" hasSeparatingAxis = 1;\n" -" }\n" -" } \n" -" \n" -" if (hasSeparatingAxis)\n" -" {\n" -" dmins[i] = dmin;\n" -" concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" -" concaveHasSeparatingNormals[i]=1;\n" -" \n" -" } else\n" -" { \n" -" //mark this pair as in-active\n" -" concavePairs[pairIdx].w = -1;\n" -" }\n" -" }\n" -" else\n" -" { \n" -" //mark this pair as in-active\n" -" concavePairs[pairIdx].w = -1;\n" -" }\n" -"}\n" -"// work-in-progress\n" -"__kernel void findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs,\n" -" __global const BodyData* rigidBodies,\n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes,\n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global const btGpuChildShape* gpuChildShapes,\n" -" __global btAabbCL* aabbs,\n" -" __global float4* concaveSeparatingNormalsOut,\n" -" __global int* concaveHasSeparatingNormals,\n" -" __global int4* clippingFacesOut,\n" -" __global float4* worldVertsA1GPU,\n" -" __global float4* worldNormalsAGPU,\n" -" __global float4* worldVertsB1GPU,\n" -" __global float* dmins,\n" -" int vertexFaceCapacity,\n" -" int numConcavePairs\n" -" )\n" -"{\n" -" \n" -" int i = 
get_global_id(0);\n" -" if (i>=numConcavePairs)\n" -" return;\n" -" \n" -" if (!concaveHasSeparatingNormals[i])\n" -" return;\n" -" \n" -" int pairIdx = i;\n" -" \n" -" int bodyIndexA = concavePairs[i].x;\n" -" int bodyIndexB = concavePairs[i].y;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" \n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" int numActualConcaveConvexTests = 0;\n" -" \n" -" int f = concavePairs[i].z;\n" -" \n" -" bool overlap = false;\n" -" \n" -" ConvexPolyhedronCL convexPolyhedronA;\n" -" \n" -" //add 3 vertices of the triangle\n" -" convexPolyhedronA.m_numVertices = 3;\n" -" convexPolyhedronA.m_vertexOffset = 0;\n" -" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" -" \n" -" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" -" float4 triMinAabb, triMaxAabb;\n" -" btAabbCL triAabb;\n" -" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" -" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" -" \n" -" float4 verticesA[3];\n" -" for (int i=0;i<3;i++)\n" -" {\n" -" int index = indices[face.m_indexOffset+i];\n" -" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" -" verticesA[i] = vert;\n" -" localCenter += vert;\n" -" \n" -" triAabb.m_min = min(triAabb.m_min,vert);\n" -" triAabb.m_max = max(triAabb.m_max,vert);\n" -" \n" -" }\n" -" \n" -" overlap = true;\n" -" overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" -" overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" -" overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? 
false : overlap;\n" -" \n" -" if (overlap)\n" -" {\n" -" float dmin = dmins[i];\n" -" int hasSeparatingAxis=5;\n" -" float4 sepAxis=make_float4(1,2,3,4);\n" -" sepAxis = concaveSeparatingNormalsOut[pairIdx];\n" -" \n" -" int localCC=0;\n" -" numActualConcaveConvexTests++;\n" -" \n" -" //a triangle has 3 unique edges\n" -" convexPolyhedronA.m_numUniqueEdges = 3;\n" -" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" -" float4 uniqueEdgesA[3];\n" -" \n" -" uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" -" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" -" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" -" \n" -" \n" -" convexPolyhedronA.m_faceOffset = 0;\n" -" \n" -" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" -" \n" -" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" -" int indicesA[3+3+2+2+2];\n" -" int curUsedIndices=0;\n" -" int fidx=0;\n" -" \n" -" //front size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[0] = 0;\n" -" indicesA[1] = 1;\n" -" indicesA[2] = 2;\n" -" curUsedIndices+=3;\n" -" float c = face.m_plane.w;\n" -" facesA[fidx].m_plane.x = normal.x;\n" -" facesA[fidx].m_plane.y = normal.y;\n" -" facesA[fidx].m_plane.z = normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" //back size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[3]=2;\n" -" indicesA[4]=1;\n" -" indicesA[5]=0;\n" -" curUsedIndices+=3;\n" -" float c = dot(normal,verticesA[0]);\n" -" float c1 = -face.m_plane.w;\n" -" facesA[fidx].m_plane.x = -normal.x;\n" -" facesA[fidx].m_plane.y = -normal.y;\n" -" facesA[fidx].m_plane.z = -normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" \n" -" bool addEdgePlanes = true;\n" -" if (addEdgePlanes)\n" -" {\n" -" int numVertices=3;\n" -" int prevVertex = numVertices-1;\n" -" for (int i=0;i<numVertices;i++)\n" -" {\n" -" float4 v0 = 
verticesA[i];\n" -" float4 v1 = verticesA[prevVertex];\n" -" \n" -" float4 edgeNormal = normalize(cross(normal,v1-v0));\n" -" float c = -dot(edgeNormal,v0);\n" -" \n" -" facesA[fidx].m_numIndices = 2;\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[curUsedIndices++]=i;\n" -" indicesA[curUsedIndices++]=prevVertex;\n" -" \n" -" facesA[fidx].m_plane.x = edgeNormal.x;\n" -" facesA[fidx].m_plane.y = edgeNormal.y;\n" -" facesA[fidx].m_plane.z = edgeNormal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" fidx++;\n" -" prevVertex = i;\n" -" }\n" -" }\n" -" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" -" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" -" \n" -" \n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" \n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" -" \n" -" \n" -" \n" -" \n" -" ///////////////////\n" -" ///compound shape support\n" -" \n" -" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" int compoundChild = concavePairs[pairIdx].w;\n" -" int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" -" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" posB = newPosB;\n" -" ornB = newOrnB;\n" -" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" -" }\n" -" //////////////////\n" -" \n" -" float4 c0local = convexPolyhedronA.m_localCenter;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" 
const float4 DeltaC2 = c0 - c1;\n" -" \n" -" \n" -" {\n" -" bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" verticesA,uniqueEdgesA,facesA,indicesA,\n" -" vertices,uniqueEdges,faces,indices,\n" -" &sepAxis,&dmin);\n" -" \n" -" if (!sepEE)\n" -" {\n" -" hasSeparatingAxis = 0;\n" -" } else\n" -" {\n" -" hasSeparatingAxis = 1;\n" -" }\n" -" }\n" -" \n" -" \n" -" if (hasSeparatingAxis)\n" -" {\n" -" sepAxis.w = dmin;\n" -" dmins[i] = dmin;\n" -" concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" -" concaveHasSeparatingNormals[i]=1;\n" -" \n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" \n" -" findClippingFaces(sepAxis,\n" -" &convexPolyhedronA,\n" -" &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" posB,ornB,\n" -" worldVertsA1GPU,\n" -" worldNormalsAGPU,\n" -" worldVertsB1GPU,\n" -" vertexFaceCapacity,\n" -" minDist, maxDist,\n" -" verticesA,\n" -" facesA,\n" -" indicesA,\n" -" vertices,\n" -" faces,\n" -" indices,\n" -" clippingFacesOut, pairIdx);\n" -" \n" -" \n" -" } else\n" -" { \n" -" //mark this pair as in-active\n" -" concavePairs[pairIdx].w = -1;\n" -" }\n" -" }\n" -" else\n" -" { \n" -" //mark this pair as in-active\n" -" concavePairs[pairIdx].w = -1;\n" -" }\n" -" \n" -" concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points\n" -"}\n" -; +static const char* satConcaveKernelsCL = + "//keep this enum in sync with the CPU version (in btCollidable.h)\n" + "//written by Erwin Coumans\n" + "#define SHAPE_CONVEX_HULL 3\n" + "#define SHAPE_CONCAVE_TRIMESH 5\n" + "#define TRIANGLE_NUM_CONVEX_FACES 5\n" + "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" + "#define B3_MAX_STACK_DEPTH 256\n" + "typedef unsigned int u32;\n" + "///keep this in sync with btCollidable.h\n" + "typedef struct\n" + "{\n" + " union {\n" + " int m_numChildShapes;\n" + " int m_bvhIndex;\n" + " };\n" + " union\n" + " {\n" + " float m_radius;\n" + " 
int m_compoundBvhIndex;\n" + " };\n" + " \n" + " int m_shapeType;\n" + " int m_shapeIndex;\n" + " \n" + "} btCollidableGpu;\n" + "#define MAX_NUM_PARTS_IN_BITS 10\n" + "///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n" + "///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" + "typedef struct\n" + "{\n" + " //12 bytes\n" + " unsigned short int m_quantizedAabbMin[3];\n" + " unsigned short int m_quantizedAabbMax[3];\n" + " //4 bytes\n" + " int m_escapeIndexOrTriangleIndex;\n" + "} b3QuantizedBvhNode;\n" + "typedef struct\n" + "{\n" + " float4 m_aabbMin;\n" + " float4 m_aabbMax;\n" + " float4 m_quantization;\n" + " int m_numNodes;\n" + " int m_numSubTrees;\n" + " int m_nodeOffset;\n" + " int m_subTreeOffset;\n" + "} b3BvhInfo;\n" + "int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " unsigned int x=0;\n" + " unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" + " // Get only the lower bits where the triangle index is stored\n" + " return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" + "}\n" + "int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " unsigned int x=0;\n" + " unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" + " // Get only the lower bits where the triangle index is stored\n" + " return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" + "}\n" + "int isLeafNode(const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" + " return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" + "}\n" + "int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" + " return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 
1 : 0;\n" + "}\n" + " \n" + "int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " return -rootNode->m_escapeIndexOrTriangleIndex;\n" + "}\n" + "int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " return -rootNode->m_escapeIndexOrTriangleIndex;\n" + "}\n" + "typedef struct\n" + "{\n" + " //12 bytes\n" + " unsigned short int m_quantizedAabbMin[3];\n" + " unsigned short int m_quantizedAabbMax[3];\n" + " //4 bytes, points to the root of the subtree\n" + " int m_rootNodeIndex;\n" + " //4 bytes\n" + " int m_subtreeSize;\n" + " int m_padding[3];\n" + "} b3BvhSubtreeInfo;\n" + "typedef struct\n" + "{\n" + " float4 m_childPosition;\n" + " float4 m_childOrientation;\n" + " int m_shapeIndex;\n" + " int m_unused0;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "} btGpuChildShape;\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " float4 m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} BodyData;\n" + "typedef struct \n" + "{\n" + " float4 m_localCenter;\n" + " float4 m_extents;\n" + " float4 mC;\n" + " float4 mE;\n" + " \n" + " float m_radius;\n" + " int m_faceOffset;\n" + " int m_numFaces;\n" + " int m_numVertices;\n" + " int m_vertexOffset;\n" + " int m_uniqueEdgesOffset;\n" + " int m_numUniqueEdges;\n" + " int m_unused;\n" + "} ConvexPolyhedronCL;\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " };\n" + "} btAabbCL;\n" + "#ifndef B3_AABB_H\n" + "#define B3_AABB_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep 
B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_MAT3x3_H\n" + "#define B3_MAT3x3_H\n" + "#ifndef B3_QUAT_H\n" + "#define B3_QUAT_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + 
"#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Quat;\n" + " #define b3QuatConstArg const b3Quat\n" + " \n" + " \n" + "inline float4 b3FastNormalize4(float4 v)\n" + "{\n" + " v = (float4)(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + " \n" + "inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" + "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" + "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" + "{\n" + " b3Quat ans;\n" + " ans = b3Cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - b3Dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" + "{\n" + " b3Quat q;\n" + " q=in;\n" + " //return b3FastNormalize4(in);\n" + " float len = native_sqrt(dot(q, q));\n" + " if(len > 0.f)\n" + " {\n" + " q *= 1.f / len;\n" + " }\n" + " else\n" + " {\n" + " q.x = q.y = q.z = 0.f;\n" + " q.w = 1.f;\n" + " }\n" + " return q;\n" + "}\n" + "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " b3Quat qInv = b3QuatInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " return b3QuatRotate( b3QuatInvert( q ), vec );\n" + "}\n" + "inline b3Float4 b3TransformPoint(b3Float4ConstArg point, 
b3Float4ConstArg translation, b3QuatConstArg orientation)\n" + "{\n" + " return b3QuatRotate( orientation, point ) + (translation);\n" + "}\n" + " \n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "typedef struct\n" + "{\n" + " b3Float4 m_row[3];\n" + "}b3Mat3x3;\n" + "#define b3Mat3x3ConstArg const b3Mat3x3\n" + "#define b3GetRow(m,row) (m.m_row[row])\n" + "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" + "{\n" + " b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" + " b3Mat3x3 out;\n" + " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" + " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" + " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" + " out.m_row[0].w = 0.f;\n" + " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" + " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" + " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" + " out.m_row[1].w = 0.f;\n" + " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" + " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" + " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" + " out.m_row[2].w = 0.f;\n" + " return out;\n" + "}\n" + "inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = fabs(matIn.m_row[0]);\n" + " out.m_row[1] = fabs(matIn.m_row[1]);\n" + " out.m_row[2] = fabs(matIn.m_row[2]);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtZero();\n" + "__inline\n" + "b3Mat3x3 mtIdentity();\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Mat3x3 mtZero()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(0.f);\n" + " m.m_row[1] = (b3Float4)(0.f);\n" + " m.m_row[2] = (b3Float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtIdentity()\n" + "{\n" + " b3Mat3x3 m;\n" + " 
m.m_row[0] = (b3Float4)(1,0,0,0);\n" + " m.m_row[1] = (b3Float4)(0,1,0,0);\n" + " m.m_row[2] = (b3Float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" + "{\n" + " b3Mat3x3 transB;\n" + " transB = mtTranspose( b );\n" + " b3Mat3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" + "{\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a.m_row[0], b );\n" + " ans.y = b3Dot3F4( a.m_row[1], b );\n" + " ans.z = b3Dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" + "{\n" + " b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a, colx );\n" + " ans.y = b3Dot3F4( a, coly );\n" + " ans.z = b3Dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3Aabb b3Aabb_t;\n" + "struct b3Aabb\n" + "{\n" + " union\n" + " {\n" + " float m_min[4];\n" + " b3Float4 
m_minVec;\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float m_max[4];\n" + " b3Float4 m_maxVec;\n" + " int m_signedMaxIndices[4];\n" + " };\n" + "};\n" + "inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" + " b3Float4ConstArg pos,\n" + " b3QuatConstArg orn,\n" + " b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" + "{\n" + " b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" + " localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" + " b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" + " b3Mat3x3 m;\n" + " m = b3QuatGetRotationMatrix(orn);\n" + " b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" + " b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" + " \n" + " b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" + " b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" + " b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" + " 0.f);\n" + " *aabbMinOut = center-extent;\n" + " *aabbMaxOut = center+extent;\n" + "}\n" + "/// conservative test for overlap between two aabbs\n" + "inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" + " b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" + " overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" + " overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? 
false : overlap;\n" + " return overlap;\n" + "}\n" + "#endif //B3_AABB_H\n" + "/*\n" + "Bullet Continuous Collision Detection and Physics Library\n" + "Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose,\n" + "including commercial applications, and to alter it and redistribute it freely,\n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "#ifndef B3_INT2_H\n" + "#define B3_INT2_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#define b3UnsignedInt2 uint2\n" + "#define b3Int2 int2\n" + "#define b3MakeInt2 (int2)\n" + "#endif //__cplusplus\n" + "#endif\n" + "typedef struct\n" + "{\n" + " float4 m_plane;\n" + " int m_indexOffset;\n" + " int m_numIndices;\n" + "} btGpuFace;\n" + "#define make_float4 (float4)\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + " \n" + "// float4 a1 = make_float4(a.xyz,0.f);\n" + "// float4 b1 = make_float4(b.xyz,0.f);\n" + "// return cross(a1,b1);\n" + "//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n" + " \n" + " // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n" + " \n" + " //return c;\n" + "}\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 
b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " v = make_float4(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b);\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec);\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fastNormalize4(in);\n" + "// in /= length( in );\n" + "// return in;\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + "float4 qtInvRotate(const Quaternion q, float4 vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "__inline\n" + "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" + "{\n" + " return qtRotate( *orientation, *p ) + (*translation);\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "}\n" + "inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" + "const float4* dir, const float4* vertices, 
float* min, float* max)\n" + "{\n" + " min[0] = FLT_MAX;\n" + " max[0] = -FLT_MAX;\n" + " int numVerts = hull->m_numVertices;\n" + " const float4 localDir = qtInvRotate(orn,*dir);\n" + " float offset = dot(pos,*dir);\n" + " for(int i=0;i<numVerts;i++)\n" + " {\n" + " float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" + " if(dp < min[0]) \n" + " min[0] = dp;\n" + " if(dp > max[0]) \n" + " max[0] = dp;\n" + " }\n" + " if(min[0]>max[0])\n" + " {\n" + " float tmp = min[0];\n" + " min[0] = max[0];\n" + " max[0] = tmp;\n" + " }\n" + " min[0] += offset;\n" + " max[0] += offset;\n" + "}\n" + "inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" + "const float4* dir, __global const float4* vertices, float* min, float* max)\n" + "{\n" + " min[0] = FLT_MAX;\n" + " max[0] = -FLT_MAX;\n" + " int numVerts = hull->m_numVertices;\n" + " const float4 localDir = qtInvRotate(orn,*dir);\n" + " float offset = dot(pos,*dir);\n" + " for(int i=0;i<numVerts;i++)\n" + " {\n" + " float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" + " if(dp < min[0]) \n" + " min[0] = dp;\n" + " if(dp > max[0]) \n" + " max[0] = dp;\n" + " }\n" + " if(min[0]>max[0])\n" + " {\n" + " float tmp = min[0];\n" + " min[0] = max[0];\n" + " max[0] = tmp;\n" + " }\n" + " min[0] += offset;\n" + " max[0] += offset;\n" + "}\n" + "inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA,const float4 ornA,\n" + " const float4 posB,const float4 ornB,\n" + " float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n" + "{\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n" + " project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n" + " if(Max0<Min1 || Max1<Min0)\n" + " return false;\n" + " float d0 = Max0 - Min1;\n" + " float d1 = Max1 - Min0;\n" + " *depth = d0<d1 ? 
d0:d1;\n" + " return true;\n" + "}\n" + "inline bool IsAlmostZero(const float4 v)\n" + "{\n" + " if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n" + " return false;\n" + " return true;\n" + "}\n" + "bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " \n" + " const float4* verticesA, \n" + " const float4* uniqueEdgesA, \n" + " const btGpuFace* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB, \n" + " __global const float4* uniqueEdgesB, \n" + " __global const btGpuFace* facesB,\n" + " __global const int* indicesB,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " \n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " {\n" + " int numFacesA = hullA->m_numFaces;\n" + " // Test normals from hullA\n" + " for(int i=0;i<numFacesA;i++)\n" + " {\n" + " const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" + " float4 faceANormalWS = qtRotate(ornA,normal);\n" + " if (dot3F4(DeltaC2,faceANormalWS)<0)\n" + " faceANormalWS*=-1.f;\n" + " curPlaneTests++;\n" + " float d;\n" + " if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n" + " return false;\n" + " if(d<*dmin)\n" + " {\n" + " *dmin = d;\n" + " *sep = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " __global const float4* verticesA, \n" + " __global const float4* uniqueEdgesA, \n" + " __global const 
btGpuFace* facesA,\n" + " __global const int* indicesA,\n" + " const float4* verticesB,\n" + " const float4* uniqueEdgesB, \n" + " const btGpuFace* facesB,\n" + " const int* indicesB,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " {\n" + " int numFacesA = hullA->m_numFaces;\n" + " // Test normals from hullA\n" + " for(int i=0;i<numFacesA;i++)\n" + " {\n" + " const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" + " float4 faceANormalWS = qtRotate(ornA,normal);\n" + " if (dot3F4(DeltaC2,faceANormalWS)<0)\n" + " faceANormalWS *= -1.f;\n" + " curPlaneTests++;\n" + " float d;\n" + " if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n" + " return false;\n" + " if(d<*dmin)\n" + " {\n" + " *dmin = d;\n" + " *sep = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " const float4* verticesA, \n" + " const float4* uniqueEdgesA, \n" + " const btGpuFace* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB, \n" + " __global const float4* uniqueEdgesB, \n" + " __global const btGpuFace* facesB,\n" + " __global const int* indicesB,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " int curEdgeEdge = 0;\n" + " // Test edges\n" + " for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" + " {\n" + " const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n" + " float4 edge0World = 
qtRotate(ornA,edge0);\n" + " for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" + " {\n" + " const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n" + " float4 edge1World = qtRotate(ornB,edge1);\n" + " float4 crossje = cross3(edge0World,edge1World);\n" + " curEdgeEdge++;\n" + " if(!IsAlmostZero(crossje))\n" + " {\n" + " crossje = normalize3(crossje);\n" + " if (dot3F4(DeltaC2,crossje)<0)\n" + " crossje *= -1.f;\n" + " float dist;\n" + " bool result = true;\n" + " {\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n" + " project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n" + " \n" + " if(Max0<Min1 || Max1<Min0)\n" + " result = false;\n" + " \n" + " float d0 = Max0 - Min1;\n" + " float d1 = Max1 - Min0;\n" + " dist = d0<d1 ? d0:d1;\n" + " result = true;\n" + " }\n" + " \n" + " if(dist<*dmin)\n" + " {\n" + " *dmin = dist;\n" + " *sep = crossje;\n" + " }\n" + " }\n" + " }\n" + " }\n" + " \n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "inline int findClippingFaces(const float4 separatingNormal,\n" + " const ConvexPolyhedronCL* hullA, \n" + " __global const ConvexPolyhedronCL* hullB,\n" + " const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" + " __global float4* worldVertsA1,\n" + " __global float4* worldNormalsA1,\n" + " __global float4* worldVertsB1,\n" + " int capacityWorldVerts,\n" + " const float minDist, float maxDist,\n" + " const float4* verticesA,\n" + " const btGpuFace* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB,\n" + " __global const btGpuFace* facesB,\n" + " __global const int* indicesB,\n" + " __global int4* clippingFaces, int pairIndex)\n" + "{\n" + " int numContactsOut = 0;\n" + " int numWorldVertsB1= 0;\n" + " \n" + " \n" + " int closestFaceB=0;\n" + " float dmax = -FLT_MAX;\n" + " \n" + " {\n" + " for(int 
face=0;face<hullB->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n" + " facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" + " const float4 WorldNormal = qtRotate(ornB, Normal);\n" + " float d = dot3F4(WorldNormal,separatingNormal);\n" + " if (d > dmax)\n" + " {\n" + " dmax = d;\n" + " closestFaceB = face;\n" + " }\n" + " }\n" + " }\n" + " \n" + " {\n" + " const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" + " int numVertices = polyB.m_numIndices;\n" + " if (numVertices>capacityWorldVerts)\n" + " numVertices = capacityWorldVerts;\n" + " if (numVertices<0)\n" + " numVertices = 0;\n" + " \n" + " for(int e0=0;e0<numVertices;e0++)\n" + " {\n" + " if (e0<capacityWorldVerts)\n" + " {\n" + " const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" + " worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" + " }\n" + " }\n" + " }\n" + " \n" + " int closestFaceA=0;\n" + " {\n" + " float dmin = FLT_MAX;\n" + " for(int face=0;face<hullA->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(\n" + " facesA[hullA->m_faceOffset+face].m_plane.x,\n" + " facesA[hullA->m_faceOffset+face].m_plane.y,\n" + " facesA[hullA->m_faceOffset+face].m_plane.z,\n" + " 0.f);\n" + " const float4 faceANormalWS = qtRotate(ornA,Normal);\n" + " \n" + " float d = dot3F4(faceANormalWS,separatingNormal);\n" + " if (d < dmin)\n" + " {\n" + " dmin = d;\n" + " closestFaceA = face;\n" + " worldNormalsA1[pairIndex] = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " \n" + " int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" + " if (numVerticesA>capacityWorldVerts)\n" + " numVerticesA = capacityWorldVerts;\n" + " if (numVerticesA<0)\n" + " numVerticesA=0;\n" + " \n" + " for(int e0=0;e0<numVerticesA;e0++)\n" + " {\n" + " if (e0<capacityWorldVerts)\n" + " {\n" + " const float4 a 
= verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" + " worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" + " }\n" + " }\n" + " \n" + " clippingFaces[pairIndex].x = closestFaceA;\n" + " clippingFaces[pairIndex].y = closestFaceB;\n" + " clippingFaces[pairIndex].z = numVerticesA;\n" + " clippingFaces[pairIndex].w = numWorldVertsB1;\n" + " \n" + " \n" + " return numContactsOut;\n" + "}\n" + "// work-in-progress\n" + "__kernel void findConcaveSeparatingAxisVertexFaceKernel( __global int4* concavePairs,\n" + " __global const BodyData* rigidBodies,\n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes,\n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global const btGpuChildShape* gpuChildShapes,\n" + " __global btAabbCL* aabbs,\n" + " __global float4* concaveSeparatingNormalsOut,\n" + " __global int* concaveHasSeparatingNormals,\n" + " __global int4* clippingFacesOut,\n" + " __global float4* worldVertsA1GPU,\n" + " __global float4* worldNormalsAGPU,\n" + " __global float4* worldVertsB1GPU,\n" + " __global float* dmins,\n" + " int vertexFaceCapacity,\n" + " int numConcavePairs\n" + " )\n" + "{\n" + " \n" + " int i = get_global_id(0);\n" + " if (i>=numConcavePairs)\n" + " return;\n" + " \n" + " concaveHasSeparatingNormals[i] = 0;\n" + " \n" + " int pairIdx = i;\n" + " \n" + " int bodyIndexA = concavePairs[i].x;\n" + " int bodyIndexB = concavePairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " if 
(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" + " collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " concavePairs[pairIdx].w = -1;\n" + " return;\n" + " }\n" + " \n" + " \n" + " \n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " int numActualConcaveConvexTests = 0;\n" + " \n" + " int f = concavePairs[i].z;\n" + " \n" + " bool overlap = false;\n" + " \n" + " ConvexPolyhedronCL convexPolyhedronA;\n" + " \n" + " //add 3 vertices of the triangle\n" + " convexPolyhedronA.m_numVertices = 3;\n" + " convexPolyhedronA.m_vertexOffset = 0;\n" + " float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" + " \n" + " btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" + " float4 triMinAabb, triMaxAabb;\n" + " btAabbCL triAabb;\n" + " triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" + " triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" + " \n" + " float4 verticesA[3];\n" + " for (int i=0;i<3;i++)\n" + " {\n" + " int index = indices[face.m_indexOffset+i];\n" + " float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" + " verticesA[i] = vert;\n" + " localCenter += vert;\n" + " \n" + " triAabb.m_min = min(triAabb.m_min,vert);\n" + " triAabb.m_max = max(triAabb.m_max,vert);\n" + " \n" + " }\n" + " \n" + " overlap = true;\n" + " overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" + " overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" + " overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? 
false : overlap;\n" + " \n" + " if (overlap)\n" + " {\n" + " float dmin = FLT_MAX;\n" + " int hasSeparatingAxis=5;\n" + " float4 sepAxis=make_float4(1,2,3,4);\n" + " \n" + " int localCC=0;\n" + " numActualConcaveConvexTests++;\n" + " \n" + " //a triangle has 3 unique edges\n" + " convexPolyhedronA.m_numUniqueEdges = 3;\n" + " convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" + " float4 uniqueEdgesA[3];\n" + " \n" + " uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" + " uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" + " uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" + " \n" + " \n" + " convexPolyhedronA.m_faceOffset = 0;\n" + " \n" + " float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" + " \n" + " btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" + " int indicesA[3+3+2+2+2];\n" + " int curUsedIndices=0;\n" + " int fidx=0;\n" + " \n" + " //front size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[0] = 0;\n" + " indicesA[1] = 1;\n" + " indicesA[2] = 2;\n" + " curUsedIndices+=3;\n" + " float c = face.m_plane.w;\n" + " facesA[fidx].m_plane.x = normal.x;\n" + " facesA[fidx].m_plane.y = normal.y;\n" + " facesA[fidx].m_plane.z = normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " //back size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[3]=2;\n" + " indicesA[4]=1;\n" + " indicesA[5]=0;\n" + " curUsedIndices+=3;\n" + " float c = dot(normal,verticesA[0]);\n" + " float c1 = -face.m_plane.w;\n" + " facesA[fidx].m_plane.x = -normal.x;\n" + " facesA[fidx].m_plane.y = -normal.y;\n" + " facesA[fidx].m_plane.z = -normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " \n" + " bool addEdgePlanes = true;\n" + " if (addEdgePlanes)\n" + " {\n" + " int numVertices=3;\n" + " int prevVertex = numVertices-1;\n" + " for (int i=0;i<numVertices;i++)\n" + " {\n" + " 
float4 v0 = verticesA[i];\n" + " float4 v1 = verticesA[prevVertex];\n" + " \n" + " float4 edgeNormal = normalize(cross(normal,v1-v0));\n" + " float c = -dot(edgeNormal,v0);\n" + " \n" + " facesA[fidx].m_numIndices = 2;\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[curUsedIndices++]=i;\n" + " indicesA[curUsedIndices++]=prevVertex;\n" + " \n" + " facesA[fidx].m_plane.x = edgeNormal.x;\n" + " facesA[fidx].m_plane.y = edgeNormal.y;\n" + " facesA[fidx].m_plane.z = edgeNormal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " fidx++;\n" + " prevVertex = i;\n" + " }\n" + " }\n" + " convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" + " convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" + " \n" + " \n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " \n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " \n" + " \n" + " \n" + " \n" + " ///////////////////\n" + " ///compound shape support\n" + " \n" + " if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " int compoundChild = concavePairs[pairIdx].w;\n" + " int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" + " int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " posB = newPosB;\n" + " ornB = newOrnB;\n" + " shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" + " }\n" + " //////////////////\n" + " \n" + " float4 c0local = convexPolyhedronA.m_localCenter;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = 
convexShapes[shapeIndexB].m_localCenter;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " \n" + " \n" + " bool sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " verticesA,uniqueEdgesA,facesA,indicesA,\n" + " vertices,uniqueEdges,faces,indices,\n" + " &sepAxis,&dmin);\n" + " hasSeparatingAxis = 4;\n" + " if (!sepA)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else\n" + " {\n" + " bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA,\n" + " posB,ornB,\n" + " posA,ornA,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,indices,\n" + " verticesA,uniqueEdgesA,facesA,indicesA,\n" + " &sepAxis,&dmin);\n" + " \n" + " if (!sepB)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else\n" + " {\n" + " hasSeparatingAxis = 1;\n" + " }\n" + " } \n" + " \n" + " if (hasSeparatingAxis)\n" + " {\n" + " dmins[i] = dmin;\n" + " concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" + " concaveHasSeparatingNormals[i]=1;\n" + " \n" + " } else\n" + " { \n" + " //mark this pair as in-active\n" + " concavePairs[pairIdx].w = -1;\n" + " }\n" + " }\n" + " else\n" + " { \n" + " //mark this pair as in-active\n" + " concavePairs[pairIdx].w = -1;\n" + " }\n" + "}\n" + "// work-in-progress\n" + "__kernel void findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concavePairs,\n" + " __global const BodyData* rigidBodies,\n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes,\n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global const btGpuChildShape* gpuChildShapes,\n" + " __global btAabbCL* aabbs,\n" + " __global float4* concaveSeparatingNormalsOut,\n" + " __global int* concaveHasSeparatingNormals,\n" + " __global int4* clippingFacesOut,\n" + " __global float4* 
worldVertsA1GPU,\n" + " __global float4* worldNormalsAGPU,\n" + " __global float4* worldVertsB1GPU,\n" + " __global float* dmins,\n" + " int vertexFaceCapacity,\n" + " int numConcavePairs\n" + " )\n" + "{\n" + " \n" + " int i = get_global_id(0);\n" + " if (i>=numConcavePairs)\n" + " return;\n" + " \n" + " if (!concaveHasSeparatingNormals[i])\n" + " return;\n" + " \n" + " int pairIdx = i;\n" + " \n" + " int bodyIndexA = concavePairs[i].x;\n" + " int bodyIndexB = concavePairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " \n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " int numActualConcaveConvexTests = 0;\n" + " \n" + " int f = concavePairs[i].z;\n" + " \n" + " bool overlap = false;\n" + " \n" + " ConvexPolyhedronCL convexPolyhedronA;\n" + " \n" + " //add 3 vertices of the triangle\n" + " convexPolyhedronA.m_numVertices = 3;\n" + " convexPolyhedronA.m_vertexOffset = 0;\n" + " float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" + " \n" + " btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" + " float4 triMinAabb, triMaxAabb;\n" + " btAabbCL triAabb;\n" + " triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" + " triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" + " \n" + " float4 verticesA[3];\n" + " for (int i=0;i<3;i++)\n" + " {\n" + " int index = indices[face.m_indexOffset+i];\n" + " float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" + " verticesA[i] = vert;\n" + " localCenter += vert;\n" + " \n" + " triAabb.m_min = min(triAabb.m_min,vert);\n" + " triAabb.m_max = max(triAabb.m_max,vert);\n" + " \n" + " }\n" + " \n" + " overlap = true;\n" + " overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < 
aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" + " overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" + " overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n" + " \n" + " if (overlap)\n" + " {\n" + " float dmin = dmins[i];\n" + " int hasSeparatingAxis=5;\n" + " float4 sepAxis=make_float4(1,2,3,4);\n" + " sepAxis = concaveSeparatingNormalsOut[pairIdx];\n" + " \n" + " int localCC=0;\n" + " numActualConcaveConvexTests++;\n" + " \n" + " //a triangle has 3 unique edges\n" + " convexPolyhedronA.m_numUniqueEdges = 3;\n" + " convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" + " float4 uniqueEdgesA[3];\n" + " \n" + " uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" + " uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" + " uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" + " \n" + " \n" + " convexPolyhedronA.m_faceOffset = 0;\n" + " \n" + " float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" + " \n" + " btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" + " int indicesA[3+3+2+2+2];\n" + " int curUsedIndices=0;\n" + " int fidx=0;\n" + " \n" + " //front size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[0] = 0;\n" + " indicesA[1] = 1;\n" + " indicesA[2] = 2;\n" + " curUsedIndices+=3;\n" + " float c = face.m_plane.w;\n" + " facesA[fidx].m_plane.x = normal.x;\n" + " facesA[fidx].m_plane.y = normal.y;\n" + " facesA[fidx].m_plane.z = normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " //back size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[3]=2;\n" + " indicesA[4]=1;\n" + " indicesA[5]=0;\n" + " curUsedIndices+=3;\n" + " float c = dot(normal,verticesA[0]);\n" + " float c1 = -face.m_plane.w;\n" + " facesA[fidx].m_plane.x = -normal.x;\n" + " facesA[fidx].m_plane.y = 
-normal.y;\n" + " facesA[fidx].m_plane.z = -normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " \n" + " bool addEdgePlanes = true;\n" + " if (addEdgePlanes)\n" + " {\n" + " int numVertices=3;\n" + " int prevVertex = numVertices-1;\n" + " for (int i=0;i<numVertices;i++)\n" + " {\n" + " float4 v0 = verticesA[i];\n" + " float4 v1 = verticesA[prevVertex];\n" + " \n" + " float4 edgeNormal = normalize(cross(normal,v1-v0));\n" + " float c = -dot(edgeNormal,v0);\n" + " \n" + " facesA[fidx].m_numIndices = 2;\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[curUsedIndices++]=i;\n" + " indicesA[curUsedIndices++]=prevVertex;\n" + " \n" + " facesA[fidx].m_plane.x = edgeNormal.x;\n" + " facesA[fidx].m_plane.y = edgeNormal.y;\n" + " facesA[fidx].m_plane.z = edgeNormal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " fidx++;\n" + " prevVertex = i;\n" + " }\n" + " }\n" + " convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" + " convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" + " \n" + " \n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " \n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " \n" + " \n" + " \n" + " \n" + " ///////////////////\n" + " ///compound shape support\n" + " \n" + " if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " int compoundChild = concavePairs[pairIdx].w;\n" + " int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" + " int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 
newOrnB = qtMul(ornB,childOrnB);\n" + " posB = newPosB;\n" + " ornB = newOrnB;\n" + " shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" + " }\n" + " //////////////////\n" + " \n" + " float4 c0local = convexPolyhedronA.m_localCenter;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " \n" + " \n" + " {\n" + " bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " verticesA,uniqueEdgesA,facesA,indicesA,\n" + " vertices,uniqueEdges,faces,indices,\n" + " &sepAxis,&dmin);\n" + " \n" + " if (!sepEE)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else\n" + " {\n" + " hasSeparatingAxis = 1;\n" + " }\n" + " }\n" + " \n" + " \n" + " if (hasSeparatingAxis)\n" + " {\n" + " sepAxis.w = dmin;\n" + " dmins[i] = dmin;\n" + " concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" + " concaveHasSeparatingNormals[i]=1;\n" + " \n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " \n" + " findClippingFaces(sepAxis,\n" + " &convexPolyhedronA,\n" + " &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " worldVertsA1GPU,\n" + " worldNormalsAGPU,\n" + " worldVertsB1GPU,\n" + " vertexFaceCapacity,\n" + " minDist, maxDist,\n" + " verticesA,\n" + " facesA,\n" + " indicesA,\n" + " vertices,\n" + " faces,\n" + " indices,\n" + " clippingFacesOut, pairIdx);\n" + " \n" + " \n" + " } else\n" + " { \n" + " //mark this pair as in-active\n" + " concavePairs[pairIdx].w = -1;\n" + " }\n" + " }\n" + " else\n" + " { \n" + " //mark this pair as in-active\n" + " concavePairs[pairIdx].w = -1;\n" + " }\n" + " \n" + " concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h 
b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h index 6f8b0a90db..e627af2799 100644 --- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h @@ -1,2104 +1,2103 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* satKernelsCL= \ -"//keep this enum in sync with the CPU version (in btCollidable.h)\n" -"//written by Erwin Coumans\n" -"#define SHAPE_CONVEX_HULL 3\n" -"#define SHAPE_CONCAVE_TRIMESH 5\n" -"#define TRIANGLE_NUM_CONVEX_FACES 5\n" -"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" -"#define B3_MAX_STACK_DEPTH 256\n" -"typedef unsigned int u32;\n" -"///keep this in sync with btCollidable.h\n" -"typedef struct\n" -"{\n" -" union {\n" -" int m_numChildShapes;\n" -" int m_bvhIndex;\n" -" };\n" -" union\n" -" {\n" -" float m_radius;\n" -" int m_compoundBvhIndex;\n" -" };\n" -" \n" -" int m_shapeType;\n" -" int m_shapeIndex;\n" -" \n" -"} btCollidableGpu;\n" -"#define MAX_NUM_PARTS_IN_BITS 10\n" -"///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n" -"///Node can be used for leafnode or internal node. 
Leafnodes can point to 32-bit triangle index (non-negative range).\n" -"typedef struct\n" -"{\n" -" //12 bytes\n" -" unsigned short int m_quantizedAabbMin[3];\n" -" unsigned short int m_quantizedAabbMax[3];\n" -" //4 bytes\n" -" int m_escapeIndexOrTriangleIndex;\n" -"} b3QuantizedBvhNode;\n" -"typedef struct\n" -"{\n" -" float4 m_aabbMin;\n" -" float4 m_aabbMax;\n" -" float4 m_quantization;\n" -" int m_numNodes;\n" -" int m_numSubTrees;\n" -" int m_nodeOffset;\n" -" int m_subTreeOffset;\n" -"} b3BvhInfo;\n" -"int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" unsigned int x=0;\n" -" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" -" // Get only the lower bits where the triangle index is stored\n" -" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" -"}\n" -"int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" unsigned int x=0;\n" -" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" -" // Get only the lower bits where the triangle index is stored\n" -" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" -"}\n" -"int isLeafNode(const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" -" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" -"}\n" -"int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" -" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 
1 : 0;\n" -"}\n" -" \n" -"int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" return -rootNode->m_escapeIndexOrTriangleIndex;\n" -"}\n" -"int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" -"{\n" -" return -rootNode->m_escapeIndexOrTriangleIndex;\n" -"}\n" -"typedef struct\n" -"{\n" -" //12 bytes\n" -" unsigned short int m_quantizedAabbMin[3];\n" -" unsigned short int m_quantizedAabbMax[3];\n" -" //4 bytes, points to the root of the subtree\n" -" int m_rootNodeIndex;\n" -" //4 bytes\n" -" int m_subtreeSize;\n" -" int m_padding[3];\n" -"} b3BvhSubtreeInfo;\n" -"typedef struct\n" -"{\n" -" float4 m_childPosition;\n" -" float4 m_childOrientation;\n" -" int m_shapeIndex;\n" -" int m_unused0;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"} btGpuChildShape;\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" float4 m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} BodyData;\n" -"typedef struct \n" -"{\n" -" float4 m_localCenter;\n" -" float4 m_extents;\n" -" float4 mC;\n" -" float4 mE;\n" -" \n" -" float m_radius;\n" -" int m_faceOffset;\n" -" int m_numFaces;\n" -" int m_numVertices;\n" -" int m_vertexOffset;\n" -" int m_uniqueEdgesOffset;\n" -" int m_numUniqueEdges;\n" -" int m_unused;\n" -"} ConvexPolyhedronCL;\n" -"typedef struct \n" -"{\n" -" union\n" -" {\n" -" float4 m_min;\n" -" float m_minElems[4];\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float4 m_max;\n" -" float m_maxElems[4];\n" -" int m_maxIndices[4];\n" -" };\n" -"} btAabbCL;\n" -"#ifndef B3_AABB_H\n" -"#define B3_AABB_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define 
B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_MAT3x3_H\n" -"#define B3_MAT3x3_H\n" -"#ifndef B3_QUAT_H\n" -"#define B3_QUAT_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" 
-"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Quat;\n" -" #define b3QuatConstArg const b3Quat\n" -" \n" -" \n" -"inline float4 b3FastNormalize4(float4 v)\n" -"{\n" -" v = (float4)(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -" \n" -"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" -"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" -"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" -"{\n" -" b3Quat ans;\n" -" ans = b3Cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" -"{\n" -" b3Quat q;\n" -" q=in;\n" -" //return b3FastNormalize4(in);\n" -" float len = native_sqrt(dot(q, q));\n" -" if(len > 0.f)\n" -" {\n" -" q *= 1.f / len;\n" -" }\n" -" else\n" -" {\n" -" q.x = q.y = q.z = 0.f;\n" -" q.w = 1.f;\n" -" }\n" -" return q;\n" -"}\n" -"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" b3Quat qInv = b3QuatInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" return b3QuatRotate( b3QuatInvert( q ), vec );\n" -"}\n" -"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" -"{\n" -" return b3QuatRotate( orientation, point ) + (translation);\n" -"}\n" -" \n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"typedef struct\n" 
-"{\n" -" b3Float4 m_row[3];\n" -"}b3Mat3x3;\n" -"#define b3Mat3x3ConstArg const b3Mat3x3\n" -"#define b3GetRow(m,row) (m.m_row[row])\n" -"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" -"{\n" -" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" -" b3Mat3x3 out;\n" -" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" -" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" -" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" -" out.m_row[0].w = 0.f;\n" -" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" -" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" -" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" -" out.m_row[1].w = 0.f;\n" -" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" -" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" -" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" -" out.m_row[2].w = 0.f;\n" -" return out;\n" -"}\n" -"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = fabs(matIn.m_row[0]);\n" -" out.m_row[1] = fabs(matIn.m_row[1]);\n" -" out.m_row[2] = fabs(matIn.m_row[2]);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtZero();\n" -"__inline\n" -"b3Mat3x3 mtIdentity();\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Mat3x3 mtZero()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(0.f);\n" -" m.m_row[1] = (b3Float4)(0.f);\n" -" m.m_row[2] = (b3Float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtIdentity()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(1,0,0,0);\n" -" m.m_row[1] = (b3Float4)(0,1,0,0);\n" -" m.m_row[2] = (b3Float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" 
out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" -"{\n" -" b3Mat3x3 transB;\n" -" transB = mtTranspose( b );\n" -" b3Mat3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" -"{\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a.m_row[0], b );\n" -" ans.y = b3Dot3F4( a.m_row[1], b );\n" -" ans.z = b3Dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" -"{\n" -" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a, colx );\n" -" ans.y = b3Dot3F4( a, coly );\n" -" ans.z = b3Dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3Aabb b3Aabb_t;\n" -"struct b3Aabb\n" -"{\n" -" union\n" -" {\n" -" float m_min[4];\n" -" b3Float4 m_minVec;\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float m_max[4];\n" -" b3Float4 m_maxVec;\n" -" int m_signedMaxIndices[4];\n" -" };\n" -"};\n" -"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" -" b3Float4ConstArg pos,\n" -" b3QuatConstArg orn,\n" -" b3Float4* aabbMinOut,b3Float4* 
aabbMaxOut)\n" -"{\n" -" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" -" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" -" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" -" b3Mat3x3 m;\n" -" m = b3QuatGetRotationMatrix(orn);\n" -" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" -" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" -" \n" -" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" -" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" -" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" -" 0.f);\n" -" *aabbMinOut = center-extent;\n" -" *aabbMaxOut = center+extent;\n" -"}\n" -"/// conservative test for overlap between two aabbs\n" -"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" -" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" -" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" -" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n" -" return overlap;\n" -"}\n" -"#endif //B3_AABB_H\n" -"/*\n" -"Bullet Continuous Collision Detection and Physics Library\n" -"Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose,\n" -"including commercial applications, and to alter it and redistribute it freely,\n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"#ifndef B3_INT2_H\n" -"#define B3_INT2_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#define b3UnsignedInt2 uint2\n" -"#define b3Int2 int2\n" -"#define b3MakeInt2 (int2)\n" -"#endif //__cplusplus\n" -"#endif\n" -"typedef struct\n" -"{\n" -" float4 m_plane;\n" -" int m_indexOffset;\n" -" int m_numIndices;\n" -"} btGpuFace;\n" -"#define make_float4 (float4)\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -" \n" -"// float4 a1 = make_float4(a.xyz,0.f);\n" -"// float4 b1 = make_float4(b.xyz,0.f);\n" -"// return cross(a1,b1);\n" -"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n" -" \n" -" // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n" -" \n" -" //return c;\n" -"}\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" v = make_float4(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec);\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - 
(a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in)\n" -"{\n" -" return fastNormalize4(in);\n" -"// in /= length( in );\n" -"// return in;\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -"float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"__inline\n" -"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" -"{\n" -" return qtRotate( *orientation, *p ) + (*translation);\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"}\n" -"inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" -"const float4* dir, const float4* vertices, float* min, float* max)\n" -"{\n" -" min[0] = FLT_MAX;\n" -" max[0] = -FLT_MAX;\n" -" int numVerts = hull->m_numVertices;\n" -" const float4 localDir = qtInvRotate(orn,*dir);\n" -" float offset = dot(pos,*dir);\n" -" for(int i=0;i<numVerts;i++)\n" -" {\n" -" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" -" if(dp < min[0]) \n" -" min[0] = dp;\n" -" if(dp > max[0]) \n" -" max[0] = dp;\n" -" }\n" -" if(min[0]>max[0])\n" -" {\n" -" float tmp = min[0];\n" -" min[0] = max[0];\n" -" max[0] = tmp;\n" -" }\n" -" min[0] += offset;\n" -" max[0] += offset;\n" -"}\n" -"inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" -"const float4* dir, __global const float4* vertices, float* min, float* max)\n" -"{\n" -" min[0] = FLT_MAX;\n" -" max[0] = -FLT_MAX;\n" 
-" int numVerts = hull->m_numVertices;\n" -" const float4 localDir = qtInvRotate(orn,*dir);\n" -" float offset = dot(pos,*dir);\n" -" for(int i=0;i<numVerts;i++)\n" -" {\n" -" float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" -" if(dp < min[0]) \n" -" min[0] = dp;\n" -" if(dp > max[0]) \n" -" max[0] = dp;\n" -" }\n" -" if(min[0]>max[0])\n" -" {\n" -" float tmp = min[0];\n" -" min[0] = max[0];\n" -" max[0] = tmp;\n" -" }\n" -" min[0] += offset;\n" -" max[0] += offset;\n" -"}\n" -"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA,const float4 ornA,\n" -" const float4 posB,const float4 ornB,\n" -" float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n" -"{\n" -" float Min0,Max0;\n" -" float Min1,Max1;\n" -" projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n" -" project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n" -" if(Max0<Min1 || Max1<Min0)\n" -" return false;\n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" *depth = d0<d1 ? 
d0:d1;\n" -" return true;\n" -"}\n" -"inline bool IsAlmostZero(const float4 v)\n" -"{\n" -" if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n" -" return false;\n" -" return true;\n" -"}\n" -"bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" \n" -" const float4* verticesA, \n" -" const float4* uniqueEdgesA, \n" -" const btGpuFace* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB, \n" -" __global const float4* uniqueEdgesB, \n" -" __global const btGpuFace* facesB,\n" -" __global const int* indicesB,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" \n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" {\n" -" int numFacesA = hullA->m_numFaces;\n" -" // Test normals from hullA\n" -" for(int i=0;i<numFacesA;i++)\n" -" {\n" -" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" -" float4 faceANormalWS = qtRotate(ornA,normal);\n" -" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" -" faceANormalWS*=-1.f;\n" -" curPlaneTests++;\n" -" float d;\n" -" if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n" -" return false;\n" -" if(d<*dmin)\n" -" {\n" -" *dmin = d;\n" -" *sep = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" __global const float4* verticesA, \n" -" __global const float4* uniqueEdgesA, \n" -" __global const btGpuFace* facesA,\n" -" __global const int* indicesA,\n" -" const 
float4* verticesB,\n" -" const float4* uniqueEdgesB, \n" -" const btGpuFace* facesB,\n" -" const int* indicesB,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" {\n" -" int numFacesA = hullA->m_numFaces;\n" -" // Test normals from hullA\n" -" for(int i=0;i<numFacesA;i++)\n" -" {\n" -" const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" -" float4 faceANormalWS = qtRotate(ornA,normal);\n" -" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" -" faceANormalWS *= -1.f;\n" -" curPlaneTests++;\n" -" float d;\n" -" if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n" -" return false;\n" -" if(d<*dmin)\n" -" {\n" -" *dmin = d;\n" -" *sep = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" const float4* verticesA, \n" -" const float4* uniqueEdgesA, \n" -" const btGpuFace* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB, \n" -" __global const float4* uniqueEdgesB, \n" -" __global const btGpuFace* facesB,\n" -" __global const int* indicesB,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" int curEdgeEdge = 0;\n" -" // Test edges\n" -" for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" -" {\n" -" const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n" -" float4 edge0World = qtRotate(ornA,edge0);\n" -" for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" -" {\n" -" const float4 edge1 = 
uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n" -" float4 edge1World = qtRotate(ornB,edge1);\n" -" float4 crossje = cross3(edge0World,edge1World);\n" -" curEdgeEdge++;\n" -" if(!IsAlmostZero(crossje))\n" -" {\n" -" crossje = normalize3(crossje);\n" -" if (dot3F4(DeltaC2,crossje)<0)\n" -" crossje *= -1.f;\n" -" float dist;\n" -" bool result = true;\n" -" {\n" -" float Min0,Max0;\n" -" float Min1,Max1;\n" -" projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n" -" project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n" -" \n" -" if(Max0<Min1 || Max1<Min0)\n" -" result = false;\n" -" \n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" dist = d0<d1 ? d0:d1;\n" -" result = true;\n" -" }\n" -" \n" -" if(dist<*dmin)\n" -" {\n" -" *dmin = dist;\n" -" *sep = crossje;\n" -" }\n" -" }\n" -" }\n" -" }\n" -" \n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA,const float4 ornA,\n" -" const float4 posB,const float4 ornB,\n" -" float4* sep_axis, __global const float4* vertices,float* depth)\n" -"{\n" -" float Min0,Max0;\n" -" float Min1,Max1;\n" -" project(hullA,posA,ornA,sep_axis,vertices, &Min0, &Max0);\n" -" project(hullB,posB,ornB, sep_axis,vertices, &Min1, &Max1);\n" -" if(Max0<Min1 || Max1<Min0)\n" -" return false;\n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" *depth = d0<d1 ? 
d0:d1;\n" -" return true;\n" -"}\n" -"bool findSeparatingAxis( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" __global const float4* vertices, \n" -" __global const float4* uniqueEdges, \n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" \n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" \n" -" int curPlaneTests=0;\n" -" {\n" -" int numFacesA = hullA->m_numFaces;\n" -" // Test normals from hullA\n" -" for(int i=0;i<numFacesA;i++)\n" -" {\n" -" const float4 normal = faces[hullA->m_faceOffset+i].m_plane;\n" -" float4 faceANormalWS = qtRotate(ornA,normal);\n" -" \n" -" if (dot3F4(DeltaC2,faceANormalWS)<0)\n" -" faceANormalWS*=-1.f;\n" -" \n" -" curPlaneTests++;\n" -" \n" -" float d;\n" -" if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))\n" -" return false;\n" -" \n" -" if(d<*dmin)\n" -" {\n" -" *dmin = d;\n" -" *sep = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" \n" -" return true;\n" -"}\n" -"bool findSeparatingAxisUnitSphere( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" __global const float4* vertices,\n" -" __global const float4* unitSphereDirections,\n" -" int numUnitSphereDirections,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" \n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" int curEdgeEdge = 0;\n" -" // Test unit sphere directions\n" -" for (int i=0;i<numUnitSphereDirections;i++)\n" -" {\n" -" float4 
crossje;\n" -" crossje = unitSphereDirections[i]; \n" -" if (dot3F4(DeltaC2,crossje)>0)\n" -" crossje *= -1.f;\n" -" {\n" -" float dist;\n" -" bool result = true;\n" -" float Min0,Max0;\n" -" float Min1,Max1;\n" -" project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" -" project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" -" \n" -" if(Max0<Min1 || Max1<Min0)\n" -" return false;\n" -" \n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" dist = d0<d1 ? d0:d1;\n" -" result = true;\n" -" \n" -" if(dist<*dmin)\n" -" {\n" -" *dmin = dist;\n" -" *sep = crossje;\n" -" }\n" -" }\n" -" }\n" -" \n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"bool findSeparatingAxisEdgeEdge( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" -" const float4 posA1,\n" -" const float4 ornA,\n" -" const float4 posB1,\n" -" const float4 ornB,\n" -" const float4 DeltaC2,\n" -" __global const float4* vertices, \n" -" __global const float4* uniqueEdges, \n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" float4* sep,\n" -" float* dmin)\n" -"{\n" -" \n" -" float4 posA = posA1;\n" -" posA.w = 0.f;\n" -" float4 posB = posB1;\n" -" posB.w = 0.f;\n" -" int curPlaneTests=0;\n" -" int curEdgeEdge = 0;\n" -" // Test edges\n" -" for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" -" {\n" -" const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];\n" -" float4 edge0World = qtRotate(ornA,edge0);\n" -" for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" -" {\n" -" const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];\n" -" float4 edge1World = qtRotate(ornB,edge1);\n" -" float4 crossje = cross3(edge0World,edge1World);\n" -" curEdgeEdge++;\n" -" if(!IsAlmostZero(crossje))\n" -" {\n" -" crossje = normalize3(crossje);\n" -" if (dot3F4(DeltaC2,crossje)<0)\n" -" crossje*=-1.f;\n" -" \n" -" float dist;\n" -" bool result = true;\n" -" {\n" -" float 
Min0,Max0;\n" -" float Min1,Max1;\n" -" project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" -" project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" -" \n" -" if(Max0<Min1 || Max1<Min0)\n" -" return false;\n" -" \n" -" float d0 = Max0 - Min1;\n" -" float d1 = Max1 - Min0;\n" -" dist = d0<d1 ? d0:d1;\n" -" result = true;\n" -" }\n" -" \n" -" if(dist<*dmin)\n" -" {\n" -" *dmin = dist;\n" -" *sep = crossje;\n" -" }\n" -" }\n" -" }\n" -" }\n" -" \n" -" if((dot3F4(-DeltaC2,*sep))>0.0f)\n" -" {\n" -" *sep = -(*sep);\n" -" }\n" -" return true;\n" -"}\n" -"// work-in-progress\n" -"__kernel void processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global btAabbCL* aabbs,\n" -" __global const btGpuChildShape* gpuChildShapes,\n" -" __global volatile float4* gpuCompoundSepNormalsOut,\n" -" __global volatile int* gpuHasCompoundSepNormalsOut,\n" -" int numCompoundPairs\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i<numCompoundPairs)\n" -" {\n" -" int bodyIndexA = gpuCompoundPairs[i].x;\n" -" int bodyIndexB = gpuCompoundPairs[i].y;\n" -" int childShapeIndexA = gpuCompoundPairs[i].z;\n" -" int childShapeIndexB = gpuCompoundPairs[i].w;\n" -" \n" -" int collidableIndexA = -1;\n" -" int collidableIndexB = -1;\n" -" \n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" \n" -" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" \n" -" if (childShapeIndexA >= 0)\n" -" {\n" -" collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" -" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" -" float4 
childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" -" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" -" float4 newOrnA = qtMul(ornA,childOrnA);\n" -" posA = newPosA;\n" -" ornA = newOrnA;\n" -" } else\n" -" {\n" -" collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" }\n" -" \n" -" if (childShapeIndexB>=0)\n" -" {\n" -" collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" posB = newPosB;\n" -" ornB = newOrnB;\n" -" } else\n" -" {\n" -" collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" -" }\n" -" \n" -" gpuHasCompoundSepNormalsOut[i] = 0;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n" -" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" -" \n" -" if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))\n" -" {\n" -" return;\n" -" }\n" -" int hasSeparatingAxis = 5;\n" -" \n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" float dmin = FLT_MAX;\n" -" posA.w = 0.f;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" float4 sepNormal = make_float4(1,0,0,0);\n" -" bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" -" hasSeparatingAxis = 4;\n" -" if (!sepA)\n" -" {\n" -" 
hasSeparatingAxis = 0;\n" -" } else\n" -" {\n" -" bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" -" if (!sepB)\n" -" {\n" -" hasSeparatingAxis = 0;\n" -" } else//(!sepB)\n" -" {\n" -" bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" -" if (sepEE)\n" -" {\n" -" gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);\n" -" gpuHasCompoundSepNormalsOut[i] = 1;\n" -" }//sepEE\n" -" }//(!sepB)\n" -" }//(!sepA)\n" -" \n" -" \n" -" }\n" -" \n" -"}\n" -"inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" -"{\n" -" b3Float4 vecOut;\n" -" vecOut = b3MakeFloat4(\n" -" (float)(vecIn[0]) / (quantization.x),\n" -" (float)(vecIn[1]) / (quantization.y),\n" -" (float)(vecIn[2]) / (quantization.z),\n" -" 0.f);\n" -" vecOut += bvhAabbMin;\n" -" return vecOut;\n" -"}\n" -"inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" -"{\n" -" b3Float4 vecOut;\n" -" vecOut = b3MakeFloat4(\n" -" (float)(vecIn[0]) / (quantization.x),\n" -" (float)(vecIn[1]) / (quantization.y),\n" -" (float)(vecIn[2]) / (quantization.z),\n" -" 0.f);\n" -" vecOut += bvhAabbMin;\n" -" return vecOut;\n" -"}\n" -"// work-in-progress\n" -"__kernel void findCompoundPairsKernel( __global const int4* pairs, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global b3Aabb_t* aabbLocalSpace,\n" -" __global const btGpuChildShape* gpuChildShapes,\n" -" __global volatile int4* 
gpuCompoundPairsOut,\n" -" __global volatile int* numCompoundPairsOut,\n" -" __global const b3BvhSubtreeInfo* subtrees,\n" -" __global const b3QuantizedBvhNode* quantizedNodes,\n" -" __global const b3BvhInfo* bvhInfos,\n" -" int numPairs,\n" -" int maxNumCompoundPairsCapacity\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i<numPairs)\n" -" {\n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" //once the broadphase avoids static-static pairs, we can remove this test\n" -" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" -" {\n" -" return;\n" -" }\n" -" if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" -" {\n" -" int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;\n" -" int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;\n" -" int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;\n" -" int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;\n" -" int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;\n" -" int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;\n" -" \n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" b3Quat ornA = rigidBodies[bodyIndexA].m_quat;\n" -" b3Quat ornB = rigidBodies[bodyIndexB].m_quat;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" \n" -" for (int p=0;p<numSubTreesA;p++)\n" -" {\n" -" b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];\n" -" //bvhInfos[bvhA].m_quantization\n" -" b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" -" b3Float4 treeAmaxLocal = 
MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" -" b3Float4 aabbAMinOut,aabbAMaxOut;\n" -" float margin=0.f;\n" -" b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n" -" \n" -" for (int q=0;q<numSubTreesB;q++)\n" -" {\n" -" b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];\n" -" b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" -" b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" -" b3Float4 aabbBMinOut,aabbBMaxOut;\n" -" float margin=0.f;\n" -" b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n" -" \n" -" \n" -" bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n" -" if (aabbOverlap)\n" -" {\n" -" \n" -" int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;\n" -" int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;\n" -" int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;\n" -" int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;\n" -" b3Int2 nodeStack[B3_MAX_STACK_DEPTH];\n" -" b3Int2 node0;\n" -" node0.x = startNodeIndexA;\n" -" node0.y = startNodeIndexB;\n" -" int maxStackDepth = B3_MAX_STACK_DEPTH;\n" -" int depth=0;\n" -" nodeStack[depth++]=node0;\n" -" do\n" -" {\n" -" b3Int2 node = nodeStack[--depth];\n" -" b3Float4 aMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" -" b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" -" b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" -" b3Float4 bMaxLocal = 
MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" -" float margin=0.f;\n" -" b3Float4 aabbAMinOut,aabbAMaxOut;\n" -" b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n" -" b3Float4 aabbBMinOut,aabbBMaxOut;\n" -" b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n" -" \n" -" bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n" -" if (nodeOverlap)\n" -" {\n" -" bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);\n" -" bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);\n" -" bool isInternalA = !isLeafA;\n" -" bool isInternalB = !isLeafB;\n" -" //fail, even though it might hit two leaf nodes\n" -" if (depth+4>maxStackDepth && !(isLeafA && isLeafB))\n" -" {\n" -" //printf(\"Error: traversal exceeded maxStackDepth\");\n" -" continue;\n" -" }\n" -" if(isInternalA)\n" -" {\n" -" int nodeAleftChild = node.x+1;\n" -" bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);\n" -" int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);\n" -" if(isInternalB)\n" -" { \n" -" int nodeBleftChild = node.y+1;\n" -" bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n" -" int nodeBrightChild = isNodeBLeftChildLeaf? 
node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n" -" nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);\n" -" nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);\n" -" nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);\n" -" nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);\n" -" }\n" -" else\n" -" {\n" -" nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);\n" -" nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);\n" -" }\n" -" }\n" -" else\n" -" {\n" -" if(isInternalB)\n" -" {\n" -" int nodeBleftChild = node.y+1;\n" -" bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n" -" int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n" -" nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);\n" -" nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);\n" -" }\n" -" else\n" -" {\n" -" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" -" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" -" {\n" -" int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);\n" -" int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);\n" -" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n" -" }\n" -" }\n" -" }\n" -" }\n" -" } while (depth);\n" -" }\n" -" }\n" -" }\n" -" \n" -" return;\n" -" }\n" -" if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" -" {\n" -" if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) \n" -" {\n" -" int numChildrenA = collidables[collidableIndexA].m_numChildShapes;\n" -" for (int c=0;c<numChildrenA;c++)\n" -" {\n" -" int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;\n" -" int childColIndexA = 
gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" -" float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" -" float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" -" float4 newOrnA = qtMul(ornA,childOrnA);\n" -" int shapeIndexA = collidables[childColIndexA].m_shapeIndex;\n" -" b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];\n" -" float margin = 0.f;\n" -" \n" -" b3Float4 aabbAMinWS;\n" -" b3Float4 aabbAMaxWS;\n" -" \n" -" b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,\n" -" newPosA,\n" -" newOrnA,\n" -" &aabbAMinWS,&aabbAMaxWS);\n" -" \n" -" \n" -" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" -" for (int b=0;b<numChildrenB;b++)\n" -" {\n" -" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" -" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" -" b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];\n" -" \n" -" b3Float4 aabbBMinWS;\n" -" b3Float4 aabbBMaxWS;\n" -" \n" -" b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,\n" -" newPosB,\n" -" newOrnB,\n" -" &aabbBMinWS,&aabbBMaxWS);\n" -" \n" -" \n" -" \n" -" bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);\n" -" if (aabbOverlap)\n" -" {\n" -" int numFacesA = 
convexShapes[shapeIndexA].m_numFaces;\n" -" float dmin = FLT_MAX;\n" -" float4 posA = newPosA;\n" -" posA.w = 0.f;\n" -" float4 posB = newPosB;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 ornA = newOrnA;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 ornB =newOrnB;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" {//\n" -" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" -" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" -" {\n" -" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n" -" }\n" -" }//\n" -" }//fi(1)\n" -" } //for (int b=0\n" -" }//if (collidables[collidableIndexB].\n" -" else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" if (1)\n" -" {\n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" float dmin = FLT_MAX;\n" -" float4 posA = newPosA;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 ornA = newOrnA;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" {\n" -" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" -" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" -" {\n" -" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);\n" -" }//if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" -" }//\n" -" }//fi (1)\n" -" }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" }//for (int b=0;b<numChildrenB;b++) \n" -" return;\n" -" }//if 
(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) \n" -" && (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" -" {\n" -" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" -" for (int b=0;b<numChildrenB;b++)\n" -" {\n" -" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" -" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = qtRotate(ornB,childPosB)+posB;\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" -" //////////////////////////////////////\n" -" if (1)\n" -" {\n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" float dmin = FLT_MAX;\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = newPosB;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 ornB =newOrnB;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" {//\n" -" int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" -" if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" -" {\n" -" gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);\n" -" }//fi (compoundPairIdx<maxNumCompoundPairsCapacity)\n" -" }//\n" -" }//fi (1) \n" -" }//for (int b=0;b<numChildrenB;b++)\n" -" return;\n" -" }//if 
(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" return;\n" -" }//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" -" }//i<numPairs\n" -"}\n" -"// work-in-progress\n" -"__kernel void findSeparatingAxisKernel( __global const int4* pairs, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global btAabbCL* aabbs,\n" -" __global volatile float4* separatingNormals,\n" -" __global volatile int* hasSeparatingAxis,\n" -" int numPairs\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" \n" -" if (i<numPairs)\n" -" {\n" -" \n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" \n" -" //once the broadphase avoids static-static pairs, we can remove this test\n" -" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" return;\n" -" }\n" -" \n" -" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" return;\n" -" }\n" -" \n" -" if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" return;\n" -" }\n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" float dmin = FLT_MAX;\n" -" float4 posA = 
rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" float4 sepNormal;\n" -" \n" -" bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,\n" -" indices,&sepNormal,&dmin);\n" -" hasSeparatingAxis[i] = 4;\n" -" if (!sepA)\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" } else\n" -" {\n" -" bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n" -" posA,ornA,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,\n" -" indices,&sepNormal,&dmin);\n" -" if (!sepB)\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" } else\n" -" {\n" -" bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,\n" -" indices,&sepNormal,&dmin);\n" -" if (!sepEE)\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" } else\n" -" {\n" -" hasSeparatingAxis[i] = 1;\n" -" separatingNormals[i] = sepNormal;\n" -" }\n" -" }\n" -" }\n" -" \n" -" }\n" -"}\n" -"__kernel void findSeparatingAxisVertexFaceKernel( __global const int4* pairs, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global btAabbCL* aabbs,\n" -" __global volatile float4* 
separatingNormals,\n" -" __global volatile int* hasSeparatingAxis,\n" -" __global float* dmins,\n" -" int numPairs\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" \n" -" if (i<numPairs)\n" -" {\n" -" \n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" hasSeparatingAxis[i] = 0; \n" -" \n" -" //once the broadphase avoids static-static pairs, we can remove this test\n" -" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" -" {\n" -" return;\n" -" }\n" -" \n" -" if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" -" {\n" -" return;\n" -" }\n" -" \n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" float dmin = FLT_MAX;\n" -" dmins[i] = dmin;\n" -" \n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" float4 sepNormal;\n" -" \n" -" bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,\n" -" indices,&sepNormal,&dmin);\n" -" hasSeparatingAxis[i] = 4;\n" -" if (!sepA)\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" } else\n" -" {\n" -" bool sepB = 
findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n" -" posA,ornA,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,\n" -" indices,&sepNormal,&dmin);\n" -" if (sepB)\n" -" {\n" -" dmins[i] = dmin;\n" -" hasSeparatingAxis[i] = 1;\n" -" separatingNormals[i] = sepNormal;\n" -" }\n" -" }\n" -" \n" -" }\n" -"}\n" -"__kernel void findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, \n" -" __global const BodyData* rigidBodies, \n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global btAabbCL* aabbs,\n" -" __global float4* separatingNormals,\n" -" __global int* hasSeparatingAxis,\n" -" __global float* dmins,\n" -" __global const float4* unitSphereDirections,\n" -" int numUnitSphereDirections,\n" -" int numPairs\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" \n" -" if (i<numPairs)\n" -" {\n" -" if (hasSeparatingAxis[i])\n" -" {\n" -" \n" -" int bodyIndexA = pairs[i].x;\n" -" int bodyIndexB = pairs[i].y;\n" -" \n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" \n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" \n" -" \n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" \n" -" float dmin = dmins[i];\n" -" \n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 ornB 
=rigidBodies[bodyIndexB].m_quat;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" float4 sepNormal = separatingNormals[i];\n" -" \n" -" \n" -" \n" -" bool sepEE = false;\n" -" int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n" -" if (numEdgeEdgeDirections<=numUnitSphereDirections)\n" -" {\n" -" sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,\n" -" indices,&sepNormal,&dmin);\n" -" \n" -" if (!sepEE)\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" } else\n" -" {\n" -" hasSeparatingAxis[i] = 1;\n" -" separatingNormals[i] = sepNormal;\n" -" }\n" -" }\n" -" /*\n" -" ///else case is a separate kernel, to make Mac OSX OpenCL compiler happy\n" -" else\n" -" {\n" -" sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" vertices,unitSphereDirections,numUnitSphereDirections,\n" -" &sepNormal,&dmin);\n" -" if (!sepEE)\n" -" {\n" -" hasSeparatingAxis[i] = 0;\n" -" } else\n" -" {\n" -" hasSeparatingAxis[i] = 1;\n" -" separatingNormals[i] = sepNormal;\n" -" }\n" -" }\n" -" */\n" -" } //if (hasSeparatingAxis[i])\n" -" }//(i<numPairs)\n" -"}\n" -"inline int findClippingFaces(const float4 separatingNormal,\n" -" const ConvexPolyhedronCL* hullA, \n" -" __global const ConvexPolyhedronCL* hullB,\n" -" const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n" -" __global float4* worldVertsA1,\n" -" __global float4* worldNormalsA1,\n" -" __global float4* worldVertsB1,\n" -" int capacityWorldVerts,\n" -" const float minDist, float maxDist,\n" -" const float4* verticesA,\n" -" const btGpuFace* facesA,\n" -" const int* indicesA,\n" -" __global const float4* verticesB,\n" -" __global const btGpuFace* facesB,\n" -" __global const int* indicesB,\n" -" 
__global int4* clippingFaces, int pairIndex)\n" -"{\n" -" int numContactsOut = 0;\n" -" int numWorldVertsB1= 0;\n" -" \n" -" \n" -" int closestFaceB=0;\n" -" float dmax = -FLT_MAX;\n" -" \n" -" {\n" -" for(int face=0;face<hullB->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n" -" facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" -" const float4 WorldNormal = qtRotate(ornB, Normal);\n" -" float d = dot3F4(WorldNormal,separatingNormal);\n" -" if (d > dmax)\n" -" {\n" -" dmax = d;\n" -" closestFaceB = face;\n" -" }\n" -" }\n" -" }\n" -" \n" -" {\n" -" const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" -" int numVertices = polyB.m_numIndices;\n" -" if (numVertices>capacityWorldVerts)\n" -" numVertices = capacityWorldVerts;\n" -" \n" -" for(int e0=0;e0<numVertices;e0++)\n" -" {\n" -" if (e0<capacityWorldVerts)\n" -" {\n" -" const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" -" worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" -" }\n" -" }\n" -" }\n" -" \n" -" int closestFaceA=0;\n" -" {\n" -" float dmin = FLT_MAX;\n" -" for(int face=0;face<hullA->m_numFaces;face++)\n" -" {\n" -" const float4 Normal = make_float4(\n" -" facesA[hullA->m_faceOffset+face].m_plane.x,\n" -" facesA[hullA->m_faceOffset+face].m_plane.y,\n" -" facesA[hullA->m_faceOffset+face].m_plane.z,\n" -" 0.f);\n" -" const float4 faceANormalWS = qtRotate(ornA,Normal);\n" -" \n" -" float d = dot3F4(faceANormalWS,separatingNormal);\n" -" if (d < dmin)\n" -" {\n" -" dmin = d;\n" -" closestFaceA = face;\n" -" worldNormalsA1[pairIndex] = faceANormalWS;\n" -" }\n" -" }\n" -" }\n" -" \n" -" int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" -" if (numVerticesA>capacityWorldVerts)\n" -" numVerticesA = capacityWorldVerts;\n" -" \n" -" for(int e0=0;e0<numVerticesA;e0++)\n" -" {\n" -" if 
(e0<capacityWorldVerts)\n" -" {\n" -" const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" -" worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" -" }\n" -" }\n" -" \n" -" clippingFaces[pairIndex].x = closestFaceA;\n" -" clippingFaces[pairIndex].y = closestFaceB;\n" -" clippingFaces[pairIndex].z = numVerticesA;\n" -" clippingFaces[pairIndex].w = numWorldVertsB1;\n" -" \n" -" \n" -" return numContactsOut;\n" -"}\n" -"// work-in-progress\n" -"__kernel void findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n" -" __global const BodyData* rigidBodies,\n" -" __global const btCollidableGpu* collidables,\n" -" __global const ConvexPolyhedronCL* convexShapes, \n" -" __global const float4* vertices,\n" -" __global const float4* uniqueEdges,\n" -" __global const btGpuFace* faces,\n" -" __global const int* indices,\n" -" __global const btGpuChildShape* gpuChildShapes,\n" -" __global btAabbCL* aabbs,\n" -" __global float4* concaveSeparatingNormalsOut,\n" -" __global int* concaveHasSeparatingNormals,\n" -" __global int4* clippingFacesOut,\n" -" __global float4* worldVertsA1GPU,\n" -" __global float4* worldNormalsAGPU,\n" -" __global float4* worldVertsB1GPU,\n" -" int vertexFaceCapacity,\n" -" int numConcavePairs\n" -" )\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numConcavePairs)\n" -" return;\n" -" concaveHasSeparatingNormals[i] = 0;\n" -" int pairIdx = i;\n" -" int bodyIndexA = concavePairs[i].x;\n" -" int bodyIndexB = concavePairs[i].y;\n" -" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" -" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" -" int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" -" int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" -" if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" -" collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" 
-" {\n" -" concavePairs[pairIdx].w = -1;\n" -" return;\n" -" }\n" -" int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" -" int numActualConcaveConvexTests = 0;\n" -" \n" -" int f = concavePairs[i].z;\n" -" \n" -" bool overlap = false;\n" -" \n" -" ConvexPolyhedronCL convexPolyhedronA;\n" -" //add 3 vertices of the triangle\n" -" convexPolyhedronA.m_numVertices = 3;\n" -" convexPolyhedronA.m_vertexOffset = 0;\n" -" float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" -" btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" -" float4 triMinAabb, triMaxAabb;\n" -" btAabbCL triAabb;\n" -" triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" -" triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" -" \n" -" float4 verticesA[3];\n" -" for (int i=0;i<3;i++)\n" -" {\n" -" int index = indices[face.m_indexOffset+i];\n" -" float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" -" verticesA[i] = vert;\n" -" localCenter += vert;\n" -" \n" -" triAabb.m_min = min(triAabb.m_min,vert); \n" -" triAabb.m_max = max(triAabb.m_max,vert); \n" -" }\n" -" overlap = true;\n" -" overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" -" overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n" -" overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? 
false : overlap;\n" -" \n" -" if (overlap)\n" -" {\n" -" float dmin = FLT_MAX;\n" -" int hasSeparatingAxis=5;\n" -" float4 sepAxis=make_float4(1,2,3,4);\n" -" int localCC=0;\n" -" numActualConcaveConvexTests++;\n" -" //a triangle has 3 unique edges\n" -" convexPolyhedronA.m_numUniqueEdges = 3;\n" -" convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" -" float4 uniqueEdgesA[3];\n" -" \n" -" uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" -" uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" -" uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" -" convexPolyhedronA.m_faceOffset = 0;\n" -" \n" -" float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" -" \n" -" btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" -" int indicesA[3+3+2+2+2];\n" -" int curUsedIndices=0;\n" -" int fidx=0;\n" -" //front size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[0] = 0;\n" -" indicesA[1] = 1;\n" -" indicesA[2] = 2;\n" -" curUsedIndices+=3;\n" -" float c = face.m_plane.w;\n" -" facesA[fidx].m_plane.x = normal.x;\n" -" facesA[fidx].m_plane.y = normal.y;\n" -" facesA[fidx].m_plane.z = normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" //back size of triangle\n" -" {\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[3]=2;\n" -" indicesA[4]=1;\n" -" indicesA[5]=0;\n" -" curUsedIndices+=3;\n" -" float c = dot(normal,verticesA[0]);\n" -" float c1 = -face.m_plane.w;\n" -" facesA[fidx].m_plane.x = -normal.x;\n" -" facesA[fidx].m_plane.y = -normal.y;\n" -" facesA[fidx].m_plane.z = -normal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" facesA[fidx].m_numIndices=3;\n" -" }\n" -" fidx++;\n" -" bool addEdgePlanes = true;\n" -" if (addEdgePlanes)\n" -" {\n" -" int numVertices=3;\n" -" int prevVertex = numVertices-1;\n" -" for (int i=0;i<numVertices;i++)\n" -" {\n" -" float4 v0 = verticesA[i];\n" -" float4 v1 = verticesA[prevVertex];\n" -" \n" -" float4 edgeNormal = 
normalize(cross(normal,v1-v0));\n" -" float c = -dot(edgeNormal,v0);\n" -" facesA[fidx].m_numIndices = 2;\n" -" facesA[fidx].m_indexOffset=curUsedIndices;\n" -" indicesA[curUsedIndices++]=i;\n" -" indicesA[curUsedIndices++]=prevVertex;\n" -" \n" -" facesA[fidx].m_plane.x = edgeNormal.x;\n" -" facesA[fidx].m_plane.y = edgeNormal.y;\n" -" facesA[fidx].m_plane.z = edgeNormal.z;\n" -" facesA[fidx].m_plane.w = c;\n" -" fidx++;\n" -" prevVertex = i;\n" -" }\n" -" }\n" -" convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" -" convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" -" float4 posA = rigidBodies[bodyIndexA].m_pos;\n" -" posA.w = 0.f;\n" -" float4 posB = rigidBodies[bodyIndexB].m_pos;\n" -" posB.w = 0.f;\n" -" float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" -" float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" -" \n" -" ///////////////////\n" -" ///compound shape support\n" -" if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" -" {\n" -" int compoundChild = concavePairs[pairIdx].w;\n" -" int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" -" int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" -" float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" -" float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" -" float4 newPosB = transform(&childPosB,&posB,&ornB);\n" -" float4 newOrnB = qtMul(ornB,childOrnB);\n" -" posB = newPosB;\n" -" ornB = newOrnB;\n" -" shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" -" }\n" -" //////////////////\n" -" float4 c0local = convexPolyhedronA.m_localCenter;\n" -" float4 c0 = transform(&c0local, &posA, &ornA);\n" -" float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" -" float4 c1 = transform(&c1local,&posB,&ornB);\n" -" const float4 DeltaC2 = c0 - c1;\n" -" bool sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" 
posB,ornB,\n" -" DeltaC2,\n" -" verticesA,uniqueEdgesA,facesA,indicesA,\n" -" vertices,uniqueEdges,faces,indices,\n" -" &sepAxis,&dmin);\n" -" hasSeparatingAxis = 4;\n" -" if (!sepA)\n" -" {\n" -" hasSeparatingAxis = 0;\n" -" } else\n" -" {\n" -" bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA,\n" -" posB,ornB,\n" -" posA,ornA,\n" -" DeltaC2,\n" -" vertices,uniqueEdges,faces,indices,\n" -" verticesA,uniqueEdgesA,facesA,indicesA,\n" -" &sepAxis,&dmin);\n" -" if (!sepB)\n" -" {\n" -" hasSeparatingAxis = 0;\n" -" } else\n" -" {\n" -" bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" posB,ornB,\n" -" DeltaC2,\n" -" verticesA,uniqueEdgesA,facesA,indicesA,\n" -" vertices,uniqueEdges,faces,indices,\n" -" &sepAxis,&dmin);\n" -" \n" -" if (!sepEE)\n" -" {\n" -" hasSeparatingAxis = 0;\n" -" } else\n" -" {\n" -" hasSeparatingAxis = 1;\n" -" }\n" -" }\n" -" } \n" -" \n" -" if (hasSeparatingAxis)\n" -" {\n" -" sepAxis.w = dmin;\n" -" concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" -" concaveHasSeparatingNormals[i]=1;\n" -" float minDist = -1e30f;\n" -" float maxDist = 0.02f;\n" -" \n" -" findClippingFaces(sepAxis,\n" -" &convexPolyhedronA,\n" -" &convexShapes[shapeIndexB],\n" -" posA,ornA,\n" -" posB,ornB,\n" -" worldVertsA1GPU,\n" -" worldNormalsAGPU,\n" -" worldVertsB1GPU,\n" -" vertexFaceCapacity,\n" -" minDist, maxDist,\n" -" verticesA,\n" -" facesA,\n" -" indicesA,\n" -" vertices,\n" -" faces,\n" -" indices,\n" -" clippingFacesOut, pairIdx);\n" -" } else\n" -" { \n" -" //mark this pair as in-active\n" -" concavePairs[pairIdx].w = -1;\n" -" }\n" -" }\n" -" else\n" -" { \n" -" //mark this pair as in-active\n" -" concavePairs[pairIdx].w = -1;\n" -" }\n" -" \n" -" concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts\n" -"}\n" -; +static const char* satKernelsCL = + "//keep this enum in sync with the CPU version (in btCollidable.h)\n" + "//written by 
Erwin Coumans\n" + "#define SHAPE_CONVEX_HULL 3\n" + "#define SHAPE_CONCAVE_TRIMESH 5\n" + "#define TRIANGLE_NUM_CONVEX_FACES 5\n" + "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" + "#define B3_MAX_STACK_DEPTH 256\n" + "typedef unsigned int u32;\n" + "///keep this in sync with btCollidable.h\n" + "typedef struct\n" + "{\n" + " union {\n" + " int m_numChildShapes;\n" + " int m_bvhIndex;\n" + " };\n" + " union\n" + " {\n" + " float m_radius;\n" + " int m_compoundBvhIndex;\n" + " };\n" + " \n" + " int m_shapeType;\n" + " int m_shapeIndex;\n" + " \n" + "} btCollidableGpu;\n" + "#define MAX_NUM_PARTS_IN_BITS 10\n" + "///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.\n" + "///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n" + "typedef struct\n" + "{\n" + " //12 bytes\n" + " unsigned short int m_quantizedAabbMin[3];\n" + " unsigned short int m_quantizedAabbMax[3];\n" + " //4 bytes\n" + " int m_escapeIndexOrTriangleIndex;\n" + "} b3QuantizedBvhNode;\n" + "typedef struct\n" + "{\n" + " float4 m_aabbMin;\n" + " float4 m_aabbMax;\n" + " float4 m_quantization;\n" + " int m_numNodes;\n" + " int m_numSubTrees;\n" + " int m_nodeOffset;\n" + " int m_subTreeOffset;\n" + "} b3BvhInfo;\n" + "int getTriangleIndex(const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " unsigned int x=0;\n" + " unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" + " // Get only the lower bits where the triangle index is stored\n" + " return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" + "}\n" + "int getTriangleIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " unsigned int x=0;\n" + " unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n" + " // Get only the lower bits where the triangle index is stored\n" + " return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n" + "}\n" + "int isLeafNode(const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " //skipindex is negative (internal node), triangleindex >=0 
(leafnode)\n" + " return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" + "}\n" + "int isLeafNodeGlobal(__global const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n" + " return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n" + "}\n" + " \n" + "int getEscapeIndex(const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " return -rootNode->m_escapeIndexOrTriangleIndex;\n" + "}\n" + "int getEscapeIndexGlobal(__global const b3QuantizedBvhNode* rootNode)\n" + "{\n" + " return -rootNode->m_escapeIndexOrTriangleIndex;\n" + "}\n" + "typedef struct\n" + "{\n" + " //12 bytes\n" + " unsigned short int m_quantizedAabbMin[3];\n" + " unsigned short int m_quantizedAabbMax[3];\n" + " //4 bytes, points to the root of the subtree\n" + " int m_rootNodeIndex;\n" + " //4 bytes\n" + " int m_subtreeSize;\n" + " int m_padding[3];\n" + "} b3BvhSubtreeInfo;\n" + "typedef struct\n" + "{\n" + " float4 m_childPosition;\n" + " float4 m_childOrientation;\n" + " int m_shapeIndex;\n" + " int m_unused0;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "} btGpuChildShape;\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " float4 m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} BodyData;\n" + "typedef struct \n" + "{\n" + " float4 m_localCenter;\n" + " float4 m_extents;\n" + " float4 mC;\n" + " float4 mE;\n" + " \n" + " float m_radius;\n" + " int m_faceOffset;\n" + " int m_numFaces;\n" + " int m_numVertices;\n" + " int m_vertexOffset;\n" + " int m_uniqueEdgesOffset;\n" + " int m_numUniqueEdges;\n" + " int m_unused;\n" + "} ConvexPolyhedronCL;\n" + "typedef struct \n" + "{\n" + " union\n" + " {\n" + " float4 m_min;\n" + " float m_minElems[4];\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float4 m_max;\n" + " float m_maxElems[4];\n" + " int m_maxIndices[4];\n" + " 
};\n" + "} btAabbCL;\n" + "#ifndef B3_AABB_H\n" + "#define B3_AABB_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " 
b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_MAT3x3_H\n" + "#define B3_MAT3x3_H\n" + "#ifndef B3_QUAT_H\n" + "#define B3_QUAT_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Quat;\n" + " #define b3QuatConstArg const b3Quat\n" + " \n" + " \n" + "inline float4 b3FastNormalize4(float4 v)\n" + "{\n" + " v = (float4)(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + " \n" + "inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" + "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" + "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" + "{\n" + " b3Quat ans;\n" + " ans = b3Cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - b3Dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" + "{\n" + " b3Quat q;\n" + " q=in;\n" + " //return b3FastNormalize4(in);\n" + " float len = native_sqrt(dot(q, q));\n" + " if(len > 0.f)\n" + " {\n" + " q *= 1.f / len;\n" + " }\n" + " else\n" + " {\n" + " q.x = q.y = q.z = 0.f;\n" + " q.w = 1.f;\n" + " }\n" + " return q;\n" + "}\n" + "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " b3Quat qInv = b3QuatInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline 
b3Quat b3QuatInvert(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " return b3QuatRotate( b3QuatInvert( q ), vec );\n" + "}\n" + "inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" + "{\n" + " return b3QuatRotate( orientation, point ) + (translation);\n" + "}\n" + " \n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "typedef struct\n" + "{\n" + " b3Float4 m_row[3];\n" + "}b3Mat3x3;\n" + "#define b3Mat3x3ConstArg const b3Mat3x3\n" + "#define b3GetRow(m,row) (m.m_row[row])\n" + "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" + "{\n" + " b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" + " b3Mat3x3 out;\n" + " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" + " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" + " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" + " out.m_row[0].w = 0.f;\n" + " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" + " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" + " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" + " out.m_row[1].w = 0.f;\n" + " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" + " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" + " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" + " out.m_row[2].w = 0.f;\n" + " return out;\n" + "}\n" + "inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = fabs(matIn.m_row[0]);\n" + " out.m_row[1] = fabs(matIn.m_row[1]);\n" + " out.m_row[2] = fabs(matIn.m_row[2]);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtZero();\n" + "__inline\n" + "b3Mat3x3 mtIdentity();\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, 
b3Mat3x3 b);\n" + "__inline\n" + "b3Mat3x3 mtZero()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(0.f);\n" + " m.m_row[1] = (b3Float4)(0.f);\n" + " m.m_row[2] = (b3Float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtIdentity()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(1,0,0,0);\n" + " m.m_row[1] = (b3Float4)(0,1,0,0);\n" + " m.m_row[2] = (b3Float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" + "{\n" + " b3Mat3x3 transB;\n" + " transB = mtTranspose( b );\n" + " b3Mat3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" + "{\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a.m_row[0], b );\n" + " ans.y = b3Dot3F4( a.m_row[1], b );\n" + " ans.z = b3Dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" + "{\n" + " b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " b3Float4 ans;\n" + " ans.x 
= b3Dot3F4( a, colx );\n" + " ans.y = b3Dot3F4( a, coly );\n" + " ans.z = b3Dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3Aabb b3Aabb_t;\n" + "struct b3Aabb\n" + "{\n" + " union\n" + " {\n" + " float m_min[4];\n" + " b3Float4 m_minVec;\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float m_max[4];\n" + " b3Float4 m_maxVec;\n" + " int m_signedMaxIndices[4];\n" + " };\n" + "};\n" + "inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" + " b3Float4ConstArg pos,\n" + " b3QuatConstArg orn,\n" + " b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" + "{\n" + " b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" + " localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" + " b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" + " b3Mat3x3 m;\n" + " m = b3QuatGetRotationMatrix(orn);\n" + " b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" + " b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" + " \n" + " b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" + " b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" + " b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" + " 0.f);\n" + " *aabbMinOut = center-extent;\n" + " *aabbMaxOut = center+extent;\n" + "}\n" + "/// conservative test for overlap between two aabbs\n" + "inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" + " b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" + " overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" + " overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? 
false : overlap;\n" + " return overlap;\n" + "}\n" + "#endif //B3_AABB_H\n" + "/*\n" + "Bullet Continuous Collision Detection and Physics Library\n" + "Copyright (c) 2003-2013 Erwin Coumans http://bulletphysics.org\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose,\n" + "including commercial applications, and to alter it and redistribute it freely,\n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "#ifndef B3_INT2_H\n" + "#define B3_INT2_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#define b3UnsignedInt2 uint2\n" + "#define b3Int2 int2\n" + "#define b3MakeInt2 (int2)\n" + "#endif //__cplusplus\n" + "#endif\n" + "typedef struct\n" + "{\n" + " float4 m_plane;\n" + " int m_indexOffset;\n" + " int m_numIndices;\n" + "} btGpuFace;\n" + "#define make_float4 (float4)\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + " \n" + "// float4 a1 = make_float4(a.xyz,0.f);\n" + "// float4 b1 = make_float4(b.xyz,0.f);\n" + "// return cross(a1,b1);\n" + "//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n" + " \n" + " // float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n" + " \n" + " //return c;\n" + "}\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 
b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " v = make_float4(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b);\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec);\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fastNormalize4(in);\n" + "// in /= length( in );\n" + "// return in;\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + "float4 qtInvRotate(const Quaternion q, float4 vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "__inline\n" + "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n" + "{\n" + " return qtRotate( *orientation, *p ) + (*translation);\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "}\n" + "inline void projectLocal(const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" + "const float4* dir, const float4* vertices, 
float* min, float* max)\n" + "{\n" + " min[0] = FLT_MAX;\n" + " max[0] = -FLT_MAX;\n" + " int numVerts = hull->m_numVertices;\n" + " const float4 localDir = qtInvRotate(orn,*dir);\n" + " float offset = dot(pos,*dir);\n" + " for(int i=0;i<numVerts;i++)\n" + " {\n" + " float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" + " if(dp < min[0]) \n" + " min[0] = dp;\n" + " if(dp > max[0]) \n" + " max[0] = dp;\n" + " }\n" + " if(min[0]>max[0])\n" + " {\n" + " float tmp = min[0];\n" + " min[0] = max[0];\n" + " max[0] = tmp;\n" + " }\n" + " min[0] += offset;\n" + " max[0] += offset;\n" + "}\n" + "inline void project(__global const ConvexPolyhedronCL* hull, const float4 pos, const float4 orn, \n" + "const float4* dir, __global const float4* vertices, float* min, float* max)\n" + "{\n" + " min[0] = FLT_MAX;\n" + " max[0] = -FLT_MAX;\n" + " int numVerts = hull->m_numVertices;\n" + " const float4 localDir = qtInvRotate(orn,*dir);\n" + " float offset = dot(pos,*dir);\n" + " for(int i=0;i<numVerts;i++)\n" + " {\n" + " float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n" + " if(dp < min[0]) \n" + " min[0] = dp;\n" + " if(dp > max[0]) \n" + " max[0] = dp;\n" + " }\n" + " if(min[0]>max[0])\n" + " {\n" + " float tmp = min[0];\n" + " min[0] = max[0];\n" + " max[0] = tmp;\n" + " }\n" + " min[0] += offset;\n" + " max[0] += offset;\n" + "}\n" + "inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA,const float4 ornA,\n" + " const float4 posB,const float4 ornB,\n" + " float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n" + "{\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n" + " project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n" + " if(Max0<Min1 || Max1<Min0)\n" + " return false;\n" + " float d0 = Max0 - Min1;\n" + " float d1 = Max1 - Min0;\n" + " *depth = d0<d1 ? 
d0:d1;\n" + " return true;\n" + "}\n" + "inline bool IsAlmostZero(const float4 v)\n" + "{\n" + " if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n" + " return false;\n" + " return true;\n" + "}\n" + "bool findSeparatingAxisLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " \n" + " const float4* verticesA, \n" + " const float4* uniqueEdgesA, \n" + " const btGpuFace* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB, \n" + " __global const float4* uniqueEdgesB, \n" + " __global const btGpuFace* facesB,\n" + " __global const int* indicesB,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " \n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " {\n" + " int numFacesA = hullA->m_numFaces;\n" + " // Test normals from hullA\n" + " for(int i=0;i<numFacesA;i++)\n" + " {\n" + " const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" + " float4 faceANormalWS = qtRotate(ornA,normal);\n" + " if (dot3F4(DeltaC2,faceANormalWS)<0)\n" + " faceANormalWS*=-1.f;\n" + " curPlaneTests++;\n" + " float d;\n" + " if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n" + " return false;\n" + " if(d<*dmin)\n" + " {\n" + " *dmin = d;\n" + " *sep = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "bool findSeparatingAxisLocalB( __global const ConvexPolyhedronCL* hullA, const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " __global const float4* verticesA, \n" + " __global const float4* uniqueEdgesA, \n" + " __global const 
btGpuFace* facesA,\n" + " __global const int* indicesA,\n" + " const float4* verticesB,\n" + " const float4* uniqueEdgesB, \n" + " const btGpuFace* facesB,\n" + " const int* indicesB,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " {\n" + " int numFacesA = hullA->m_numFaces;\n" + " // Test normals from hullA\n" + " for(int i=0;i<numFacesA;i++)\n" + " {\n" + " const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;\n" + " float4 faceANormalWS = qtRotate(ornA,normal);\n" + " if (dot3F4(DeltaC2,faceANormalWS)<0)\n" + " faceANormalWS *= -1.f;\n" + " curPlaneTests++;\n" + " float d;\n" + " if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n" + " return false;\n" + " if(d<*dmin)\n" + " {\n" + " *dmin = d;\n" + " *sep = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "bool findSeparatingAxisEdgeEdgeLocalA( const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " const float4* verticesA, \n" + " const float4* uniqueEdgesA, \n" + " const btGpuFace* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB, \n" + " __global const float4* uniqueEdgesB, \n" + " __global const btGpuFace* facesB,\n" + " __global const int* indicesB,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " int curEdgeEdge = 0;\n" + " // Test edges\n" + " for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" + " {\n" + " const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n" + " float4 edge0World = 
qtRotate(ornA,edge0);\n" + " for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" + " {\n" + " const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n" + " float4 edge1World = qtRotate(ornB,edge1);\n" + " float4 crossje = cross3(edge0World,edge1World);\n" + " curEdgeEdge++;\n" + " if(!IsAlmostZero(crossje))\n" + " {\n" + " crossje = normalize3(crossje);\n" + " if (dot3F4(DeltaC2,crossje)<0)\n" + " crossje *= -1.f;\n" + " float dist;\n" + " bool result = true;\n" + " {\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);\n" + " project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);\n" + " \n" + " if(Max0<Min1 || Max1<Min0)\n" + " result = false;\n" + " \n" + " float d0 = Max0 - Min1;\n" + " float d1 = Max1 - Min0;\n" + " dist = d0<d1 ? d0:d1;\n" + " result = true;\n" + " }\n" + " \n" + " if(dist<*dmin)\n" + " {\n" + " *dmin = dist;\n" + " *sep = crossje;\n" + " }\n" + " }\n" + " }\n" + " }\n" + " \n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA,const float4 ornA,\n" + " const float4 posB,const float4 ornB,\n" + " float4* sep_axis, __global const float4* vertices,float* depth)\n" + "{\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " project(hullA,posA,ornA,sep_axis,vertices, &Min0, &Max0);\n" + " project(hullB,posB,ornB, sep_axis,vertices, &Min1, &Max1);\n" + " if(Max0<Min1 || Max1<Min0)\n" + " return false;\n" + " float d0 = Max0 - Min1;\n" + " float d1 = Max1 - Min0;\n" + " *depth = d0<d1 ? 
d0:d1;\n" + " return true;\n" + "}\n" + "bool findSeparatingAxis( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " __global const float4* vertices, \n" + " __global const float4* uniqueEdges, \n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " \n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " \n" + " int curPlaneTests=0;\n" + " {\n" + " int numFacesA = hullA->m_numFaces;\n" + " // Test normals from hullA\n" + " for(int i=0;i<numFacesA;i++)\n" + " {\n" + " const float4 normal = faces[hullA->m_faceOffset+i].m_plane;\n" + " float4 faceANormalWS = qtRotate(ornA,normal);\n" + " \n" + " if (dot3F4(DeltaC2,faceANormalWS)<0)\n" + " faceANormalWS*=-1.f;\n" + " \n" + " curPlaneTests++;\n" + " \n" + " float d;\n" + " if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))\n" + " return false;\n" + " \n" + " if(d<*dmin)\n" + " {\n" + " *dmin = d;\n" + " *sep = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " \n" + " return true;\n" + "}\n" + "bool findSeparatingAxisUnitSphere( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " __global const float4* vertices,\n" + " __global const float4* unitSphereDirections,\n" + " int numUnitSphereDirections,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " \n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " int curEdgeEdge = 0;\n" + " // Test unit sphere directions\n" 
+ " for (int i=0;i<numUnitSphereDirections;i++)\n" + " {\n" + " float4 crossje;\n" + " crossje = unitSphereDirections[i]; \n" + " if (dot3F4(DeltaC2,crossje)>0)\n" + " crossje *= -1.f;\n" + " {\n" + " float dist;\n" + " bool result = true;\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" + " project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" + " \n" + " if(Max0<Min1 || Max1<Min0)\n" + " return false;\n" + " \n" + " float d0 = Max0 - Min1;\n" + " float d1 = Max1 - Min0;\n" + " dist = d0<d1 ? d0:d1;\n" + " result = true;\n" + " \n" + " if(dist<*dmin)\n" + " {\n" + " *dmin = dist;\n" + " *sep = crossje;\n" + " }\n" + " }\n" + " }\n" + " \n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "bool findSeparatingAxisEdgeEdge( __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n" + " const float4 posA1,\n" + " const float4 ornA,\n" + " const float4 posB1,\n" + " const float4 ornB,\n" + " const float4 DeltaC2,\n" + " __global const float4* vertices, \n" + " __global const float4* uniqueEdges, \n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " float4* sep,\n" + " float* dmin)\n" + "{\n" + " \n" + " float4 posA = posA1;\n" + " posA.w = 0.f;\n" + " float4 posB = posB1;\n" + " posB.w = 0.f;\n" + " int curPlaneTests=0;\n" + " int curEdgeEdge = 0;\n" + " // Test edges\n" + " for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n" + " {\n" + " const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];\n" + " float4 edge0World = qtRotate(ornA,edge0);\n" + " for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n" + " {\n" + " const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];\n" + " float4 edge1World = qtRotate(ornB,edge1);\n" + " float4 crossje = cross3(edge0World,edge1World);\n" + " curEdgeEdge++;\n" + " if(!IsAlmostZero(crossje))\n" + " {\n" + " crossje = 
normalize3(crossje);\n" + " if (dot3F4(DeltaC2,crossje)<0)\n" + " crossje*=-1.f;\n" + " \n" + " float dist;\n" + " bool result = true;\n" + " {\n" + " float Min0,Max0;\n" + " float Min1,Max1;\n" + " project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n" + " project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n" + " \n" + " if(Max0<Min1 || Max1<Min0)\n" + " return false;\n" + " \n" + " float d0 = Max0 - Min1;\n" + " float d1 = Max1 - Min0;\n" + " dist = d0<d1 ? d0:d1;\n" + " result = true;\n" + " }\n" + " \n" + " if(dist<*dmin)\n" + " {\n" + " *dmin = dist;\n" + " *sep = crossje;\n" + " }\n" + " }\n" + " }\n" + " }\n" + " \n" + " if((dot3F4(-DeltaC2,*sep))>0.0f)\n" + " {\n" + " *sep = -(*sep);\n" + " }\n" + " return true;\n" + "}\n" + "// work-in-progress\n" + "__kernel void processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global btAabbCL* aabbs,\n" + " __global const btGpuChildShape* gpuChildShapes,\n" + " __global volatile float4* gpuCompoundSepNormalsOut,\n" + " __global volatile int* gpuHasCompoundSepNormalsOut,\n" + " int numCompoundPairs\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i<numCompoundPairs)\n" + " {\n" + " int bodyIndexA = gpuCompoundPairs[i].x;\n" + " int bodyIndexB = gpuCompoundPairs[i].y;\n" + " int childShapeIndexA = gpuCompoundPairs[i].z;\n" + " int childShapeIndexB = gpuCompoundPairs[i].w;\n" + " \n" + " int collidableIndexA = -1;\n" + " int collidableIndexB = -1;\n" + " \n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " \n" + " float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" + " float4 posB = 
rigidBodies[bodyIndexB].m_pos;\n" + " \n" + " if (childShapeIndexA >= 0)\n" + " {\n" + " collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" + " float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" + " float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" + " float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" + " float4 newOrnA = qtMul(ornA,childOrnA);\n" + " posA = newPosA;\n" + " ornA = newOrnA;\n" + " } else\n" + " {\n" + " collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " }\n" + " \n" + " if (childShapeIndexB>=0)\n" + " {\n" + " collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " posB = newPosB;\n" + " ornB = newOrnB;\n" + " } else\n" + " {\n" + " collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx; \n" + " }\n" + " \n" + " gpuHasCompoundSepNormalsOut[i] = 0;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n" + " int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n" + " \n" + " if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))\n" + " {\n" + " return;\n" + " }\n" + " int hasSeparatingAxis = 5;\n" + " \n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " float dmin = FLT_MAX;\n" + " posA.w = 0.f;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - 
c1;\n" + " float4 sepNormal = make_float4(1,0,0,0);\n" + " bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" + " hasSeparatingAxis = 4;\n" + " if (!sepA)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else\n" + " {\n" + " bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" + " if (!sepB)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else//(!sepB)\n" + " {\n" + " bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n" + " if (sepEE)\n" + " {\n" + " gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);\n" + " gpuHasCompoundSepNormalsOut[i] = 1;\n" + " }//sepEE\n" + " }//(!sepB)\n" + " }//(!sepA)\n" + " \n" + " \n" + " }\n" + " \n" + "}\n" + "inline b3Float4 MyUnQuantize(const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" + "{\n" + " b3Float4 vecOut;\n" + " vecOut = b3MakeFloat4(\n" + " (float)(vecIn[0]) / (quantization.x),\n" + " (float)(vecIn[1]) / (quantization.y),\n" + " (float)(vecIn[2]) / (quantization.z),\n" + " 0.f);\n" + " vecOut += bvhAabbMin;\n" + " return vecOut;\n" + "}\n" + "inline b3Float4 MyUnQuantizeGlobal(__global const unsigned short* vecIn, b3Float4 quantization, b3Float4 bvhAabbMin)\n" + "{\n" + " b3Float4 vecOut;\n" + " vecOut = b3MakeFloat4(\n" + " (float)(vecIn[0]) / (quantization.x),\n" + " (float)(vecIn[1]) / (quantization.y),\n" + " (float)(vecIn[2]) / (quantization.z),\n" + " 0.f);\n" + " vecOut += bvhAabbMin;\n" + " return vecOut;\n" + "}\n" + "// work-in-progress\n" + "__kernel void findCompoundPairsKernel( __global const int4* pairs, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " 
__global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global b3Aabb_t* aabbLocalSpace,\n" + " __global const btGpuChildShape* gpuChildShapes,\n" + " __global volatile int4* gpuCompoundPairsOut,\n" + " __global volatile int* numCompoundPairsOut,\n" + " __global const b3BvhSubtreeInfo* subtrees,\n" + " __global const b3QuantizedBvhNode* quantizedNodes,\n" + " __global const b3BvhInfo* bvhInfos,\n" + " int numPairs,\n" + " int maxNumCompoundPairsCapacity\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i<numPairs)\n" + " {\n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " //once the broadphase avoids static-static pairs, we can remove this test\n" + " if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" + " {\n" + " return;\n" + " }\n" + " if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) &&(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" + " {\n" + " int bvhA = collidables[collidableIndexA].m_compoundBvhIndex;\n" + " int bvhB = collidables[collidableIndexB].m_compoundBvhIndex;\n" + " int numSubTreesA = bvhInfos[bvhA].m_numSubTrees;\n" + " int subTreesOffsetA = bvhInfos[bvhA].m_subTreeOffset;\n" + " int subTreesOffsetB = bvhInfos[bvhB].m_subTreeOffset;\n" + " int numSubTreesB = bvhInfos[bvhB].m_numSubTrees;\n" + " \n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " b3Quat ornA = rigidBodies[bodyIndexA].m_quat;\n" + " b3Quat ornB = rigidBodies[bodyIndexB].m_quat;\n" + " float4 posB = 
rigidBodies[bodyIndexB].m_pos;\n" + " \n" + " for (int p=0;p<numSubTreesA;p++)\n" + " {\n" + " b3BvhSubtreeInfo subtreeA = subtrees[subTreesOffsetA+p];\n" + " //bvhInfos[bvhA].m_quantization\n" + " b3Float4 treeAminLocal = MyUnQuantize(subtreeA.m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" + " b3Float4 treeAmaxLocal = MyUnQuantize(subtreeA.m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" + " b3Float4 aabbAMinOut,aabbAMaxOut;\n" + " float margin=0.f;\n" + " b3TransformAabb2(treeAminLocal,treeAmaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n" + " \n" + " for (int q=0;q<numSubTreesB;q++)\n" + " {\n" + " b3BvhSubtreeInfo subtreeB = subtrees[subTreesOffsetB+q];\n" + " b3Float4 treeBminLocal = MyUnQuantize(subtreeB.m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" + " b3Float4 treeBmaxLocal = MyUnQuantize(subtreeB.m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" + " b3Float4 aabbBMinOut,aabbBMaxOut;\n" + " float margin=0.f;\n" + " b3TransformAabb2(treeBminLocal,treeBmaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n" + " \n" + " \n" + " bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n" + " if (aabbOverlap)\n" + " {\n" + " \n" + " int startNodeIndexA = subtreeA.m_rootNodeIndex+bvhInfos[bvhA].m_nodeOffset;\n" + " int endNodeIndexA = startNodeIndexA+subtreeA.m_subtreeSize;\n" + " int startNodeIndexB = subtreeB.m_rootNodeIndex+bvhInfos[bvhB].m_nodeOffset;\n" + " int endNodeIndexB = startNodeIndexB+subtreeB.m_subtreeSize;\n" + " b3Int2 nodeStack[B3_MAX_STACK_DEPTH];\n" + " b3Int2 node0;\n" + " node0.x = startNodeIndexA;\n" + " node0.y = startNodeIndexB;\n" + " int maxStackDepth = B3_MAX_STACK_DEPTH;\n" + " int depth=0;\n" + " nodeStack[depth++]=node0;\n" + " do\n" + " {\n" + " b3Int2 node = nodeStack[--depth];\n" + " b3Float4 aMinLocal = 
MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMin,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" + " b3Float4 aMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.x].m_quantizedAabbMax,bvhInfos[bvhA].m_quantization,bvhInfos[bvhA].m_aabbMin);\n" + " b3Float4 bMinLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMin,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" + " b3Float4 bMaxLocal = MyUnQuantizeGlobal(quantizedNodes[node.y].m_quantizedAabbMax,bvhInfos[bvhB].m_quantization,bvhInfos[bvhB].m_aabbMin);\n" + " float margin=0.f;\n" + " b3Float4 aabbAMinOut,aabbAMaxOut;\n" + " b3TransformAabb2(aMinLocal,aMaxLocal, margin,posA,ornA,&aabbAMinOut,&aabbAMaxOut);\n" + " b3Float4 aabbBMinOut,aabbBMaxOut;\n" + " b3TransformAabb2(bMinLocal,bMaxLocal, margin,posB,ornB,&aabbBMinOut,&aabbBMaxOut);\n" + " \n" + " bool nodeOverlap = b3TestAabbAgainstAabb(aabbAMinOut,aabbAMaxOut,aabbBMinOut,aabbBMaxOut);\n" + " if (nodeOverlap)\n" + " {\n" + " bool isLeafA = isLeafNodeGlobal(&quantizedNodes[node.x]);\n" + " bool isLeafB = isLeafNodeGlobal(&quantizedNodes[node.y]);\n" + " bool isInternalA = !isLeafA;\n" + " bool isInternalB = !isLeafB;\n" + " //fail, even though it might hit two leaf nodes\n" + " if (depth+4>maxStackDepth && !(isLeafA && isLeafB))\n" + " {\n" + " //printf(\"Error: traversal exceeded maxStackDepth\");\n" + " continue;\n" + " }\n" + " if(isInternalA)\n" + " {\n" + " int nodeAleftChild = node.x+1;\n" + " bool isNodeALeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.x+1]);\n" + " int nodeArightChild = isNodeALeftChildLeaf? node.x+2 : node.x+1 + getEscapeIndexGlobal(&quantizedNodes[node.x+1]);\n" + " if(isInternalB)\n" + " { \n" + " int nodeBleftChild = node.y+1;\n" + " bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n" + " int nodeBrightChild = isNodeBLeftChildLeaf? 
node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n" + " nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBleftChild);\n" + " nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBleftChild);\n" + " nodeStack[depth++] = b3MakeInt2(nodeAleftChild, nodeBrightChild);\n" + " nodeStack[depth++] = b3MakeInt2(nodeArightChild, nodeBrightChild);\n" + " }\n" + " else\n" + " {\n" + " nodeStack[depth++] = b3MakeInt2(nodeAleftChild,node.y);\n" + " nodeStack[depth++] = b3MakeInt2(nodeArightChild,node.y);\n" + " }\n" + " }\n" + " else\n" + " {\n" + " if(isInternalB)\n" + " {\n" + " int nodeBleftChild = node.y+1;\n" + " bool isNodeBLeftChildLeaf = isLeafNodeGlobal(&quantizedNodes[node.y+1]);\n" + " int nodeBrightChild = isNodeBLeftChildLeaf? node.y+2 : node.y+1 + getEscapeIndexGlobal(&quantizedNodes[node.y+1]);\n" + " nodeStack[depth++] = b3MakeInt2(node.x,nodeBleftChild);\n" + " nodeStack[depth++] = b3MakeInt2(node.x,nodeBrightChild);\n" + " }\n" + " else\n" + " {\n" + " int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" + " if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" + " {\n" + " int childShapeIndexA = getTriangleIndexGlobal(&quantizedNodes[node.x]);\n" + " int childShapeIndexB = getTriangleIndexGlobal(&quantizedNodes[node.y]);\n" + " gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n" + " }\n" + " }\n" + " }\n" + " }\n" + " } while (depth);\n" + " }\n" + " }\n" + " }\n" + " \n" + " return;\n" + " }\n" + " if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" + " {\n" + " if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) \n" + " {\n" + " int numChildrenA = collidables[collidableIndexA].m_numChildShapes;\n" + " for (int c=0;c<numChildrenA;c++)\n" + " {\n" + " int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;\n" + " int childColIndexA 
= gpuChildShapes[childShapeIndexA].m_shapeIndex;\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n" + " float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n" + " float4 newPosA = qtRotate(ornA,childPosA)+posA;\n" + " float4 newOrnA = qtMul(ornA,childOrnA);\n" + " int shapeIndexA = collidables[childColIndexA].m_shapeIndex;\n" + " b3Aabb_t aabbAlocal = aabbLocalSpace[shapeIndexA];\n" + " float margin = 0.f;\n" + " \n" + " b3Float4 aabbAMinWS;\n" + " b3Float4 aabbAMaxWS;\n" + " \n" + " b3TransformAabb2(aabbAlocal.m_minVec,aabbAlocal.m_maxVec,margin,\n" + " newPosA,\n" + " newOrnA,\n" + " &aabbAMinWS,&aabbAMaxWS);\n" + " \n" + " \n" + " if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" + " for (int b=0;b<numChildrenB;b++)\n" + " {\n" + " int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" + " int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" + " b3Aabb_t aabbBlocal = aabbLocalSpace[shapeIndexB];\n" + " \n" + " b3Float4 aabbBMinWS;\n" + " b3Float4 aabbBMaxWS;\n" + " \n" + " b3TransformAabb2(aabbBlocal.m_minVec,aabbBlocal.m_maxVec,margin,\n" + " newPosB,\n" + " newOrnB,\n" + " &aabbBMinWS,&aabbBMaxWS);\n" + " \n" + " \n" + " \n" + " bool aabbOverlap = b3TestAabbAgainstAabb(aabbAMinWS,aabbAMaxWS,aabbBMinWS,aabbBMaxWS);\n" + " if 
(aabbOverlap)\n" + " {\n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " float dmin = FLT_MAX;\n" + " float4 posA = newPosA;\n" + " posA.w = 0.f;\n" + " float4 posB = newPosB;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 ornA = newOrnA;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 ornB =newOrnB;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " {//\n" + " int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" + " if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" + " {\n" + " gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n" + " }\n" + " }//\n" + " }//fi(1)\n" + " } //for (int b=0\n" + " }//if (collidables[collidableIndexB].\n" + " else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " if (1)\n" + " {\n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " float dmin = FLT_MAX;\n" + " float4 posA = newPosA;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 ornA = newOrnA;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " {\n" + " int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" + " if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" + " {\n" + " gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);\n" + " }//if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" + " }//\n" + " }//fi (1)\n" + " }//if 
(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " }//for (int b=0;b<numChildrenB;b++) \n" + " return;\n" + " }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH) \n" + " && (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" + " {\n" + " int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n" + " for (int b=0;b<numChildrenB;b++)\n" + " {\n" + " int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n" + " int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 ornB = rigidBodies[bodyIndexB].m_quat;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = qtRotate(ornB,childPosB)+posB;\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" + " //////////////////////////////////////\n" + " if (1)\n" + " {\n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " float dmin = FLT_MAX;\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = newPosB;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 ornB =newOrnB;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " {//\n" + " int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n" + " if (compoundPairIdx<maxNumCompoundPairsCapacity)\n" + " {\n" + " gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);\n" + " }//fi 
(compoundPairIdx<maxNumCompoundPairsCapacity)\n" + " }//\n" + " }//fi (1) \n" + " }//for (int b=0;b<numChildrenB;b++)\n" + " return;\n" + " }//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " return;\n" + " }//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n" + " }//i<numPairs\n" + "}\n" + "// work-in-progress\n" + "__kernel void findSeparatingAxisKernel( __global const int4* pairs, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global btAabbCL* aabbs,\n" + " __global volatile float4* separatingNormals,\n" + " __global volatile int* hasSeparatingAxis,\n" + " int numPairs\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " \n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " \n" + " //once the broadphase avoids static-static pairs, we can remove this test\n" + " if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " return;\n" + " }\n" + " \n" + " if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " return;\n" + " }\n" + " \n" + " if 
((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " return;\n" + " }\n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " float dmin = FLT_MAX;\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " float4 sepNormal;\n" + " \n" + " bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,\n" + " indices,&sepNormal,&dmin);\n" + " hasSeparatingAxis[i] = 4;\n" + " if (!sepA)\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " } else\n" + " {\n" + " bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n" + " posA,ornA,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,\n" + " indices,&sepNormal,&dmin);\n" + " if (!sepB)\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " } else\n" + " {\n" + " bool sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,\n" + " indices,&sepNormal,&dmin);\n" + " if (!sepEE)\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " } else\n" + " {\n" + " hasSeparatingAxis[i] = 1;\n" + " separatingNormals[i] = sepNormal;\n" + " }\n" + " }\n" + " }\n" + " \n" + " }\n" + "}\n" + "__kernel void findSeparatingAxisVertexFaceKernel( __global const int4* pairs, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* 
collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global btAabbCL* aabbs,\n" + " __global volatile float4* separatingNormals,\n" + " __global volatile int* hasSeparatingAxis,\n" + " __global float* dmins,\n" + " int numPairs\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " \n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " hasSeparatingAxis[i] = 0; \n" + " \n" + " //once the broadphase avoids static-static pairs, we can remove this test\n" + " if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n" + " {\n" + " return;\n" + " }\n" + " \n" + " if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n" + " {\n" + " return;\n" + " }\n" + " \n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " float dmin = FLT_MAX;\n" + " dmins[i] = dmin;\n" + " \n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " float4 sepNormal;\n" + " \n" 
+ " bool sepA = findSeparatingAxis( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,\n" + " indices,&sepNormal,&dmin);\n" + " hasSeparatingAxis[i] = 4;\n" + " if (!sepA)\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " } else\n" + " {\n" + " bool sepB = findSeparatingAxis( &convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n" + " posA,ornA,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,\n" + " indices,&sepNormal,&dmin);\n" + " if (sepB)\n" + " {\n" + " dmins[i] = dmin;\n" + " hasSeparatingAxis[i] = 1;\n" + " separatingNormals[i] = sepNormal;\n" + " }\n" + " }\n" + " \n" + " }\n" + "}\n" + "__kernel void findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs, \n" + " __global const BodyData* rigidBodies, \n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global btAabbCL* aabbs,\n" + " __global float4* separatingNormals,\n" + " __global int* hasSeparatingAxis,\n" + " __global float* dmins,\n" + " __global const float4* unitSphereDirections,\n" + " int numUnitSphereDirections,\n" + " int numPairs\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " \n" + " if (i<numPairs)\n" + " {\n" + " if (hasSeparatingAxis[i])\n" + " {\n" + " \n" + " int bodyIndexA = pairs[i].x;\n" + " int bodyIndexB = pairs[i].y;\n" + " \n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " \n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " \n" + " \n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " \n" + " float dmin = dmins[i];\n" + " \n" + " float4 posA = 
rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " float4 sepNormal = separatingNormals[i];\n" + " \n" + " \n" + " \n" + " bool sepEE = false;\n" + " int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n" + " if (numEdgeEdgeDirections<=numUnitSphereDirections)\n" + " {\n" + " sepEE = findSeparatingAxisEdgeEdge( &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,\n" + " indices,&sepNormal,&dmin);\n" + " \n" + " if (!sepEE)\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " } else\n" + " {\n" + " hasSeparatingAxis[i] = 1;\n" + " separatingNormals[i] = sepNormal;\n" + " }\n" + " }\n" + " /*\n" + " ///else case is a separate kernel, to make Mac OSX OpenCL compiler happy\n" + " else\n" + " {\n" + " sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " vertices,unitSphereDirections,numUnitSphereDirections,\n" + " &sepNormal,&dmin);\n" + " if (!sepEE)\n" + " {\n" + " hasSeparatingAxis[i] = 0;\n" + " } else\n" + " {\n" + " hasSeparatingAxis[i] = 1;\n" + " separatingNormals[i] = sepNormal;\n" + " }\n" + " }\n" + " */\n" + " } //if (hasSeparatingAxis[i])\n" + " }//(i<numPairs)\n" + "}\n" + "inline int findClippingFaces(const float4 separatingNormal,\n" + " const ConvexPolyhedronCL* hullA, \n" + " __global const ConvexPolyhedronCL* hullB,\n" + " const float4 posA, const Quaternion ornA,const 
float4 posB, const Quaternion ornB,\n" + " __global float4* worldVertsA1,\n" + " __global float4* worldNormalsA1,\n" + " __global float4* worldVertsB1,\n" + " int capacityWorldVerts,\n" + " const float minDist, float maxDist,\n" + " const float4* verticesA,\n" + " const btGpuFace* facesA,\n" + " const int* indicesA,\n" + " __global const float4* verticesB,\n" + " __global const btGpuFace* facesB,\n" + " __global const int* indicesB,\n" + " __global int4* clippingFaces, int pairIndex)\n" + "{\n" + " int numContactsOut = 0;\n" + " int numWorldVertsB1= 0;\n" + " \n" + " \n" + " int closestFaceB=0;\n" + " float dmax = -FLT_MAX;\n" + " \n" + " {\n" + " for(int face=0;face<hullB->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x,\n" + " facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n" + " const float4 WorldNormal = qtRotate(ornB, Normal);\n" + " float d = dot3F4(WorldNormal,separatingNormal);\n" + " if (d > dmax)\n" + " {\n" + " dmax = d;\n" + " closestFaceB = face;\n" + " }\n" + " }\n" + " }\n" + " \n" + " {\n" + " const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n" + " int numVertices = polyB.m_numIndices;\n" + " if (numVertices>capacityWorldVerts)\n" + " numVertices = capacityWorldVerts;\n" + " \n" + " for(int e0=0;e0<numVertices;e0++)\n" + " {\n" + " if (e0<capacityWorldVerts)\n" + " {\n" + " const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n" + " worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n" + " }\n" + " }\n" + " }\n" + " \n" + " int closestFaceA=0;\n" + " {\n" + " float dmin = FLT_MAX;\n" + " for(int face=0;face<hullA->m_numFaces;face++)\n" + " {\n" + " const float4 Normal = make_float4(\n" + " facesA[hullA->m_faceOffset+face].m_plane.x,\n" + " facesA[hullA->m_faceOffset+face].m_plane.y,\n" + " facesA[hullA->m_faceOffset+face].m_plane.z,\n" + " 0.f);\n" + " 
const float4 faceANormalWS = qtRotate(ornA,Normal);\n" + " \n" + " float d = dot3F4(faceANormalWS,separatingNormal);\n" + " if (d < dmin)\n" + " {\n" + " dmin = d;\n" + " closestFaceA = face;\n" + " worldNormalsA1[pairIndex] = faceANormalWS;\n" + " }\n" + " }\n" + " }\n" + " \n" + " int numVerticesA = facesA[hullA->m_faceOffset+closestFaceA].m_numIndices;\n" + " if (numVerticesA>capacityWorldVerts)\n" + " numVerticesA = capacityWorldVerts;\n" + " \n" + " for(int e0=0;e0<numVerticesA;e0++)\n" + " {\n" + " if (e0<capacityWorldVerts)\n" + " {\n" + " const float4 a = verticesA[hullA->m_vertexOffset+indicesA[facesA[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n" + " worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n" + " }\n" + " }\n" + " \n" + " clippingFaces[pairIndex].x = closestFaceA;\n" + " clippingFaces[pairIndex].y = closestFaceB;\n" + " clippingFaces[pairIndex].z = numVerticesA;\n" + " clippingFaces[pairIndex].w = numWorldVertsB1;\n" + " \n" + " \n" + " return numContactsOut;\n" + "}\n" + "// work-in-progress\n" + "__kernel void findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n" + " __global const BodyData* rigidBodies,\n" + " __global const btCollidableGpu* collidables,\n" + " __global const ConvexPolyhedronCL* convexShapes, \n" + " __global const float4* vertices,\n" + " __global const float4* uniqueEdges,\n" + " __global const btGpuFace* faces,\n" + " __global const int* indices,\n" + " __global const btGpuChildShape* gpuChildShapes,\n" + " __global btAabbCL* aabbs,\n" + " __global float4* concaveSeparatingNormalsOut,\n" + " __global int* concaveHasSeparatingNormals,\n" + " __global int4* clippingFacesOut,\n" + " __global float4* worldVertsA1GPU,\n" + " __global float4* worldNormalsAGPU,\n" + " __global float4* worldVertsB1GPU,\n" + " int vertexFaceCapacity,\n" + " int numConcavePairs\n" + " )\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numConcavePairs)\n" + " return;\n" + " 
concaveHasSeparatingNormals[i] = 0;\n" + " int pairIdx = i;\n" + " int bodyIndexA = concavePairs[i].x;\n" + " int bodyIndexB = concavePairs[i].y;\n" + " int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n" + " int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n" + " int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n" + " int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n" + " if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n" + " collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " concavePairs[pairIdx].w = -1;\n" + " return;\n" + " }\n" + " int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n" + " int numActualConcaveConvexTests = 0;\n" + " \n" + " int f = concavePairs[i].z;\n" + " \n" + " bool overlap = false;\n" + " \n" + " ConvexPolyhedronCL convexPolyhedronA;\n" + " //add 3 vertices of the triangle\n" + " convexPolyhedronA.m_numVertices = 3;\n" + " convexPolyhedronA.m_vertexOffset = 0;\n" + " float4 localCenter = make_float4(0.f,0.f,0.f,0.f);\n" + " btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n" + " float4 triMinAabb, triMaxAabb;\n" + " btAabbCL triAabb;\n" + " triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n" + " triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n" + " \n" + " float4 verticesA[3];\n" + " for (int i=0;i<3;i++)\n" + " {\n" + " int index = indices[face.m_indexOffset+i];\n" + " float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n" + " verticesA[i] = vert;\n" + " localCenter += vert;\n" + " \n" + " triAabb.m_min = min(triAabb.m_min,vert); \n" + " triAabb.m_max = max(triAabb.m_max,vert); \n" + " }\n" + " overlap = true;\n" + " overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n" + " overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? 
false : overlap;\n" + " overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n" + " \n" + " if (overlap)\n" + " {\n" + " float dmin = FLT_MAX;\n" + " int hasSeparatingAxis=5;\n" + " float4 sepAxis=make_float4(1,2,3,4);\n" + " int localCC=0;\n" + " numActualConcaveConvexTests++;\n" + " //a triangle has 3 unique edges\n" + " convexPolyhedronA.m_numUniqueEdges = 3;\n" + " convexPolyhedronA.m_uniqueEdgesOffset = 0;\n" + " float4 uniqueEdgesA[3];\n" + " \n" + " uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n" + " uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n" + " uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n" + " convexPolyhedronA.m_faceOffset = 0;\n" + " \n" + " float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n" + " \n" + " btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n" + " int indicesA[3+3+2+2+2];\n" + " int curUsedIndices=0;\n" + " int fidx=0;\n" + " //front size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[0] = 0;\n" + " indicesA[1] = 1;\n" + " indicesA[2] = 2;\n" + " curUsedIndices+=3;\n" + " float c = face.m_plane.w;\n" + " facesA[fidx].m_plane.x = normal.x;\n" + " facesA[fidx].m_plane.y = normal.y;\n" + " facesA[fidx].m_plane.z = normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " //back size of triangle\n" + " {\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[3]=2;\n" + " indicesA[4]=1;\n" + " indicesA[5]=0;\n" + " curUsedIndices+=3;\n" + " float c = dot(normal,verticesA[0]);\n" + " float c1 = -face.m_plane.w;\n" + " facesA[fidx].m_plane.x = -normal.x;\n" + " facesA[fidx].m_plane.y = -normal.y;\n" + " facesA[fidx].m_plane.z = -normal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " facesA[fidx].m_numIndices=3;\n" + " }\n" + " fidx++;\n" + " bool addEdgePlanes = true;\n" + " if (addEdgePlanes)\n" + " {\n" + " int numVertices=3;\n" + " int 
prevVertex = numVertices-1;\n" + " for (int i=0;i<numVertices;i++)\n" + " {\n" + " float4 v0 = verticesA[i];\n" + " float4 v1 = verticesA[prevVertex];\n" + " \n" + " float4 edgeNormal = normalize(cross(normal,v1-v0));\n" + " float c = -dot(edgeNormal,v0);\n" + " facesA[fidx].m_numIndices = 2;\n" + " facesA[fidx].m_indexOffset=curUsedIndices;\n" + " indicesA[curUsedIndices++]=i;\n" + " indicesA[curUsedIndices++]=prevVertex;\n" + " \n" + " facesA[fidx].m_plane.x = edgeNormal.x;\n" + " facesA[fidx].m_plane.y = edgeNormal.y;\n" + " facesA[fidx].m_plane.z = edgeNormal.z;\n" + " facesA[fidx].m_plane.w = c;\n" + " fidx++;\n" + " prevVertex = i;\n" + " }\n" + " }\n" + " convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n" + " convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n" + " float4 posA = rigidBodies[bodyIndexA].m_pos;\n" + " posA.w = 0.f;\n" + " float4 posB = rigidBodies[bodyIndexB].m_pos;\n" + " posB.w = 0.f;\n" + " float4 ornA = rigidBodies[bodyIndexA].m_quat;\n" + " float4 ornB =rigidBodies[bodyIndexB].m_quat;\n" + " \n" + " ///////////////////\n" + " ///compound shape support\n" + " if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n" + " {\n" + " int compoundChild = concavePairs[pairIdx].w;\n" + " int childShapeIndexB = compoundChild;//collidables[collidableIndexB].m_shapeIndex+compoundChild;\n" + " int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n" + " float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n" + " float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n" + " float4 newPosB = transform(&childPosB,&posB,&ornB);\n" + " float4 newOrnB = qtMul(ornB,childOrnB);\n" + " posB = newPosB;\n" + " ornB = newOrnB;\n" + " shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n" + " }\n" + " //////////////////\n" + " float4 c0local = convexPolyhedronA.m_localCenter;\n" + " float4 c0 = transform(&c0local, &posA, &ornA);\n" + " float4 c1local = 
convexShapes[shapeIndexB].m_localCenter;\n" + " float4 c1 = transform(&c1local,&posB,&ornB);\n" + " const float4 DeltaC2 = c0 - c1;\n" + " bool sepA = findSeparatingAxisLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " verticesA,uniqueEdgesA,facesA,indicesA,\n" + " vertices,uniqueEdges,faces,indices,\n" + " &sepAxis,&dmin);\n" + " hasSeparatingAxis = 4;\n" + " if (!sepA)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else\n" + " {\n" + " bool sepB = findSeparatingAxisLocalB( &convexShapes[shapeIndexB],&convexPolyhedronA,\n" + " posB,ornB,\n" + " posA,ornA,\n" + " DeltaC2,\n" + " vertices,uniqueEdges,faces,indices,\n" + " verticesA,uniqueEdgesA,facesA,indicesA,\n" + " &sepAxis,&dmin);\n" + " if (!sepB)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else\n" + " {\n" + " bool sepEE = findSeparatingAxisEdgeEdgeLocalA( &convexPolyhedronA, &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " DeltaC2,\n" + " verticesA,uniqueEdgesA,facesA,indicesA,\n" + " vertices,uniqueEdges,faces,indices,\n" + " &sepAxis,&dmin);\n" + " \n" + " if (!sepEE)\n" + " {\n" + " hasSeparatingAxis = 0;\n" + " } else\n" + " {\n" + " hasSeparatingAxis = 1;\n" + " }\n" + " }\n" + " } \n" + " \n" + " if (hasSeparatingAxis)\n" + " {\n" + " sepAxis.w = dmin;\n" + " concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n" + " concaveHasSeparatingNormals[i]=1;\n" + " float minDist = -1e30f;\n" + " float maxDist = 0.02f;\n" + " \n" + " findClippingFaces(sepAxis,\n" + " &convexPolyhedronA,\n" + " &convexShapes[shapeIndexB],\n" + " posA,ornA,\n" + " posB,ornB,\n" + " worldVertsA1GPU,\n" + " worldNormalsAGPU,\n" + " worldVertsB1GPU,\n" + " vertexFaceCapacity,\n" + " minDist, maxDist,\n" + " verticesA,\n" + " facesA,\n" + " indicesA,\n" + " vertices,\n" + " faces,\n" + " indices,\n" + " clippingFacesOut, pairIdx);\n" + " } else\n" + " { \n" + " //mark this pair as in-active\n" + " concavePairs[pairIdx].w = -1;\n" + " }\n" + " }\n" + 
" else\n" + " { \n" + " //mark this pair as in-active\n" + " concavePairs[pairIdx].w = -1;\n" + " }\n" + " \n" + " concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp index a4980f71e1..c0e11bfb26 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp @@ -19,149 +19,139 @@ subject to the following restrictions: #define KERNEL1 "SearchSortDataUpperKernel" #define KERNEL2 "SubtractKernel" - #include "b3BoundSearchCL.h" #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "b3LauncherCL.h" #include "kernels/BoundSearchKernelsCL.h" b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize) - :m_context(ctx), - m_device(device), - m_queue(queue) + : m_context(ctx), + m_device(device), + m_queue(queue) { - const char* additionalMacros = ""; //const char* srcFileNameForCaching=""; cl_int pErrNum; const char* kernelSource = boundSearchKernelsCL; - cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH); + cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, BOUNDSEARCH_PATH); b3Assert(boundSearchProg); - m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros ); - b3Assert(m_lowerSortDataKernel ); + m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg, additionalMacros); + b3Assert(m_lowerSortDataKernel); - m_upperSortDataKernel= 
b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros ); + m_upperSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg, additionalMacros); b3Assert(m_upperSortDataKernel); m_subtractKernel = 0; - if( maxSize ) + if (maxSize) { - m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros ); + m_subtractKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg, additionalMacros); b3Assert(m_subtractKernel); } //m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST ); - - m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize ); - m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize ); - m_filler = new b3FillCL(ctx,device,queue); + m_lower = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize); + m_upper = (maxSize == 0) ? 
0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize); + + m_filler = new b3FillCL(ctx, device, queue); } b3BoundSearchCL::~b3BoundSearchCL() { - delete m_lower; delete m_upper; delete m_filler; - + clReleaseKernel(m_lowerSortDataKernel); clReleaseKernel(m_upperSortDataKernel); clReleaseKernel(m_subtractKernel); - - } - -void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option ) +void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option) { b3Int4 constBuffer; constBuffer.x = nSrc; constBuffer.y = nDst; - if( option == BOUND_LOWER ) + if (option == BOUND_LOWER) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) }; - - b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nSrc ); - launcher.setConst( nDst ); - - launcher.launch1D( nSrc, 64 ); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())}; + + b3LauncherCL launcher(m_queue, m_lowerSortDataKernel, "m_lowerSortDataKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nSrc); + launcher.setConst(nDst); + + launcher.launch1D(nSrc, 64); } - else if( option == BOUND_UPPER ) + else if (option == BOUND_UPPER) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())}; - b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nSrc ); - launcher.setConst( nDst ); + b3LauncherCL launcher(m_queue, m_upperSortDataKernel, 
"m_upperSortDataKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nSrc); + launcher.setConst(nDst); - launcher.launch1D( nSrc, 64 ); + launcher.launch1D(nSrc, 64); } - else if( option == COUNT ) + else if (option == COUNT) { - b3Assert( m_lower ); - b3Assert( m_upper ); - b3Assert( m_lower->capacity() <= (int)nDst ); - b3Assert( m_upper->capacity() <= (int)nDst ); + b3Assert(m_lower); + b3Assert(m_upper); + b3Assert(m_lower->capacity() <= (int)nDst); + b3Assert(m_upper->capacity() <= (int)nDst); int zero = 0; - m_filler->execute( *m_lower, zero, nDst ); - m_filler->execute( *m_upper, zero, nDst ); + m_filler->execute(*m_lower, zero, nDst); + m_filler->execute(*m_upper, zero, nDst); - execute( src, nSrc, *m_lower, nDst, BOUND_LOWER ); - execute( src, nSrc, *m_upper, nDst, BOUND_UPPER ); + execute(src, nSrc, *m_lower, nDst, BOUND_LOWER); + execute(src, nSrc, *m_upper, nDst, BOUND_UPPER); { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_upper->getBufferCL(), true), b3BufferInfoCL(m_lower->getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())}; - b3LauncherCL launcher( m_queue, m_subtractKernel ,"m_subtractKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( nSrc ); - launcher.setConst( nDst ); + b3LauncherCL launcher(m_queue, m_subtractKernel, "m_subtractKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(nSrc); + launcher.setConst(nDst); - launcher.launch1D( nDst, 64 ); + launcher.launch1D(nDst, 64); } } else { - b3Assert( 0 ); + b3Assert(0); } - } - -void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, - b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option ) +void 
b3BoundSearchCL::executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, + b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option) { + for (int i = 0; i < nSrc - 1; i++) + b3Assert(src[i].m_key <= src[i + 1].m_key); - - for(int i=0; i<nSrc-1; i++) - b3Assert( src[i].m_key <= src[i+1].m_key ); - - b3SortData minData,zeroData,maxData; + b3SortData minData, zeroData, maxData; minData.m_key = -1; minData.m_value = -1; - zeroData.m_key=0; - zeroData.m_value=0; + zeroData.m_key = 0; + zeroData.m_value = 0; maxData.m_key = nDst; maxData.m_value = nDst; - if( option == BOUND_LOWER ) + if (option == BOUND_LOWER) { - for(int i=0; i<nSrc; i++) + for (int i = 0; i < nSrc; i++) { - b3SortData& iData = (i==0)? minData: src[i-1]; - b3SortData& jData = (i==nSrc)? maxData: src[i]; + b3SortData& iData = (i == 0) ? minData : src[i - 1]; + b3SortData& jData = (i == nSrc) ? maxData : src[i]; - if( iData.m_key != jData.m_key ) + if (iData.m_key != jData.m_key) { int k = jData.m_key; { @@ -170,14 +160,14 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS } } } - else if( option == BOUND_UPPER ) + else if (option == BOUND_UPPER) { - for(int i=1; i<nSrc+1; i++) + for (int i = 1; i < nSrc + 1; i++) { - b3SortData& iData = src[i-1]; - b3SortData& jData = (i==nSrc)? maxData: src[i]; + b3SortData& iData = src[i - 1]; + b3SortData& jData = (i == nSrc) ? 
maxData : src[i]; - if( iData.m_key != jData.m_key ) + if (iData.m_key != jData.m_key) { int k = iData.m_key; { @@ -186,28 +176,28 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS } } } - else if( option == COUNT ) + else if (option == COUNT) { b3AlignedObjectArray<unsigned int> lower; - lower.resize(nDst ); + lower.resize(nDst); b3AlignedObjectArray<unsigned int> upper; - upper.resize(nDst ); + upper.resize(nDst); - for(int i=0; i<nDst; i++) - { - lower[i] = upper[i] = 0; + for (int i = 0; i < nDst; i++) + { + lower[i] = upper[i] = 0; } - executeHost( src, nSrc, lower, nDst, BOUND_LOWER ); - executeHost( src, nSrc, upper, nDst, BOUND_UPPER ); + executeHost(src, nSrc, lower, nDst, BOUND_LOWER); + executeHost(src, nSrc, upper, nDst, BOUND_UPPER); - for( int i=0; i<nDst; i++) - { - dst[i] = upper[i] - lower[i]; + for (int i = 0; i < nDst; i++) + { + dst[i] = upper[i] - lower[i]; } } else { - b3Assert( 0 ); + b3Assert(0); } } diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h index 7e2940965c..0d633e3d23 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h @@ -26,42 +26,39 @@ subject to the following restrictions: #include "b3OpenCLArray.h" #include "b3FillCL.h" -#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?) +#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?) 
class b3BoundSearchCL { - public: +public: + enum Option + { + BOUND_LOWER, + BOUND_UPPER, + COUNT, + }; - enum Option - { - BOUND_LOWER, - BOUND_UPPER, - COUNT, - }; + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; + cl_kernel m_lowerSortDataKernel; + cl_kernel m_upperSortDataKernel; + cl_kernel m_subtractKernel; - - cl_kernel m_lowerSortDataKernel; - cl_kernel m_upperSortDataKernel; - cl_kernel m_subtractKernel; - - b3OpenCLArray<b3Int4>* m_constbtOpenCLArray; - b3OpenCLArray<unsigned int>* m_lower; - b3OpenCLArray<unsigned int>* m_upper; - - b3FillCL* m_filler; - - b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size); + b3OpenCLArray<b3Int4>* m_constbtOpenCLArray; + b3OpenCLArray<unsigned int>* m_lower; + b3OpenCLArray<unsigned int>* m_upper; - virtual ~b3BoundSearchCL(); + b3FillCL* m_filler; - // src has to be src[i].m_key <= src[i+1].m_key - void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER ); + b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size); - void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER); -}; + virtual ~b3BoundSearchCL(); + + // src has to be src[i].m_key <= src[i+1].m_key + void execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER); + void executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER); +}; -#endif //B3_BOUNDSEARCH_H +#endif //B3_BOUNDSEARCH_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h index 52f219ae3f..35fc467b20 100644 --- 
a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h @@ -4,16 +4,15 @@ #include "b3OpenCLArray.h" - struct b3BufferInfoCL { //b3BufferInfoCL(){} -// template<typename T> - b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){} + // template<typename T> + b3BufferInfoCL(cl_mem buff, bool isReadOnly = false) : m_clBuffer(buff), m_isReadOnly(isReadOnly) {} cl_mem m_clBuffer; bool m_isReadOnly; }; -#endif //B3_BUFFER_INFO_CL_H +#endif //B3_BUFFER_INFO_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp index f05c2648f1..bd25bb2101 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp @@ -8,29 +8,26 @@ #include "kernels/FillKernelsCL.h" b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue) -:m_commandQueue(queue) + : m_commandQueue(queue) { const char* kernelSource = fillKernelsCL; cl_int pErrNum; const char* additionalMacros = ""; - cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH); + cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, FILL_CL_PROGRAM_PATH); b3Assert(fillProg); - m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros ); + m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg, additionalMacros); b3Assert(m_fillIntKernel); - m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros ); + m_fillUnsignedIntKernel 
= b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg, additionalMacros); b3Assert(m_fillIntKernel); - m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros ); + m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg, additionalMacros); b3Assert(m_fillFloatKernel); - - - m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros ); + m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg, additionalMacros); b3Assert(m_fillKernelInt2); - } b3FillCL::~b3FillCL() @@ -39,88 +36,84 @@ b3FillCL::~b3FillCL() clReleaseKernel(m_fillIntKernel); clReleaseKernel(m_fillUnsignedIntKernel); clReleaseKernel(m_fillFloatKernel); - } void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset) { - b3Assert( n>0 ); + b3Assert(n > 0); { - b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" ); - launcher.setBuffer( src.getBufferCL()); - launcher.setConst( n ); - launcher.setConst( value ); - launcher.setConst( offset); + b3LauncherCL launcher(m_commandQueue, m_fillFloatKernel, "m_fillFloatKernel"); + launcher.setBuffer(src.getBufferCL()); + launcher.setConst(n); + launcher.setConst(value); + launcher.setConst(offset); - launcher.launch1D( n ); + launcher.launch1D(n); } } void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset) { - b3Assert( n>0 ); - + b3Assert(n > 0); { - b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel"); + b3LauncherCL launcher(m_commandQueue, m_fillIntKernel, "m_fillIntKernel"); launcher.setBuffer(src.getBufferCL()); - launcher.setConst( n); - launcher.setConst( value); - launcher.setConst( offset); 
- launcher.launch1D( n ); + launcher.setConst(n); + launcher.setConst(value); + launcher.setConst(offset); + launcher.launch1D(n); } } - void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset) { - b3Assert( n>0 ); + b3Assert(n > 0); { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())}; - b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( n ); - launcher.setConst(value); + b3LauncherCL launcher(m_commandQueue, m_fillUnsignedIntKernel, "m_fillUnsignedIntKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(n); + launcher.setConst(value); launcher.setConst(offset); - launcher.launch1D( n ); + launcher.launch1D(n); } } -void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset) +void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset) { - for (int i=0;i<n;i++) + for (int i = 0; i < n; i++) { - src[i+offset]=value; + src[i + offset] = value; } } -void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset) +void b3FillCL::executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset) { - for (int i=0;i<n;i++) + for (int i = 0; i < n; i++) { - src[i+offset]=value; + src[i + offset] = value; } } -void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset) +void b3FillCL::execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset) { - b3Assert( n>0 ); - + b3Assert(n > 0); { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())}; - b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2"); - 
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2, "m_fillKernelInt2"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(n); launcher.setConst(value); launcher.setConst(offset); //( constBuffer ); - launcher.launch1D( n ); + launcher.launch1D(n); } } diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h index 1609676b9d..c92c3e5119 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h @@ -7,57 +7,46 @@ #include "Bullet3Common/shared/b3Int2.h" #include "Bullet3Common/shared/b3Int4.h" - class b3FillCL { - - cl_command_queue m_commandQueue; - - cl_kernel m_fillKernelInt2; - cl_kernel m_fillIntKernel; - cl_kernel m_fillUnsignedIntKernel; - cl_kernel m_fillFloatKernel; - - public: - - struct b3ConstData - { - union - { - b3Int4 m_data; - b3UnsignedInt4 m_UnsignedData; - }; - int m_offset; - int m_n; - int m_padding[2]; + cl_command_queue m_commandQueue; + + cl_kernel m_fillKernelInt2; + cl_kernel m_fillIntKernel; + cl_kernel m_fillUnsignedIntKernel; + cl_kernel m_fillFloatKernel; + +public: + struct b3ConstData + { + union { + b3Int4 m_data; + b3UnsignedInt4 m_UnsignedData; }; + int m_offset; + int m_n; + int m_padding[2]; + }; protected: - public: + b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue); - b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue); + virtual ~b3FillCL(); - virtual ~b3FillCL(); + void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0); - void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0); - - void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0); + void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 
0); - void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0); + void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0); - void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0); + void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0); - void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset); + void executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset); - void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset); + void executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset); // void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0); - }; - - - - -#endif //B3_FILL_CL_H +#endif //B3_FILL_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp index 94590d11ca..c97d02eb45 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp @@ -1,13 +1,13 @@ #include "b3LauncherCL.h" bool gDebugLauncherCL = false; - + b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name) -:m_commandQueue(queue), -m_kernel(kernel), -m_idx(0), -m_enableSerialization(false), -m_name(name) + : m_commandQueue(queue), + m_kernel(kernel), + m_idx(0), + m_enableSerialization(false), + m_name(name) { if (gDebugLauncherCL) { @@ -15,59 +15,58 @@ m_name(name) printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name); } - m_serializationSizeInBytes = sizeof(int); + m_serializationSizeInBytes = sizeof(int); } - + b3LauncherCL::~b3LauncherCL() - { - for (int i=0;i<m_arrays.size();i++) - { - delete (m_arrays[i]); - } - - m_arrays.clear(); - if (gDebugLauncherCL) - { +{ + for (int i = 0; i < m_arrays.size(); i++) + 
{ + delete (m_arrays[i]); + } + + m_arrays.clear(); + if (gDebugLauncherCL) + { static int counter = 0; - printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name); - } - } + printf("[%d] Finished launching OpenCL kernel %s\n", counter++, m_name); + } +} -void b3LauncherCL::setBuffer( cl_mem clBuffer) +void b3LauncherCL::setBuffer(cl_mem clBuffer) { - if (m_enableSerialization) - { - b3KernelArgData kernelArg; - kernelArg.m_argIndex = m_idx; - kernelArg.m_isBuffer = 1; - kernelArg.m_clBuffer = clBuffer; - - cl_mem_info param_name = CL_MEM_SIZE; - size_t param_value; - size_t sizeInBytes = sizeof(size_t); - size_t actualSizeInBytes; - cl_int err; - err = clGetMemObjectInfo ( kernelArg.m_clBuffer, - param_name, - sizeInBytes, - &param_value, - &actualSizeInBytes); - - b3Assert( err == CL_SUCCESS ); - kernelArg.m_argSizeInBytes = param_value; - - m_kernelArguments.push_back(kernelArg); - m_serializationSizeInBytes+= sizeof(b3KernelArgData); - m_serializationSizeInBytes+=param_value; - } - cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer); - b3Assert( status == CL_SUCCESS ); -} + if (m_enableSerialization) + { + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 1; + kernelArg.m_clBuffer = clBuffer; + cl_mem_info param_name = CL_MEM_SIZE; + size_t param_value; + size_t sizeInBytes = sizeof(size_t); + size_t actualSizeInBytes; + cl_int err; + err = clGetMemObjectInfo(kernelArg.m_clBuffer, + param_name, + sizeInBytes, + &param_value, + &actualSizeInBytes); + + b3Assert(err == CL_SUCCESS); + kernelArg.m_argSizeInBytes = param_value; + + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes += sizeof(b3KernelArgData); + m_serializationSizeInBytes += param_value; + } + cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer); + b3Assert(status == CL_SUCCESS); +} -void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n ) +void b3LauncherCL::setBuffers(b3BufferInfoCL* 
buffInfo, int n) { - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) { if (m_enableSerialization) { @@ -75,106 +74,103 @@ void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n ) kernelArg.m_argIndex = m_idx; kernelArg.m_isBuffer = 1; kernelArg.m_clBuffer = buffInfo[i].m_clBuffer; - + cl_mem_info param_name = CL_MEM_SIZE; size_t param_value; size_t sizeInBytes = sizeof(size_t); size_t actualSizeInBytes; cl_int err; - err = clGetMemObjectInfo ( kernelArg.m_clBuffer, - param_name, - sizeInBytes, - &param_value, - &actualSizeInBytes); - - b3Assert( err == CL_SUCCESS ); + err = clGetMemObjectInfo(kernelArg.m_clBuffer, + param_name, + sizeInBytes, + &param_value, + &actualSizeInBytes); + + b3Assert(err == CL_SUCCESS); kernelArg.m_argSizeInBytes = param_value; - + m_kernelArguments.push_back(kernelArg); - m_serializationSizeInBytes+= sizeof(b3KernelArgData); - m_serializationSizeInBytes+=param_value; - } - cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer); - b3Assert( status == CL_SUCCESS ); - } + m_serializationSizeInBytes += sizeof(b3KernelArgData); + m_serializationSizeInBytes += param_value; + } + cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer); + b3Assert(status == CL_SUCCESS); + } } struct b3KernelArgDataUnaligned { - int m_isBuffer; - int m_argIndex; - int m_argSizeInBytes; + int m_isBuffer; + int m_argIndex; + int m_argSizeInBytes; int m_unusedPadding; - union - { - cl_mem m_clBuffer; - unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; - }; - + union { + cl_mem m_clBuffer; + unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; + }; }; #include <string.h> - - int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx) { - int index=0; - - int numArguments = *(int*) &buf[index]; - index+=sizeof(int); - - for (int i=0;i<numArguments;i++) - { - b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index]; - - index+=sizeof(b3KernelArgData); - if 
(arg->m_isBuffer) - { - b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes); - clData->resize(arg->m_argSizeInBytes); - - clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes); - - arg->m_clBuffer = clData->getBufferCL(); - - m_arrays.push_back(clData); - - cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer); - b3Assert( status == CL_SUCCESS ); - index+=arg->m_argSizeInBytes; - } else - { - cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData); - b3Assert( status == CL_SUCCESS ); - } + int index = 0; + + int numArguments = *(int*)&buf[index]; + index += sizeof(int); + + for (int i = 0; i < numArguments; i++) + { + b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index]; + + index += sizeof(b3KernelArgData); + if (arg->m_isBuffer) + { + b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx, m_commandQueue, arg->m_argSizeInBytes); + clData->resize(arg->m_argSizeInBytes); + + clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes); + + arg->m_clBuffer = clData->getBufferCL(); + + m_arrays.push_back(clData); + + cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer); + b3Assert(status == CL_SUCCESS); + index += arg->m_argSizeInBytes; + } + else + { + cl_int status = clSetKernelArg(m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData); + b3Assert(status == CL_SUCCESS); + } b3KernelArgData b; - memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned)); - m_kernelArguments.push_back(b); - } -m_serializationSizeInBytes = index; - return index; + memcpy(&b, arg, sizeof(b3KernelArgDataUnaligned)); + m_kernelArguments.push_back(b); + } + m_serializationSizeInBytes = index; + return index; } int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx) - { - int index=0; - - int numArguments = *(int*) &goldBuffer[index]; - 
index+=sizeof(int); +{ + int index = 0; + + int numArguments = *(int*)&goldBuffer[index]; + index += sizeof(int); if (numArguments != m_kernelArguments.size()) { - printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size()); + printf("failed validation: expected %d arguments, found %d\n", numArguments, m_kernelArguments.size()); return -1; } - - for (int ii=0;ii<numArguments;ii++) - { - b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index]; + + for (int ii = 0; ii < numArguments; ii++) + { + b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index]; if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes) { - printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes); + printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n", ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes); return -2; } @@ -184,125 +180,117 @@ int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapac if (expected != found) { - printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found); + printf("failed validation: argument %d isBuffer expected: %d, found %d\n", ii, expected, found); return -3; } } - index+=sizeof(b3KernelArgData); + index += sizeof(b3KernelArgData); if (argGold->m_isBuffer) - { - - unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes); + { + unsigned char* memBuf = (unsigned char*)malloc(m_kernelArguments[ii].m_argSizeInBytes); unsigned char* goldBuf = &goldBuffer[index]; - for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++) + for (int j = 0; j < m_kernelArguments[j].m_argSizeInBytes; j++) { memBuf[j] = 0xaa; } cl_int status = 0; - status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes, - memBuf, 0,0,0 ); - 
b3Assert( status==CL_SUCCESS ); - clFinish(m_commandQueue); + status = clEnqueueReadBuffer(m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes, + memBuf, 0, 0, 0); + b3Assert(status == CL_SUCCESS); + clFinish(m_commandQueue); - for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++) + for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++) { int expected = goldBuf[b]; int found = memBuf[b]; if (expected != found) { printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n", - ii, b, expected, found); + ii, b, expected, found); return -4; } } - - index+=argGold->m_argSizeInBytes; - } else - { - + index += argGold->m_argSizeInBytes; + } + else + { //compare content - for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++) + for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++) { int expected = argGold->m_argData[b]; - int found =m_kernelArguments[ii].m_argData[b]; + int found = m_kernelArguments[ii].m_argData[b]; if (expected != found) { printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n", - ii, b, expected, found); + ii, b, expected, found); return -5; } } - - } - } - return index; - + } + } + return index; } int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity) { -//initialize to known values -for (int i=0;i<destBufferCapacity;i++) - destBuffer[i] = 0xec; - - assert(destBufferCapacity>=m_serializationSizeInBytes); - - //todo: use the b3Serializer for this to allow for 32/64bit, endianness etc - int numArguments = m_kernelArguments.size(); - int curBufferSize = 0; - int* dest = (int*)&destBuffer[curBufferSize]; - *dest = numArguments; - curBufferSize += sizeof(int); - - - - for (int i=0;i<this->m_kernelArguments.size();i++) - { - b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize]; - *arg = m_kernelArguments[i]; - curBufferSize+=sizeof(b3KernelArgData); - if 
(arg->m_isBuffer==1) - { - //copy the OpenCL buffer content - cl_int status = 0; - status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes, - &destBuffer[curBufferSize], 0,0,0 ); - b3Assert( status==CL_SUCCESS ); - clFinish(m_commandQueue); - curBufferSize+=arg->m_argSizeInBytes; - } - - } - return curBufferSize; + //initialize to known values + for (int i = 0; i < destBufferCapacity; i++) + destBuffer[i] = 0xec; + + assert(destBufferCapacity >= m_serializationSizeInBytes); + + //todo: use the b3Serializer for this to allow for 32/64bit, endianness etc + int numArguments = m_kernelArguments.size(); + int curBufferSize = 0; + int* dest = (int*)&destBuffer[curBufferSize]; + *dest = numArguments; + curBufferSize += sizeof(int); + + for (int i = 0; i < this->m_kernelArguments.size(); i++) + { + b3KernelArgData* arg = (b3KernelArgData*)&destBuffer[curBufferSize]; + *arg = m_kernelArguments[i]; + curBufferSize += sizeof(b3KernelArgData); + if (arg->m_isBuffer == 1) + { + //copy the OpenCL buffer content + cl_int status = 0; + status = clEnqueueReadBuffer(m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes, + &destBuffer[curBufferSize], 0, 0, 0); + b3Assert(status == CL_SUCCESS); + clFinish(m_commandQueue); + curBufferSize += arg->m_argSizeInBytes; + } + } + return curBufferSize; } void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems) { int num = numWorkItems; int buffSize = getSerializationBufferSize(); - unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; - for (int i=0;i<buffSize+1;i++) + unsigned char* buf = new unsigned char[buffSize + sizeof(int)]; + for (int i = 0; i < buffSize + 1; i++) { unsigned char* ptr = (unsigned char*)&buf[i]; *ptr = 0xff; } -// int actualWrite = serializeArguments(buf,buffSize); - -// unsigned char* cptr = (unsigned char*)&buf[buffSize]; -// printf("buf[buffSize] = %d\n",*cptr); - - assert(buf[buffSize]==0xff);//check for buffer overrun + // int actualWrite = 
serializeArguments(buf,buffSize); + + // unsigned char* cptr = (unsigned char*)&buf[buffSize]; + // printf("buf[buffSize] = %d\n",*cptr); + + assert(buf[buffSize] == 0xff); //check for buffer overrun int* ptr = (int*)&buf[buffSize]; - + *ptr = num; - - FILE* f = fopen(fileName,"wb"); - fwrite(buf,buffSize+sizeof(int),1,f); + + FILE* f = fopen(fileName, "wb"); + fwrite(buf, buffSize + sizeof(int), 1, f); fclose(f); delete[] buf; -} - +} diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h index 1b267b31ef..18e9c1db2b 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h @@ -9,60 +9,57 @@ #define B3_DEBUG_SERIALIZE_CL - #ifdef _WIN32 -#pragma warning(disable :4996) +#pragma warning(disable : 4996) #endif #define B3_CL_MAX_ARG_SIZE 16 -B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData +B3_ATTRIBUTE_ALIGNED16(struct) +b3KernelArgData { - int m_isBuffer; - int m_argIndex; - int m_argSizeInBytes; + int m_isBuffer; + int m_argIndex; + int m_argSizeInBytes; int m_unusedPadding; - union - { - cl_mem m_clBuffer; - unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; - }; - + union { + cl_mem m_clBuffer; + unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; + }; }; class b3LauncherCL { - cl_command_queue m_commandQueue; cl_kernel m_kernel; int m_idx; - - b3AlignedObjectArray<b3KernelArgData> m_kernelArguments; - int m_serializationSizeInBytes; - bool m_enableSerialization; + + b3AlignedObjectArray<b3KernelArgData> m_kernelArguments; + int m_serializationSizeInBytes; + bool m_enableSerialization; const char* m_name; - public: - - b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays; - - b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name); - - virtual ~b3LauncherCL(); - - void setBuffer( cl_mem clBuffer); - - void setBuffers( b3BufferInfoCL* buffInfo, int n ); - - int 
getSerializationBufferSize() const - { - return m_serializationSizeInBytes; - } - - int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx); + +public: + b3AlignedObjectArray<b3OpenCLArray<unsigned char>*> m_arrays; + + b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name); + + virtual ~b3LauncherCL(); + + void setBuffer(cl_mem clBuffer); + + void setBuffers(b3BufferInfoCL* buffInfo, int n); + + int getSerializationBufferSize() const + { + return m_serializationSizeInBytes; + } + + int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx); inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx); - int serializeArguments(unsigned char* destBuffer, int destBufferCapacity); - + int serializeArguments(unsigned char* destBuffer, int destBufferCapacity); + int getNumArguments() const { return m_kernelArguments.size(); @@ -75,61 +72,57 @@ class b3LauncherCL void serializeToFile(const char* fileName, int numWorkItems); - template<typename T> - inline void setConst( const T& consts ) - { - int sz=sizeof(T); - b3Assert(sz<=B3_CL_MAX_ARG_SIZE); - - if (m_enableSerialization) - { - b3KernelArgData kernelArg; - kernelArg.m_argIndex = m_idx; - kernelArg.m_isBuffer = 0; - T* destArg = (T*)kernelArg.m_argData; - *destArg = consts; - kernelArg.m_argSizeInBytes = sizeof(T); - m_kernelArguments.push_back(kernelArg); - m_serializationSizeInBytes+=sizeof(b3KernelArgData); - } - - cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts ); - b3Assert( status == CL_SUCCESS ); - } + template <typename T> + inline void setConst(const T& consts) + { + int sz = sizeof(T); + b3Assert(sz <= B3_CL_MAX_ARG_SIZE); - inline void launch1D( int numThreads, int localSize = 64) + if (m_enableSerialization) { - launch2D( numThreads, 1, localSize, 1 ); + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 0; + T* destArg = (T*)kernelArg.m_argData; + *destArg = consts; + 
kernelArg.m_argSizeInBytes = sizeof(T); + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes += sizeof(b3KernelArgData); } - inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY ) - { - size_t gRange[3] = {1,1,1}; - size_t lRange[3] = {1,1,1}; - lRange[0] = localSizeX; - lRange[1] = localSizeY; - gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1)); - gRange[0] *= lRange[0]; - gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1)); - gRange[1] *= lRange[1]; - - cl_int status = clEnqueueNDRangeKernel( m_commandQueue, - m_kernel, 2, NULL, gRange, lRange, 0,0,0 ); - if (status != CL_SUCCESS) - { - printf("Error: OpenCL status = %d\n",status); - } - b3Assert( status == CL_SUCCESS ); + cl_int status = clSetKernelArg(m_kernel, m_idx++, sz, &consts); + b3Assert(status == CL_SUCCESS); + } - } - - void enableSerialization(bool serialize) + inline void launch1D(int numThreads, int localSize = 64) + { + launch2D(numThreads, 1, localSize, 1); + } + + inline void launch2D(int numThreadsX, int numThreadsY, int localSizeX, int localSizeY) + { + size_t gRange[3] = {1, 1, 1}; + size_t lRange[3] = {1, 1, 1}; + lRange[0] = localSizeX; + lRange[1] = localSizeY; + gRange[0] = b3Max((size_t)1, (numThreadsX / lRange[0]) + (!(numThreadsX % lRange[0]) ? 0 : 1)); + gRange[0] *= lRange[0]; + gRange[1] = b3Max((size_t)1, (numThreadsY / lRange[1]) + (!(numThreadsY % lRange[1]) ? 
0 : 1)); + gRange[1] *= lRange[1]; + + cl_int status = clEnqueueNDRangeKernel(m_commandQueue, + m_kernel, 2, NULL, gRange, lRange, 0, 0, 0); + if (status != CL_SUCCESS) { - m_enableSerialization = serialize; + printf("Error: OpenCL status = %d\n", status); } - -}; - + b3Assert(status == CL_SUCCESS); + } + void enableSerialization(bool serialize) + { + m_enableSerialization = serialize; + } +}; -#endif //B3_LAUNCHER_CL_H +#endif //B3_LAUNCHER_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h index d70c30f53f..e837cceb66 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h @@ -7,16 +7,16 @@ template <typename T> class b3OpenCLArray { - size_t m_size; - size_t m_capacity; - cl_mem m_clBuffer; + size_t m_size; + size_t m_capacity; + cl_mem m_clBuffer; - cl_context m_clContext; + cl_context m_clContext; cl_command_queue m_commandQueue; - bool m_ownsMemory; + bool m_ownsMemory; - bool m_allowGrowingCapacity; + bool m_allowGrowingCapacity; void deallocate() { @@ -25,22 +25,19 @@ class b3OpenCLArray clReleaseMemObject(m_clBuffer); } m_clBuffer = 0; - m_capacity=0; + m_capacity = 0; } b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src); - B3_FORCE_INLINE size_t allocSize(size_t size) - { - return (size ? size*2 : 1); - } + B3_FORCE_INLINE size_t allocSize(size_t size) + { + return (size ? 
size * 2 : 1); + } public: - - b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true) - :m_size(0), m_capacity(0),m_clBuffer(0), - m_clContext(ctx),m_commandQueue(queue), - m_ownsMemory(true),m_allowGrowingCapacity(true) + b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity = 0, bool allowGrowingCapacity = true) + : m_size(0), m_capacity(0), m_clBuffer(0), m_clContext(ctx), m_commandQueue(queue), m_ownsMemory(true), m_allowGrowingCapacity(true) { if (initialCapacity) { @@ -60,34 +57,32 @@ public: m_capacity = sizeInElements; } -// we could enable this assignment, but need to make sure to avoid accidental deep copies -// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src) -// { -// copyFromArray(src); -// return *this; -// } + // we could enable this assignment, but need to make sure to avoid accidental deep copies + // b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src) + // { + // copyFromArray(src); + // return *this; + // } - - cl_mem getBufferCL() const + cl_mem getBufferCL() const { return m_clBuffer; } - virtual ~b3OpenCLArray() { deallocate(); - m_size=0; - m_capacity=0; + m_size = 0; + m_capacity = 0; } - B3_FORCE_INLINE bool push_back(const T& _Val,bool waitForCompletion=true) + B3_FORCE_INLINE bool push_back(const T& _Val, bool waitForCompletion = true) { bool result = true; size_t sz = size(); - if( sz == capacity() ) + if (sz == capacity()) { - result = reserve( allocSize(size()) ); + result = reserve(allocSize(size())); } copyFromHostPointer(&_Val, 1, sz, waitForCompletion); m_size++; @@ -96,23 +91,23 @@ public: B3_FORCE_INLINE T forcedAt(size_t n) const { - b3Assert(n>=0); - b3Assert(n<capacity()); + b3Assert(n >= 0); + b3Assert(n < capacity()); T elem; - copyToHostPointer(&elem,1,n,true); + copyToHostPointer(&elem, 1, n, true); return elem; } B3_FORCE_INLINE T at(size_t n) const { - b3Assert(n>=0); - b3Assert(n<size()); + b3Assert(n >= 0); + 
b3Assert(n < size()); T elem; - copyToHostPointer(&elem,1,n,true); + copyToHostPointer(&elem, 1, n, true); return elem; } - B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents=true) + B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents = true) { bool result = true; size_t curSize = size(); @@ -120,11 +115,12 @@ public: if (newsize < curSize) { //leave the OpenCL memory for now - } else + } + else { if (newsize > size()) { - result = reserve(newsize,copyOldContents); + result = reserve(newsize, copyOldContents); } //leave new data uninitialized (init in debug mode?) @@ -134,7 +130,8 @@ public: if (result) { m_size = newsize; - } else + } + else { m_size = 0; } @@ -146,25 +143,25 @@ public: return m_size; } - B3_FORCE_INLINE size_t capacity() const + B3_FORCE_INLINE size_t capacity() const { return m_capacity; } - B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents=true) + B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents = true) { - bool result=true; + bool result = true; // determine new minimum length of allocated storage if (capacity() < _Count) - { // not enough room, reallocate + { // not enough room, reallocate if (m_allowGrowingCapacity) { cl_int ciErrNum; //create a new OpenCL buffer - size_t memSizeInBytes = sizeof(T)*_Count; + size_t memSizeInBytes = sizeof(T) * _Count; cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum); - if (ciErrNum!=CL_SUCCESS) + if (ciErrNum != CL_SUCCESS) { b3Error("OpenCL out-of-memory\n"); _Count = 0; @@ -173,13 +170,13 @@ public: //#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS #ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS unsigned char* src = (unsigned char*)malloc(memSizeInBytes); - for (size_t i=0;i<memSizeInBytes;i++) + for (size_t i = 0; i < memSizeInBytes; i++) src[i] = 0xbb; - ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 ); - b3Assert(ciErrNum==CL_SUCCESS); + ciErrNum = 
clEnqueueWriteBuffer(m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0, 0, 0); + b3Assert(ciErrNum == CL_SUCCESS); clFinish(m_commandQueue); free(src); -#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS +#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS if (result) { @@ -193,21 +190,21 @@ public: m_clBuffer = buf; m_capacity = _Count; - } else + } + else { //fail: assert and b3Assert(0); deallocate(); - result=false; + result = false; } } return result; } - - void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const + void copyToCL(cl_mem destination, size_t numElements, size_t firstElem = 0, size_t dstOffsetInElems = 0) const { - if (numElements<=0) + if (numElements <= 0) return; b3Assert(m_clBuffer); @@ -216,75 +213,74 @@ public: //likely some error, destination is same as source b3Assert(m_clBuffer != destination); - b3Assert((firstElem+numElements)<=m_size); + b3Assert((firstElem + numElements) <= m_size); cl_int status = 0; + b3Assert(numElements > 0); + b3Assert(numElements <= m_size); - b3Assert(numElements>0); - b3Assert(numElements<=m_size); - - size_t srcOffsetBytes = sizeof(T)*firstElem; - size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems; + size_t srcOffsetBytes = sizeof(T) * firstElem; + size_t dstOffsetInBytes = sizeof(T) * dstOffsetInElems; - status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination, - srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 ); + status = clEnqueueCopyBuffer(m_commandQueue, m_clBuffer, destination, + srcOffsetBytes, dstOffsetInBytes, sizeof(T) * numElements, 0, 0, 0); - b3Assert( status == CL_SUCCESS ); + b3Assert(status == CL_SUCCESS); } - void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true) + void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion = true) { size_t newSize = srcArray.size(); bool copyOldContents = false; - resize (newSize,copyOldContents); + resize(newSize, 
copyOldContents); if (newSize) - copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion); - + copyFromHostPointer(&srcArray[0], newSize, 0, waitForCompletion); } - void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true) + void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem = 0, bool waitForCompletion = true) { - b3Assert(numElems+destFirstElem <= capacity()); + b3Assert(numElems + destFirstElem <= capacity()); - if (numElems+destFirstElem) + if (numElems + destFirstElem) { cl_int status = 0; - size_t sizeInBytes=sizeof(T)*numElems; - status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes, - src, 0,0,0 ); - b3Assert(status == CL_SUCCESS ); + size_t sizeInBytes = sizeof(T) * numElems; + status = clEnqueueWriteBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * destFirstElem, sizeInBytes, + src, 0, 0, 0); + b3Assert(status == CL_SUCCESS); if (waitForCompletion) clFinish(m_commandQueue); - } else + } + else { b3Error("copyFromHostPointer invalid range\n"); } } - - void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const + void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion = true) const { destArray.resize(this->size()); if (size()) - copyToHostPointer(&destArray[0], size(),0,waitForCompletion); + copyToHostPointer(&destArray[0], size(), 0, waitForCompletion); } - void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const + void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem = 0, bool waitForCompletion = true) const { - b3Assert(numElem+srcFirstElem <= capacity()); + b3Assert(numElem + srcFirstElem <= capacity()); - if(numElem+srcFirstElem <= capacity()) + if (numElem + srcFirstElem <= capacity()) { cl_int status = 0; - status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem, - 
destPtr, 0,0,0 ); - b3Assert( status==CL_SUCCESS ); + status = clEnqueueReadBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * srcFirstElem, sizeof(T) * numElem, + destPtr, 0, 0, 0); + b3Assert(status == CL_SUCCESS); if (waitForCompletion) clFinish(m_commandQueue); - } else + } + else { b3Error("copyToHostPointer invalid range\n"); } @@ -296,11 +292,9 @@ public: resize(newSize); if (size()) { - src.copyToCL(m_clBuffer,size()); + src.copyToCL(m_clBuffer, size()); } } - }; - -#endif //B3_OPENCL_ARRAY_H +#endif //B3_OPENCL_ARRAY_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp index 42cd197740..822b511633 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp @@ -7,25 +7,24 @@ #include "kernels/PrefixScanKernelsCL.h" b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) -:m_commandQueue(queue) + : m_commandQueue(queue) { const char* scanKernelSource = prefixScanKernelsCL; cl_int pErrNum; - char* additionalMacros=0; + char* additionalMacros = 0; - m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size); - cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH); + m_workBuffer = new b3OpenCLArray<unsigned int>(ctx, queue, size); + cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_PROG_PATH); b3Assert(scanProg); - m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); - b3Assert(m_localScanKernel ); - m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, 
scanProg,additionalMacros ); - b3Assert(m_blockSumKernel ); - m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); - b3Assert(m_propagationKernel ); + m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros); + b3Assert(m_localScanKernel); + m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros); + b3Assert(m_blockSumKernel); + m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros); + b3Assert(m_propagationKernel); } - b3PrefixScanCL::~b3PrefixScanCL() { delete m_workBuffer; @@ -34,20 +33,19 @@ b3PrefixScanCL::~b3PrefixScanCL() clReleaseKernel(m_propagationKernel); } -template<class T> +template <class T> T b3NextPowerOf2(T n) { n -= 1; - for(int i=0; i<sizeof(T)*8; i++) - n = n | (n>>i); - return n+1; + for (int i = 0; i < sizeof(T) * 8; i++) + n = n | (n >> i); + return n + 1; } void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum) { - -// b3Assert( data->m_option == EXCLUSIVE ); - const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + // b3Assert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2)); dst.resize(src.size()); m_workBuffer->resize(src.size()); @@ -55,55 +53,51 @@ void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<uns b3Int4 constBuffer; constBuffer.x = n; constBuffer.y = numBlocks; - constBuffer.z = (int)b3NextPowerOf2( numBlocks ); + constBuffer.z = (int)b3NextPowerOf2(numBlocks); b3OpenCLArray<unsigned int>* srcNative = &src; b3OpenCLArray<unsigned int>* dstNative 
= &dst; - + { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())}; - b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( constBuffer ); - launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(constBuffer); + launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE); } { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())}; - b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( constBuffer ); - launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(constBuffer); + launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE); } - - if( numBlocks > 1 ) + if (numBlocks > 1) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; - b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( constBuffer ); - launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())}; + b3LauncherCL 
launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(constBuffer); + launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE); } - - if( sum ) + if (sum) { clFinish(m_commandQueue); - dstNative->copyToHostPointer(sum,1,n-1,true); + dstNative->copyToHostPointer(sum, 1, n - 1, true); } - } - void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum) { unsigned int s = 0; //if( data->m_option == EXCLUSIVE ) { - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) { dst[i] = s; s += src[i]; @@ -119,8 +113,8 @@ void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3Alig } */ - if( sum ) + if (sum) { - *sum = dst[n-1]; + *sum = dst[n - 1]; } }
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h index a9a2e61b9e..346efa0c73 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h @@ -13,9 +13,9 @@ class b3PrefixScanCL BLOCK_SIZE = 128 }; -// Option m_option; + // Option m_option; - cl_command_queue m_commandQueue; + cl_command_queue m_commandQueue; cl_kernel m_localScanKernel; cl_kernel m_blockSumKernel; @@ -23,15 +23,13 @@ class b3PrefixScanCL b3OpenCLArray<unsigned int>* m_workBuffer; - - public: - - b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); +public: + b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0); virtual ~b3PrefixScanCL(); void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0); - void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0); + void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum = 0); }; -#endif //B3_PREFIX_SCAN_CL_H +#endif //B3_PREFIX_SCAN_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp index 80560d793d..1cac97c988 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp @@ -7,25 +7,24 @@ #include "kernels/PrefixScanKernelsFloat4CL.h" b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) -:m_commandQueue(queue) + : m_commandQueue(queue) { const char* scanKernelSource = 
prefixScanKernelsFloat4CL; cl_int pErrNum; - char* additionalMacros=0; + char* additionalMacros = 0; - m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size); - cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH); + m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx, queue, size); + cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH); b3Assert(scanProg); - m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); - b3Assert(m_localScanKernel ); - m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); - b3Assert(m_blockSumKernel ); - m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); - b3Assert(m_propagationKernel ); + m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros); + b3Assert(m_localScanKernel); + m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros); + b3Assert(m_blockSumKernel); + m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros); + b3Assert(m_propagationKernel); } - b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL() { delete m_workBuffer; @@ -34,20 +33,19 @@ b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL() clReleaseKernel(m_propagationKernel); } -template<class T> +template <class T> T b3NextPowerOf2(T n) { n -= 1; - for(int i=0; i<sizeof(T)*8; i++) - n = n | (n>>i); - return n+1; + 
for (int i = 0; i < sizeof(T) * 8; i++) + n = n | (n >> i); + return n + 1; } void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum) { - -// b3Assert( data->m_option == EXCLUSIVE ); - const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + // b3Assert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2)); dst.resize(src.size()); m_workBuffer->resize(src.size()); @@ -55,55 +53,51 @@ void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray< b3Int4 constBuffer; constBuffer.x = n; constBuffer.y = numBlocks; - constBuffer.z = (int)b3NextPowerOf2( numBlocks ); + constBuffer.z = (int)b3NextPowerOf2(numBlocks); b3OpenCLArray<b3Vector3>* srcNative = &src; b3OpenCLArray<b3Vector3>* dstNative = &dst; - + { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())}; - b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( constBuffer ); - launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(constBuffer); + launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE); } { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())}; - b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel"); - launcher.setBuffers( bInfo, 
sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( constBuffer ); - launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(constBuffer); + launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE); } - - if( numBlocks > 1 ) + if (numBlocks > 1) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; - b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( constBuffer ); - launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(constBuffer); + launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE); } - - if( sum ) + if (sum) { clFinish(m_commandQueue); - dstNative->copyToHostPointer(sum,1,n-1,true); + dstNative->copyToHostPointer(sum, 1, n - 1, true); } - } - void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum) { - b3Vector3 s=b3MakeVector3(0,0,0); + b3Vector3 s = b3MakeVector3(0, 0, 0); //if( data->m_option == EXCLUSIVE ) { - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) { dst[i] = s; s += src[i]; @@ -119,8 +113,8 @@ void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3A } */ - if( sum ) + if (sum) { - *sum = dst[n-1]; + *sum = dst[n - 1]; } }
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h index 2c8003c1bb..122b0bfd68 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h @@ -14,9 +14,9 @@ class b3PrefixScanFloat4CL BLOCK_SIZE = 128 }; -// Option m_option; + // Option m_option; - cl_command_queue m_commandQueue; + cl_command_queue m_commandQueue; cl_kernel m_localScanKernel; cl_kernel m_blockSumKernel; @@ -24,10 +24,8 @@ class b3PrefixScanFloat4CL b3OpenCLArray<b3Vector3>* m_workBuffer; - - public: - - b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); +public: + b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0); virtual ~b3PrefixScanFloat4CL(); @@ -35,4 +33,4 @@ class b3PrefixScanFloat4CL void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum); }; -#endif //B3_PREFIX_SCAN_CL_H +#endif //B3_PREFIX_SCAN_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp index f11ae4bcdb..e86af6583f 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp @@ -10,21 +10,20 @@ #include "kernels/RadixSort32KernelsCL.h" b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) -:m_commandQueue(queue) + : m_commandQueue(queue) { b3OpenCLDeviceInfo info; - b3OpenCLUtils::getDeviceInfo(device,&info); - m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; + b3OpenCLUtils::getDeviceInfo(device, &info); + m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 
0; - m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue); - m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue); - m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue); - m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue); - m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue); - m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue); + m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue); + m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue); + m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue); + m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue); + m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue); - - if (initialCapacity>0) + if (initialCapacity > 0) { m_workBuffer1->resize(initialCapacity); m_workBuffer3->resize(initialCapacity); @@ -33,45 +32,40 @@ b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command m_workBuffer4a->resize(initialCapacity); } - m_scan = new b3PrefixScanCL(ctx,device,queue); - m_fill = new b3FillCL(ctx,device,queue); - + m_scan = new b3PrefixScanCL(ctx, device, queue); + m_fill = new b3FillCL(ctx, device, queue); + const char* additionalMacros = ""; cl_int pErrNum; const char* kernelSource = radixSort32KernelsCL; - - cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); - b3Assert(sortProg); - m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros ); - b3Assert(m_streamCountSortDataKernel ); + cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH); + b3Assert(sortProg); + m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, 
sortProg, additionalMacros); + b3Assert(m_streamCountSortDataKernel); - - m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros ); + m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_streamCountKernel); - - if (m_deviceCPU) { - - m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterSortDataKernel); - m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterKernel); - } else + } + else { - m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterSortDataKernel); - m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterKernel); } - - m_prefixScanKernel = 
b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros ); + + m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_prefixScanKernel); - } b3RadixSort32CL::~b3RadixSort32CL() @@ -96,8 +90,7 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s { int n = inout.size(); const int BITS_PER_PASS = 8; - const int NUM_TABLES = (1<<BITS_PER_PASS); - + const int NUM_TABLES = (1 << BITS_PER_PASS); int tables[NUM_TABLES]; int counter[NUM_TABLES]; @@ -107,34 +100,33 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s workbuffer.resize(inout.size()); b3SortData* dst = &workbuffer[0]; - int count=0; - for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS) + int count = 0; + for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS) { - for(int i=0; i<NUM_TABLES; i++) + for (int i = 0; i < NUM_TABLES; i++) { tables[i] = 0; } - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) { - int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1); tables[tableIdx]++; } //#define TEST #ifdef TEST - printf("histogram size=%d\n",NUM_TABLES); - for (int i=0;i<NUM_TABLES;i++) + printf("histogram size=%d\n", NUM_TABLES); + for (int i = 0; i < NUM_TABLES; i++) { - if (tables[i]!=0) + if (tables[i] != 0) { - printf("tables[%d]=%d]\n",i,tables[i]); + printf("tables[%d]=%d]\n", i, tables[i]); } - } -#endif //TEST - // prefix scan +#endif //TEST \ + // prefix scan int sum = 0; - for(int i=0; i<NUM_TABLES; i++) + for (int i = 0; i < NUM_TABLES; i++) { int iData = tables[i]; tables[i] = sum; @@ -143,71 +135,65 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s } // distribute - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) { - int tableIdx = 
(src[i].m_key >> startBit) & (NUM_TABLES-1); - + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1); + dst[tables[tableIdx] + counter[tableIdx]] = src[i]; - counter[tableIdx] ++; + counter[tableIdx]++; } - b3Swap( src, dst ); + b3Swap(src, dst); count++; } - if (count&1) + if (count & 1) { - b3Assert(0);//need to copy - + b3Assert(0); //need to copy } } void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) { - b3AlignedObjectArray<b3SortData> inout; keyValuesInOut.copyToHost(inout); - executeHost(inout,sortBits); + executeHost(inout, sortBits); keyValuesInOut.copyFromHost(inout); } -void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, - b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits) +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits) { - } //#define DEBUG_RADIXSORT //#define DEBUG_RADIXSORT2 - void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) { - int originalSize = keyValuesInOut.size(); int workingSize = originalSize; - - + int dataAlignment = DATA_ALIGNMENT; #ifdef DEBUG_RADIXSORT2 - b3AlignedObjectArray<b3SortData> test2; - keyValuesInOut.copyToHost(test2); - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } -#endif //DEBUG_RADIXSORT2 - + b3AlignedObjectArray<b3SortData> test2; + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } +#endif //DEBUG_RADIXSORT2 + b3OpenCLArray<b3SortData>* 
src = 0; - if (workingSize%dataAlignment) + if (workingSize % dataAlignment) { - workingSize += dataAlignment-(workingSize%dataAlignment); + workingSize += dataAlignment - (workingSize % dataAlignment); m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); m_workBuffer4->resize(workingSize); b3SortData fillValue; @@ -216,327 +202,301 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor #define USE_BTFILL #ifdef USE_BTFILL - m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize); + m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize); #else //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) - - for (int i=originalSize; i<workingSize;i++) + + for (int i = originalSize; i < workingSize; i++) { - m_workBuffer4->copyFromHostPointer(&fillValue,1,i); + m_workBuffer4->copyFromHostPointer(&fillValue, 1, i); } -#endif//USE_BTFILL +#endif //USE_BTFILL src = m_workBuffer4; - } else + } + else { src = &keyValuesInOut; m_workBuffer4->resize(0); } - - b3Assert( workingSize%DATA_ALIGNMENT == 0 ); - int minCap = NUM_BUCKET*NUM_WGS; + b3Assert(workingSize % DATA_ALIGNMENT == 0); + int minCap = NUM_BUCKET * NUM_WGS; int n = workingSize; m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); - -// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); - b3Assert( BITS_PER_PASS == 4 ); - b3Assert( WG_SIZE == 64 ); - b3Assert( (sortBits&0x3) == 0 ); + // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert(BITS_PER_PASS == 4); + b3Assert(WG_SIZE == 64); + b3Assert((sortBits & 0x3) == 0); - - b3OpenCLArray<b3SortData>* dst = m_workBuffer3; b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; - int nWGs = NUM_WGS; b3ConstData cdata; { - int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 - int nBlocks = (n+blockSize-1)/(blockSize); + int blockSize = 
ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256 + int nBlocks = (n + blockSize - 1) / (blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; - cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; - if( nBlocks < NUM_WGS ) + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs; + if (nBlocks < NUM_WGS) { cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } - int count=0; - for(int ib=0; ib<sortBits; ib+=4) + int count = 0; + for (int ib = 0; ib < sortBits; ib += 4) { #ifdef DEBUG_RADIXSORT2 - keyValuesInOut.copyToHost(test2); - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - if (test2[i].m_key != test2[i].m_value) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } - } -#endif //DEBUG_RADIXSORT2 - + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + cdata.m_startBit = ib; - + if (src->size()) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel"); - - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - - int num = NUM_WGS*WG_SIZE; - launcher.launch1D( num, WG_SIZE ); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel"); + + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + + int num = NUM_WGS * WG_SIZE; + launcher.launch1D(num, WG_SIZE); } - - #ifdef DEBUG_RADIXSORT 
b3AlignedObjectArray<unsigned int> testHist; srcHisto->copyToHost(testHist); - printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); - for (int i=0;i<testHist.size();i++) + printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size()); + for (int i = 0; i < testHist.size(); i++) { - if (testHist[i]!=0) - printf("testHist[%d]=%d\n",i,testHist[i]); + if (testHist[i] != 0) + printf("testHist[%d]=%d\n", i, testHist[i]); } -#endif //DEBUG_RADIXSORT - - +#endif //DEBUG_RADIXSORT //fast prefix scan is not working properly on Mac OSX yet #ifdef __APPLE__ - bool fastScan=false; + bool fastScan = false; #else - bool fastScan=!m_deviceCPU;//only use fast scan on GPU + bool fastScan = !m_deviceCPU; //only use fast scan on GPU #endif if (fastScan) - {// prefix scan group histogram - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( 128, 128 ); + { // prefix scan group histogram + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(128, 128); destHisto = srcHisto; - }else + } + else { //unsigned int sum; //for debugging - m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum); } - #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); - printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); - for (int i=0;i<testHist.size();i++) + printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size()); + for (int i = 0; i < testHist.size(); i++) { - if (testHist[i]!=0) - 
printf("testHist[%d]=%d\n",i,testHist[i]); + if (testHist[i] != 0) + printf("testHist[%d]=%d\n", i, testHist[i]); } - - for (int i=0;i<testHist.size();i+=NUM_WGS) + + for (int i = 0; i < testHist.size(); i += NUM_WGS) { - printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]); + printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]); } -#endif //DEBUG_RADIXSORT +#endif //DEBUG_RADIXSORT #define USE_GPU #ifdef USE_GPU - + if (src->size()) - {// local sort and distribute - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; - b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); - + { // local sort and distribute + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(nWGs * WG_SIZE, WG_SIZE); } #else - { + { #define NUM_TABLES 16 //#define SEQUENTIAL #ifdef SEQUENTIAL - int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - int tables[NUM_TABLES]; - int startBit = ib; - - destHisto->copyToHost(testHist); - b3AlignedObjectArray<b3SortData> srcHost; - b3AlignedObjectArray<b3SortData> dstHost; - dstHost.resize(src->size()); - - src->copyToHost(srcHost); - - for (int i=0;i<NUM_TABLES;i++) - { - tables[i] = testHist[i*NUM_WGS]; - } - - // distribute - for(int i=0; i<n; i++) - { - int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); - - dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; - counter2[tableIdx] ++; - } - - + int 
counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int tables[NUM_TABLES]; + int startBit = ib; + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + src->copyToHost(srcHost); + + for (int i = 0; i < NUM_TABLES; i++) + { + tables[i] = testHist[i * NUM_WGS]; + } + + // distribute + for (int i = 0; i < n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1); + + dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx]++; + } + #else - - int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - int tables[NUM_TABLES]; - b3AlignedObjectArray<b3SortData> dstHostOK; - dstHostOK.resize(src->size()); - - destHisto->copyToHost(testHist); - b3AlignedObjectArray<b3SortData> srcHost; - src->copyToHost(srcHost); - - int blockSize = 256; - int nBlocksPerWG = cdata.m_nBlocksPerWG; - int startBit = ib; - - { - for (int i=0;i<NUM_TABLES;i++) - { - tables[i] = testHist[i*NUM_WGS]; - } - - // distribute - for(int i=0; i<n; i++) - { - int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); - - dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; - counter2[tableIdx] ++; - } - - - } - - - b3AlignedObjectArray<b3SortData> dstHost; - dstHost.resize(src->size()); - - - int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - - - for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) - { - int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; - - for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) - { - for (int lIdx = 0;lIdx < 64;lIdx++) - { - int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; - - // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD - // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops - // AMD: AtomInc performs better while NV prefers ++ - for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) - { - if( addr+j < n ) - { - // printf ("addr+j=%d\n", addr+j); - - int i = addr+j; - - int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); - - int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; - - b3SortData ok = dstHostOK[destIndex]; - - if (ok.m_key != srcHost[i].m_key) - { - printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); - printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); - } - if (ok.m_value != srcHost[i].m_value) - { - - printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); - printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); - - } - - dstHost[destIndex] = srcHost[i]; - counter[tableIdx] ++; - - } - } - } - } - } - - -#endif //SEQUENTIAL - - dst->copyFromHost(dstHost); - } -#endif//USE_GPU - - - + + int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + int tables[NUM_TABLES]; + b3AlignedObjectArray<b3SortData> dstHostOK; + dstHostOK.resize(src->size()); + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + src->copyToHost(srcHost); + + int blockSize = 256; + int nBlocksPerWG = cdata.m_nBlocksPerWG; + int startBit = ib; + + { + for (int i = 0; i < NUM_TABLES; i++) + { + tables[i] = testHist[i * NUM_WGS]; + } + + // distribute + for (int i = 0; i < n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1); + + dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx]++; + } + } + + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++) + { + int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0}; + + int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx; + + for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx = 0; lIdx < 64; lIdx++) + { + int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx; + + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++) + { + if (addr + j < n) + { + // printf ("addr+j=%d\n", addr+j); + + int i = addr + j; + + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1); + + int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx]; + + b3SortData ok = dstHostOK[destIndex]; + + if (ok.m_key != srcHost[i].m_key) + { + printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key); + printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value); + } + if (ok.m_value != srcHost[i].m_value) + { + printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value); + printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key); + } + + dstHost[destIndex] = srcHost[i]; + counter[tableIdx]++; + } + } + } + } + } + +#endif //SEQUENTIAL + + dst->copyFromHost(dstHost); + } +#endif //USE_GPU + #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); - printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); - for (int i=0;i<testHist.size();i++) + printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size()); + for (int i = 0; i < testHist.size(); i++) { - if (testHist[i]!=0) - printf("testHist[%d]=%d\n",i,testHist[i]); + if (testHist[i] != 0) + printf("testHist[%d]=%d\n", i, testHist[i]); } -#endif 
//DEBUG_RADIXSORT - b3Swap(src, dst ); - b3Swap(srcHisto,destHisto); +#endif //DEBUG_RADIXSORT + b3Swap(src, dst); + b3Swap(srcHisto, destHisto); #ifdef DEBUG_RADIXSORT2 - keyValuesInOut.copyToHost(test2); - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - if (test2[i].m_key != test2[i].m_value) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } - } -#endif //DEBUG_RADIXSORT2 - - count++; - - + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + count++; } - - - - if (count&1) + + if (count & 1) { - b3Assert(0);//need to copy from workbuffer to keyValuesInOut + b3Assert(0); //need to copy from workbuffer to keyValuesInOut } if (m_workBuffer4->size()) @@ -545,153 +505,137 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4); } - #ifdef DEBUG_RADIXSORT - keyValuesInOut.copyToHost(test2); - - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } -#endif - -} - - - - + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } +#endif +} void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */) { int originalSize = keysInOut.size(); int workingSize = originalSize; - - + int dataAlignment = DATA_ALIGNMENT; b3OpenCLArray<unsigned int>* src = 0; - if (workingSize%dataAlignment) + if (workingSize % 
dataAlignment) { - workingSize += dataAlignment-(workingSize%dataAlignment); + workingSize += dataAlignment - (workingSize % dataAlignment); m_workBuffer4a->copyFromOpenCLArray(keysInOut); m_workBuffer4a->resize(workingSize); unsigned int fillValue = 0xffffffff; - - m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize); + + m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize); src = m_workBuffer4a; - } else + } + else { src = &keysInOut; m_workBuffer4a->resize(0); } - - - - b3Assert( workingSize%DATA_ALIGNMENT == 0 ); - int minCap = NUM_BUCKET*NUM_WGS; + b3Assert(workingSize % DATA_ALIGNMENT == 0); + int minCap = NUM_BUCKET * NUM_WGS; int n = workingSize; - m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); m_workBuffer3a->resize(workingSize); -// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); - b3Assert( BITS_PER_PASS == 4 ); - b3Assert( WG_SIZE == 64 ); - b3Assert( (sortBits&0x3) == 0 ); + // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert(BITS_PER_PASS == 4); + b3Assert(WG_SIZE == 64); + b3Assert((sortBits & 0x3) == 0); - - b3OpenCLArray<unsigned int>* dst = m_workBuffer3a; b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; - int nWGs = NUM_WGS; b3ConstData cdata; { - int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 - int nBlocks = (n+blockSize-1)/(blockSize); + int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256 + int nBlocks = (n + blockSize - 1) / (blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; - cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; - if( nBlocks < NUM_WGS ) + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs; + if (nBlocks < NUM_WGS) { cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } - int count=0; - for(int ib=0; ib<sortBits; ib+=4) + int count = 0; + for (int ib = 0; ib < sortBits; ib += 4) { cdata.m_startBit = ib; - + if 
(src->size()) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel"); - - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - - int num = NUM_WGS*WG_SIZE; - launcher.launch1D( num, WG_SIZE ); - } + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel"); + + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); - + int num = NUM_WGS * WG_SIZE; + launcher.launch1D(num, WG_SIZE); + } //fast prefix scan is not working properly on Mac OSX yet #ifdef __APPLE__ - bool fastScan=false; + bool fastScan = false; #else - bool fastScan=!m_deviceCPU; + bool fastScan = !m_deviceCPU; #endif if (fastScan) - {// prefix scan group histogram - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( 128, 128 ); + { // prefix scan group histogram + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(128, 128); destHisto = srcHisto; - }else + } + else { //unsigned int sum; //for debugging - m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum); } if (src->size()) - {// local sort and distribute - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), 
b3BufferInfoCL( dst->getBufferCL() )}; - b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); - + { // local sort and distribute + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(nWGs * WG_SIZE, WG_SIZE); } - - b3Swap(src, dst ); - b3Swap(srcHisto,destHisto); - count++; + b3Swap(src, dst); + b3Swap(srcHisto, destHisto); + + count++; } - - if (count&1) + + if (count & 1) { - b3Assert(0);//need to copy from workbuffer to keyValuesInOut + b3Assert(0); //need to copy from workbuffer to keyValuesInOut } if (m_workBuffer4a->size()) @@ -699,12 +643,4 @@ void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBi m_workBuffer4a->resize(originalSize); keysInOut.copyFromOpenCLArray(*m_workBuffer4a); } - } - - - - - - - diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h index 975bd80e53..69caf182d7 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h @@ -6,90 +6,79 @@ struct b3SortData { - union - { + union { unsigned int m_key; unsigned int x; }; - union - { + union { unsigned int m_value; unsigned int y; - }; }; #include "b3BufferInfoCL.h" -class b3RadixSort32CL +class b3RadixSort32CL { + b3OpenCLArray<unsigned int>* m_workBuffer1; + b3OpenCLArray<unsigned int>* m_workBuffer2; - b3OpenCLArray<unsigned int>* m_workBuffer1; - b3OpenCLArray<unsigned int>* 
m_workBuffer2; - - b3OpenCLArray<b3SortData>* m_workBuffer3; - b3OpenCLArray<b3SortData>* m_workBuffer4; - - b3OpenCLArray<unsigned int>* m_workBuffer3a; - b3OpenCLArray<unsigned int>* m_workBuffer4a; + b3OpenCLArray<b3SortData>* m_workBuffer3; + b3OpenCLArray<b3SortData>* m_workBuffer4; - cl_command_queue m_commandQueue; + b3OpenCLArray<unsigned int>* m_workBuffer3a; + b3OpenCLArray<unsigned int>* m_workBuffer4a; - cl_kernel m_streamCountSortDataKernel; - cl_kernel m_streamCountKernel; + cl_command_queue m_commandQueue; - cl_kernel m_prefixScanKernel; - cl_kernel m_sortAndScatterSortDataKernel; - cl_kernel m_sortAndScatterKernel; + cl_kernel m_streamCountSortDataKernel; + cl_kernel m_streamCountKernel; + cl_kernel m_prefixScanKernel; + cl_kernel m_sortAndScatterSortDataKernel; + cl_kernel m_sortAndScatterKernel; - bool m_deviceCPU; + bool m_deviceCPU; - class b3PrefixScanCL* m_scan; - class b3FillCL* m_fill; + class b3PrefixScanCL* m_scan; + class b3FillCL* m_fill; public: struct b3ConstData - { - int m_n; - int m_nWGs; - int m_startBit; - int m_nBlocksPerWG; - }; + { + int m_n; + int m_nWGs; + int m_startBit; + int m_nBlocksPerWG; + }; enum - { - DATA_ALIGNMENT = 256, - WG_SIZE = 64, - BLOCK_SIZE = 256, - ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE), - BITS_PER_PASS = 4, - NUM_BUCKET=(1<<BITS_PER_PASS), - // if you change this, change nPerWI in kernel as well - NUM_WGS = 20*6, // cypress -// NUM_WGS = 24*6, // cayman -// NUM_WGS = 32*4, // nv - }; - + { + DATA_ALIGNMENT = 256, + WG_SIZE = 64, + BLOCK_SIZE = 256, + ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE), + BITS_PER_PASS = 4, + NUM_BUCKET = (1 << BITS_PER_PASS), + // if you change this, change nPerWI in kernel as well + NUM_WGS = 20 * 6, // cypress + // NUM_WGS = 24*6, // cayman + // NUM_WGS = 32*4, // nv + }; private: - - public: + b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0); - b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue 
queue, int initialCapacity =0); + virtual ~b3RadixSort32CL(); - virtual ~b3RadixSort32CL(); + void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32); - void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, - b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32); - - ///keys only - void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 ); - - void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 ); - void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32); - void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32); + ///keys only + void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32); + void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32); + void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32); + void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32); }; -#endif //B3_RADIXSORT32_H - +#endif //B3_RADIXSORT32_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h index 9c9e847138..1758dd41e3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h @@ -1,87 +1,86 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* boundSearchKernelsCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. 
\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"typedef unsigned int u32;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"typedef struct\n" -"{\n" -" u32 m_key; \n" -" u32 m_value;\n" -"}SortData;\n" -"typedef struct\n" -"{\n" -" u32 m_nSrc;\n" -" u32 m_nDst;\n" -" u32 m_padding[2];\n" -"} ConstBuffer;\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"__kernel\n" -"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n" -" unsigned int nSrc, unsigned int nDst)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < nSrc )\n" -" {\n" -" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n" -" SortData end; end.m_key = nDst; end.m_value = nDst;\n" -" SortData iData = (gIdx==0)? first: src[gIdx-1];\n" -" SortData jData = (gIdx==nSrc)? 
end: src[gIdx];\n" -" if( iData.m_key != jData.m_key )\n" -" {\n" -"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n" -" u32 k = jData.m_key;\n" -" {\n" -" dst[k] = gIdx;\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"__kernel\n" -"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n" -" unsigned int nSrc, unsigned int nDst)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX+1;\n" -" if( gIdx < nSrc+1 )\n" -" {\n" -" SortData first; first.m_key = 0; first.m_value = 0;\n" -" SortData end; end.m_key = nDst; end.m_value = nDst;\n" -" SortData iData = src[gIdx-1];\n" -" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" -" if( iData.m_key != jData.m_key )\n" -" {\n" -" u32 k = iData.m_key;\n" -" {\n" -" dst[k] = gIdx;\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"__kernel\n" -"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n" -" unsigned int nSrc, unsigned int nDst)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" \n" -" if( gIdx < nDst )\n" -" {\n" -" C[gIdx] = A[gIdx] - B[gIdx];\n" -" }\n" -"}\n" -; +static const char* boundSearchKernelsCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. 
Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "typedef unsigned int u32;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "typedef struct\n" + "{\n" + " u32 m_key; \n" + " u32 m_value;\n" + "}SortData;\n" + "typedef struct\n" + "{\n" + " u32 m_nSrc;\n" + " u32 m_nDst;\n" + " u32 m_padding[2];\n" + "} ConstBuffer;\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "__kernel\n" + "void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n" + " unsigned int nSrc, unsigned int nDst)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < nSrc )\n" + " {\n" + " SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n" + " SortData end; end.m_key = nDst; end.m_value = nDst;\n" + " SortData iData = (gIdx==0)? first: src[gIdx-1];\n" + " SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" + " if( iData.m_key != jData.m_key )\n" + " {\n" + "// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n" + " u32 k = jData.m_key;\n" + " {\n" + " dst[k] = gIdx;\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "__kernel\n" + "void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n" + " unsigned int nSrc, unsigned int nDst)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX+1;\n" + " if( gIdx < nSrc+1 )\n" + " {\n" + " SortData first; first.m_key = 0; first.m_value = 0;\n" + " SortData end; end.m_key = nDst; end.m_value = nDst;\n" + " SortData iData = src[gIdx-1];\n" + " SortData jData = (gIdx==nSrc)? 
end: src[gIdx];\n" + " if( iData.m_key != jData.m_key )\n" + " {\n" + " u32 k = iData.m_key;\n" + " {\n" + " dst[k] = gIdx;\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "__kernel\n" + "void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n" + " unsigned int nSrc, unsigned int nDst)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " \n" + " if( gIdx < nDst )\n" + " {\n" + " C[gIdx] = A[gIdx] - B[gIdx];\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h index e5670e3cd3..33c9279462 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h @@ -1,132 +1,131 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* copyKernelsCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. 
This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"\n" -"typedef unsigned int u32;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"\n" -"#define make_uint4 (uint4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"\n" -"typedef struct\n" -"{\n" -" int m_n;\n" -" int m_padding[3];\n" -"} ConstBuffer;\n" -"\n" -"\n" -"\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n" -" ConstBuffer cb)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -"\n" -" if( gIdx < cb.m_n )\n" -" {\n" -" float4 a0 = src[gIdx];\n" -"\n" -" dst[ gIdx ] = a0;\n" -" }\n" -"}\n" -"\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n" -" ConstBuffer cb)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -"\n" -" if( 2*gIdx <= cb.m_n )\n" -" {\n" -" float4 a0 = src[gIdx*2+0];\n" -" float4 a1 = src[gIdx*2+1];\n" -"\n" -" dst[ gIdx*2+0 ] = a0;\n" -" dst[ gIdx*2+1 ] = a1;\n" -" }\n" -"}\n" -"\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n" -" ConstBuffer cb)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -"\n" -" if( 4*gIdx <= cb.m_n )\n" -" {\n" -" int idx0 = gIdx*4+0;\n" -" int idx1 = gIdx*4+1;\n" -" int idx2 = gIdx*4+2;\n" -" int idx3 = gIdx*4+3;\n" -"\n" -" float4 a0 = src[idx0];\n" -" float4 a1 = 
src[idx1];\n" -" float4 a2 = src[idx2];\n" -" float4 a3 = src[idx3];\n" -"\n" -" dst[ idx0 ] = a0;\n" -" dst[ idx1 ] = a1;\n" -" dst[ idx2 ] = a2;\n" -" dst[ idx3 ] = a3;\n" -" }\n" -"}\n" -"\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n" -" ConstBuffer cb)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -"\n" -" if( gIdx < cb.m_n )\n" -" {\n" -" float a0 = srcF1[gIdx];\n" -"\n" -" dstF1[ gIdx ] = a0;\n" -" }\n" -"}\n" -"\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n" -" ConstBuffer cb)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -"\n" -" if( gIdx < cb.m_n )\n" -" {\n" -" float2 a0 = srcF2[gIdx];\n" -"\n" -" dstF2[ gIdx ] = a0;\n" -" }\n" -"}\n" -"\n" -"\n" -; +static const char* copyKernelsCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. 
This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "\n" + "typedef unsigned int u32;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "\n" + "#define make_uint4 (uint4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "\n" + "typedef struct\n" + "{\n" + " int m_n;\n" + " int m_padding[3];\n" + "} ConstBuffer;\n" + "\n" + "\n" + "\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void Copy1F4Kernel(__global float4* dst, __global float4* src, \n" + " ConstBuffer cb)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + "\n" + " if( gIdx < cb.m_n )\n" + " {\n" + " float4 a0 = src[gIdx];\n" + "\n" + " dst[ gIdx ] = a0;\n" + " }\n" + "}\n" + "\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void Copy2F4Kernel(__global float4* dst, __global float4* src, \n" + " ConstBuffer cb)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + "\n" + " if( 2*gIdx <= cb.m_n )\n" + " {\n" + " float4 a0 = src[gIdx*2+0];\n" + " float4 a1 = src[gIdx*2+1];\n" + "\n" + " dst[ gIdx*2+0 ] = a0;\n" + " dst[ gIdx*2+1 ] = a1;\n" + " }\n" + "}\n" + "\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void Copy4F4Kernel(__global float4* dst, __global float4* src, \n" + " ConstBuffer cb)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + "\n" + " if( 4*gIdx <= cb.m_n )\n" + " {\n" + " int idx0 = gIdx*4+0;\n" + " int idx1 = gIdx*4+1;\n" + " int idx2 = gIdx*4+2;\n" + " int 
idx3 = gIdx*4+3;\n" + "\n" + " float4 a0 = src[idx0];\n" + " float4 a1 = src[idx1];\n" + " float4 a2 = src[idx2];\n" + " float4 a3 = src[idx3];\n" + "\n" + " dst[ idx0 ] = a0;\n" + " dst[ idx1 ] = a1;\n" + " dst[ idx2 ] = a2;\n" + " dst[ idx3 ] = a3;\n" + " }\n" + "}\n" + "\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n" + " ConstBuffer cb)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + "\n" + " if( gIdx < cb.m_n )\n" + " {\n" + " float a0 = srcF1[gIdx];\n" + "\n" + " dstF1[ gIdx ] = a0;\n" + " }\n" + "}\n" + "\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n" + " ConstBuffer cb)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + "\n" + " if( gIdx < cb.m_n )\n" + " {\n" + " float2 a0 = srcF2[gIdx];\n" + "\n" + " dstF2[ gIdx ] = a0;\n" + " }\n" + "}\n" + "\n" + "\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h index 4f8b96e489..983e652270 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h @@ -1,91 +1,90 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* fillKernelsCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"typedef unsigned int u32;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define make_uint4 (uint4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"typedef struct\n" -"{\n" -" union\n" -" {\n" -" int4 m_data;\n" -" uint4 m_unsignedData;\n" -" float m_floatData;\n" -" };\n" -" int m_offset;\n" -" int m_n;\n" -" int m_padding[2];\n" -"} ConstBuffer;\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < num_elements )\n" -" {\n" -" dstInt[ offset+gIdx ] = value;\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < num_elements )\n" -" {\n" -" dstFloat[ offset+gIdx ] = value;\n" -" }\n" -"}\n" -"__kernel\n" 
-"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < num )\n" -" {\n" -" dstInt[ offset+gIdx ] = value;\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < num )\n" -" {\n" -" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(64,1,1)))\n" -"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < num )\n" -" {\n" -" dstInt4[ offset+gIdx ] = value;\n" -" }\n" -"}\n" -; +static const char* fillKernelsCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. 
This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "typedef unsigned int u32;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define make_uint4 (uint4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "typedef struct\n" + "{\n" + " union\n" + " {\n" + " int4 m_data;\n" + " uint4 m_unsignedData;\n" + " float m_floatData;\n" + " };\n" + " int m_offset;\n" + " int m_n;\n" + " int m_padding[2];\n" + "} ConstBuffer;\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < num_elements )\n" + " {\n" + " dstInt[ offset+gIdx ] = value;\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < num_elements )\n" + " {\n" + " dstFloat[ offset+gIdx ] = value;\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < num )\n" + " {\n" + " dstInt[ offset+gIdx ] = value;\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + 
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < num )\n" + " {\n" + " dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(64,1,1)))\n" + "void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < num )\n" + " {\n" + " dstInt4[ offset+gIdx ] = value;\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h index 27baab8331..fc5e7b865c 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h @@ -1,129 +1,128 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* prefixScanKernelsCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. 
This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"typedef unsigned int u32;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"// takahiro end\n" -"#define WG_SIZE 128 \n" -"#define m_numElems x\n" -"#define m_numBlocks y\n" -"#define m_numScanBlocks z\n" -"/*typedef struct\n" -"{\n" -" uint m_numElems;\n" -" uint m_numBlocks;\n" -" uint m_numScanBlocks;\n" -" uint m_padding[1];\n" -"} ConstBuffer;\n" -"*/\n" -"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n" -"{\n" -" u32 blocksum;\n" -" int offset = 1;\n" -" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" -" {\n" -" GROUP_LDS_BARRIER;\n" -" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" -" {\n" -" int ai = offset*(2*iIdx+1)-1;\n" -" int bi = offset*(2*iIdx+2)-1;\n" -" data[bi] += data[ai];\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" if( lIdx == 0 )\n" -" {\n" -" blocksum = data[ n-1 ];\n" -" data[ n-1 ] = 0;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" offset >>= 1;\n" -" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" -" {\n" -" GROUP_LDS_BARRIER;\n" -" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" -" {\n" -" int ai = offset*(2*iIdx+1)-1;\n" -" int bi = offset*(2*iIdx+2)-1;\n" -" u32 temp = data[ai];\n" -" data[ai] = data[bi];\n" -" data[bi] += temp;\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" return blocksum;\n" -"}\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"__kernel\n" -"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n" -" uint4 cb)\n" -"{\n" -" __local u32 ldsData[WG_SIZE*2];\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? 
src[2*gIdx]: 0;\n" -" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" -" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" -" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n" -" if( (2*gIdx) < cb.m_numElems )\n" -" {\n" -" dst[2*gIdx] = ldsData[2*lIdx];\n" -" }\n" -" if( (2*gIdx + 1) < cb.m_numElems )\n" -" {\n" -" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" -" }\n" -"}\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"__kernel\n" -"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n" -"{\n" -" const u32 blockSize = WG_SIZE*2;\n" -" int myIdx = GET_GROUP_IDX+1;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" u32 iBlockSum = blockSum[myIdx];\n" -" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" -" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" -" {\n" -" dst[i] += iBlockSum;\n" -" }\n" -"}\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"__kernel\n" -"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n" -"{\n" -" __local u32 ldsData[2048];\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" int lSize = GET_GROUP_SIZE;\n" -" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" -" {\n" -" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" -" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" -" {\n" -" dst[i] = ldsData[i];\n" -" }\n" -" if( gIdx == 0 )\n" -" {\n" -" dst[cb.m_numBlocks] = sum;\n" -" }\n" -"}\n" -; +static const char* prefixScanKernelsCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. 
\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "typedef unsigned int u32;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "// takahiro end\n" + "#define WG_SIZE 128 \n" + "#define m_numElems x\n" + "#define m_numBlocks y\n" + "#define m_numScanBlocks z\n" + "/*typedef struct\n" + "{\n" + " uint m_numElems;\n" + " uint m_numBlocks;\n" + " uint m_numScanBlocks;\n" + " uint m_padding[1];\n" + "} ConstBuffer;\n" + "*/\n" + "u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n" + "{\n" + " u32 blocksum;\n" + " int offset = 1;\n" + " for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" + " {\n" + " GROUP_LDS_BARRIER;\n" + " for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" + " {\n" + " int ai = offset*(2*iIdx+1)-1;\n" + " int bi = offset*(2*iIdx+2)-1;\n" + " data[bi] += data[ai];\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " if( lIdx == 0 )\n" + " {\n" + " blocksum = data[ n-1 ];\n" + " data[ n-1 ] 
= 0;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " offset >>= 1;\n" + " for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" + " {\n" + " GROUP_LDS_BARRIER;\n" + " for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" + " {\n" + " int ai = offset*(2*iIdx+1)-1;\n" + " int bi = offset*(2*iIdx+2)-1;\n" + " u32 temp = data[ai];\n" + " data[ai] = data[bi];\n" + " data[bi] += temp;\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " return blocksum;\n" + "}\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "__kernel\n" + "void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n" + " uint4 cb)\n" + "{\n" + " __local u32 ldsData[WG_SIZE*2];\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n" + " ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" + " u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" + " if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n" + " if( (2*gIdx) < cb.m_numElems )\n" + " {\n" + " dst[2*gIdx] = ldsData[2*lIdx];\n" + " }\n" + " if( (2*gIdx + 1) < cb.m_numElems )\n" + " {\n" + " dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" + " }\n" + "}\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "__kernel\n" + "void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n" + "{\n" + " const u32 blockSize = WG_SIZE*2;\n" + " int myIdx = GET_GROUP_IDX+1;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " u32 iBlockSum = blockSum[myIdx];\n" + " int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" + " for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" + " {\n" + " dst[i] += iBlockSum;\n" + " }\n" + "}\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "__kernel\n" + "void TopLevelScanKernel(__global u32* dst, uint4 cb)\n" + "{\n" + " __local u32 ldsData[2048];\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " int lSize 
= GET_GROUP_SIZE;\n" + " for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" + " {\n" + " ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" + " for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" + " {\n" + " dst[i] = ldsData[i];\n" + " }\n" + " if( gIdx == 0 )\n" + " {\n" + " dst[cb.m_numBlocks] = sum;\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h index 5b13254796..15d1bc5195 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h @@ -1,129 +1,128 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* prefixScanKernelsFloat4CL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. 
This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"typedef unsigned int u32;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"// takahiro end\n" -"#define WG_SIZE 128 \n" -"#define m_numElems x\n" -"#define m_numBlocks y\n" -"#define m_numScanBlocks z\n" -"/*typedef struct\n" -"{\n" -" uint m_numElems;\n" -" uint m_numBlocks;\n" -" uint m_numScanBlocks;\n" -" uint m_padding[1];\n" -"} ConstBuffer;\n" -"*/\n" -"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n" -"{\n" -" float4 blocksum;\n" -" int offset = 1;\n" -" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" -" {\n" -" GROUP_LDS_BARRIER;\n" -" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" -" {\n" -" int ai = offset*(2*iIdx+1)-1;\n" -" int bi = offset*(2*iIdx+2)-1;\n" -" data[bi] += data[ai];\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" if( lIdx == 0 )\n" -" {\n" -" blocksum = data[ n-1 ];\n" -" data[ n-1 ] = 0;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" offset >>= 1;\n" -" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" -" {\n" -" GROUP_LDS_BARRIER;\n" -" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" -" {\n" -" int ai = offset*(2*iIdx+1)-1;\n" -" int bi = offset*(2*iIdx+2)-1;\n" -" float4 temp = data[ai];\n" -" data[ai] = data[bi];\n" -" data[bi] += temp;\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" return blocksum;\n" -"}\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"__kernel\n" -"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n" -"{\n" -" __local float4 ldsData[WG_SIZE*2];\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? 
src[2*gIdx]: 0;\n" -" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" -" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" -" if( lIdx == 0 ) \n" -" sumBuffer[GET_GROUP_IDX] = sum;\n" -" if( (2*gIdx) < cb.m_numElems )\n" -" {\n" -" dst[2*gIdx] = ldsData[2*lIdx];\n" -" }\n" -" if( (2*gIdx + 1) < cb.m_numElems )\n" -" {\n" -" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" -" }\n" -"}\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"__kernel\n" -"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n" -"{\n" -" const u32 blockSize = WG_SIZE*2;\n" -" int myIdx = GET_GROUP_IDX+1;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" float4 iBlockSum = blockSum[myIdx];\n" -" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" -" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" -" {\n" -" dst[i] += iBlockSum;\n" -" }\n" -"}\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"__kernel\n" -"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n" -"{\n" -" __local float4 ldsData[2048];\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" int lSize = GET_GROUP_SIZE;\n" -" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" -" {\n" -" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" -" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" -" {\n" -" dst[i] = ldsData[i];\n" -" }\n" -" if( gIdx == 0 )\n" -" {\n" -" dst[cb.m_numBlocks] = sum;\n" -" }\n" -"}\n" -; +static const char* prefixScanKernelsFloat4CL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. 
\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "typedef unsigned int u32;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "// takahiro end\n" + "#define WG_SIZE 128 \n" + "#define m_numElems x\n" + "#define m_numBlocks y\n" + "#define m_numScanBlocks z\n" + "/*typedef struct\n" + "{\n" + " uint m_numElems;\n" + " uint m_numBlocks;\n" + " uint m_numScanBlocks;\n" + " uint m_padding[1];\n" + "} ConstBuffer;\n" + "*/\n" + "float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n" + "{\n" + " float4 blocksum;\n" + " int offset = 1;\n" + " for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" + " {\n" + " GROUP_LDS_BARRIER;\n" + " for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" + " {\n" + " int ai = offset*(2*iIdx+1)-1;\n" + " int bi = offset*(2*iIdx+2)-1;\n" + " data[bi] += data[ai];\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " if( lIdx == 0 )\n" + " {\n" + " blocksum = data[ n-1 ];\n" + 
" data[ n-1 ] = 0;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " offset >>= 1;\n" + " for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" + " {\n" + " GROUP_LDS_BARRIER;\n" + " for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" + " {\n" + " int ai = offset*(2*iIdx+1)-1;\n" + " int bi = offset*(2*iIdx+2)-1;\n" + " float4 temp = data[ai];\n" + " data[ai] = data[bi];\n" + " data[bi] += temp;\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " return blocksum;\n" + "}\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "__kernel\n" + "void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n" + "{\n" + " __local float4 ldsData[WG_SIZE*2];\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n" + " ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" + " float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" + " if( lIdx == 0 ) \n" + " sumBuffer[GET_GROUP_IDX] = sum;\n" + " if( (2*gIdx) < cb.m_numElems )\n" + " {\n" + " dst[2*gIdx] = ldsData[2*lIdx];\n" + " }\n" + " if( (2*gIdx + 1) < cb.m_numElems )\n" + " {\n" + " dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" + " }\n" + "}\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "__kernel\n" + "void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n" + "{\n" + " const u32 blockSize = WG_SIZE*2;\n" + " int myIdx = GET_GROUP_IDX+1;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " float4 iBlockSum = blockSum[myIdx];\n" + " int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" + " for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" + " {\n" + " dst[i] += iBlockSum;\n" + " }\n" + "}\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "__kernel\n" + "void TopLevelScanKernel(__global float4* dst, uint4 cb)\n" + "{\n" + " __local float4 ldsData[2048];\n" + " int gIdx = 
GET_GLOBAL_IDX;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " int lSize = GET_GROUP_SIZE;\n" + " for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" + " {\n" + " ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" + " for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" + " {\n" + " dst[i] = ldsData[i];\n" + " }\n" + " if( gIdx == 0 )\n" + " {\n" + " dst[cb.m_numBlocks] = sum;\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h index 8876c16aa6..fb4bdda303 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h @@ -1,910 +1,909 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* radixSort32KernelsCL= \ -"/*\n" -"Bullet Continuous Collision Detection and Physics Library\n" -"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. 
Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Author Takahiro Harada\n" -"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"typedef unsigned int u32;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_uint4 (uint4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"#define WG_SIZE 64\n" -"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n" -"#define BITS_PER_PASS 4\n" -"#define NUM_BUCKET (1<<BITS_PER_PASS)\n" -"typedef uchar u8;\n" -"// this isn't optimization for VLIW. But just reducing writes. 
\n" -"#define USE_2LEVEL_REDUCE 1\n" -"//#define CHECK_BOUNDARY 1\n" -"//#define NV_GPU 1\n" -"// Cypress\n" -"#define nPerWI 16\n" -"// Cayman\n" -"//#define nPerWI 20\n" -"#define m_n x\n" -"#define m_nWGs y\n" -"#define m_startBit z\n" -"#define m_nBlocksPerWG w\n" -"/*\n" -"typedef struct\n" -"{\n" -" int m_n;\n" -" int m_nWGs;\n" -" int m_startBit;\n" -" int m_nBlocksPerWG;\n" -"} ConstBuffer;\n" -"*/\n" -"typedef struct\n" -"{\n" -" unsigned int m_key;\n" -" unsigned int m_value;\n" -"} SortDataCL;\n" -"uint prefixScanVectorEx( uint4* data )\n" -"{\n" -" u32 sum = 0;\n" -" u32 tmp = data[0].x;\n" -" data[0].x = sum;\n" -" sum += tmp;\n" -" tmp = data[0].y;\n" -" data[0].y = sum;\n" -" sum += tmp;\n" -" tmp = data[0].z;\n" -" data[0].z = sum;\n" -" sum += tmp;\n" -" tmp = data[0].w;\n" -" data[0].w = sum;\n" -" sum += tmp;\n" -" return sum;\n" -"}\n" -"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n" -"{\n" -" { // Set data\n" -" sorterSharedMemory[lIdx] = 0;\n" -" sorterSharedMemory[lIdx+wgSize] = pData;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" { // Prefix sum\n" -" int idx = 2*lIdx + (wgSize+1);\n" -"#if defined(USE_2LEVEL_REDUCE)\n" -" if( lIdx < 64 )\n" -" {\n" -" u32 u0, u1, u2;\n" -" u0 = sorterSharedMemory[idx-3];\n" -" u1 = sorterSharedMemory[idx-2];\n" -" u2 = sorterSharedMemory[idx-1];\n" -" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" -" GROUP_MEM_FENCE;\n" -" u0 = sorterSharedMemory[idx-12];\n" -" u1 = sorterSharedMemory[idx-8];\n" -" u2 = sorterSharedMemory[idx-4];\n" -" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" -" GROUP_MEM_FENCE;\n" -" u0 = sorterSharedMemory[idx-48];\n" -" u1 = sorterSharedMemory[idx-32];\n" -" u2 = sorterSharedMemory[idx-16];\n" -" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" -" GROUP_MEM_FENCE;\n" -" if( wgSize > 64 )\n" -" {\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" -" GROUP_MEM_FENCE;\n" -" }\n" -" 
sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" -" GROUP_MEM_FENCE;\n" -" }\n" -"#else\n" -" if( lIdx < 64 )\n" -" {\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n" -" GROUP_MEM_FENCE;\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n" -" GROUP_MEM_FENCE;\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n" -" GROUP_MEM_FENCE;\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n" -" GROUP_MEM_FENCE;\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n" -" GROUP_MEM_FENCE;\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n" -" GROUP_MEM_FENCE;\n" -" if( wgSize > 64 )\n" -" {\n" -" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" -" GROUP_MEM_FENCE;\n" -" }\n" -" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" -" GROUP_MEM_FENCE;\n" -" }\n" -"#endif\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" *totalSum = sorterSharedMemory[wgSize*2-1];\n" -" u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n" -" return addValue;\n" -"}\n" -"//__attribute__((reqd_work_group_size(128,1,1)))\n" -"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" -"{\n" -" u32 s4 = prefixScanVectorEx( &pData );\n" -" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n" -" return pData + make_uint4( rank, rank, rank, rank );\n" -"}\n" -"//__attribute__((reqd_work_group_size(64,1,1)))\n" -"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" -"{\n" -" u32 s4 = prefixScanVectorEx( &pData );\n" -" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n" -" return pData + make_uint4( rank, rank, rank, rank );\n" -"}\n" -"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n" -"u32 bit8Scan(u32 v)\n" -"{\n" -" return (v<<8) + (v<<16) + (v<<24);\n" -"}\n" -"//===\n" -"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" -"__kernel\n" 
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" -"{\n" -" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" -" u32 gIdx = GET_GLOBAL_IDX;\n" -" u32 lIdx = GET_LOCAL_IDX;\n" -" u32 wgIdx = GET_GROUP_IDX;\n" -" u32 wgSize = GET_GROUP_SIZE;\n" -" const int startBit = cb.m_startBit;\n" -" const int n = cb.m_n;\n" -" const int nWGs = cb.m_nWGs;\n" -" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" -" for(int i=0; i<NUM_BUCKET; i++)\n" -" {\n" -" MY_HISTOGRAM(i) = 0;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" -" u32 localKey;\n" -" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" -" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" -" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" -" {\n" -" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" -" // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops\n" -" // AMD: AtomInc performs better while NV prefers ++\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -" {\n" -"#if defined(CHECK_BOUNDARY)\n" -" if( addr+i < n )\n" -"#endif\n" -" {\n" -" localKey = (gSrc[addr+i]>>startBit) & 0xf;\n" -"#if defined(NV_GPU)\n" -" MY_HISTOGRAM( localKey )++;\n" -"#else\n" -" AtomInc( MY_HISTOGRAM( localKey ) );\n" -"#endif\n" -" }\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" \n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" u32 sum = 0;\n" -" for(int i=0; i<GET_GROUP_SIZE; i++)\n" -" {\n" -" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" -" }\n" -" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )\n" -"{\n" -" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" -" u32 gIdx = GET_GLOBAL_IDX;\n" -" u32 lIdx = GET_LOCAL_IDX;\n" -" u32 wgIdx = GET_GROUP_IDX;\n" -" u32 wgSize = GET_GROUP_SIZE;\n" -" const int startBit = cb.m_startBit;\n" -" const int n = cb.m_n;\n" -" const int nWGs = cb.m_nWGs;\n" -" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" -" for(int i=0; i<NUM_BUCKET; i++)\n" -" {\n" -" MY_HISTOGRAM(i) = 0;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" -" u32 localKey;\n" -" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" -" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" -" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" -" {\n" -" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" -" // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops\n" -" // AMD: AtomInc performs better while NV prefers ++\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -" {\n" -"#if defined(CHECK_BOUNDARY)\n" -" if( addr+i < n )\n" -"#endif\n" -" {\n" -" localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n" -"#if defined(NV_GPU)\n" -" MY_HISTOGRAM( localKey )++;\n" -"#else\n" -" AtomInc( MY_HISTOGRAM( localKey ) );\n" -"#endif\n" -" }\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" \n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" u32 sum = 0;\n" -" for(int i=0; i<GET_GROUP_SIZE; i++)\n" -" {\n" -" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" -" }\n" -" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" -" }\n" -"}\n" -"#define nPerLane (nPerWI/4)\n" -"// NUM_BUCKET*nWGs < 128*nPerWI\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(128,1,1)))\n" -"void PrefixScanKernel( __global u32* wHistogram1, int4 cb )\n" -"{\n" -" __local u32 ldsTopScanData[128*2];\n" -" u32 lIdx = GET_LOCAL_IDX;\n" -" u32 wgIdx = GET_GROUP_IDX;\n" -" const int nWGs = cb.m_nWGs;\n" -" u32 data[nPerWI];\n" -" for(int i=0; i<nPerWI; i++)\n" -" {\n" -" data[i] = 0;\n" -" if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n" -" data[i] = wHistogram1[nPerWI*lIdx+i];\n" -" }\n" -" uint4 myData = make_uint4(0,0,0,0);\n" -" for(int i=0; i<nPerLane; i++)\n" -" {\n" -" myData.x += data[nPerLane*0+i];\n" -" myData.y += data[nPerLane*1+i];\n" -" myData.z += data[nPerLane*2+i];\n" -" myData.w += data[nPerLane*3+i];\n" -" }\n" -" uint totalSum;\n" -" uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n" -"// for(int j=0; j<4; j++) // somehow it introduces a lot of branches\n" -" { int j = 0;\n" -" u32 sum = 0;\n" -" for(int i=0; i<nPerLane; i++)\n" -" {\n" -" u32 tmp = data[nPerLane*j+i];\n" -" data[nPerLane*j+i] = sum;\n" -" sum += tmp;\n" -" }\n" -" }\n" -" { int j = 1;\n" -" u32 sum = 0;\n" -" for(int i=0; i<nPerLane; i++)\n" -" {\n" -" u32 tmp = data[nPerLane*j+i];\n" 
-" data[nPerLane*j+i] = sum;\n" -" sum += tmp;\n" -" }\n" -" }\n" -" { int j = 2;\n" -" u32 sum = 0;\n" -" for(int i=0; i<nPerLane; i++)\n" -" {\n" -" u32 tmp = data[nPerLane*j+i];\n" -" data[nPerLane*j+i] = sum;\n" -" sum += tmp;\n" -" }\n" -" }\n" -" { int j = 3;\n" -" u32 sum = 0;\n" -" for(int i=0; i<nPerLane; i++)\n" -" {\n" -" u32 tmp = data[nPerLane*j+i];\n" -" data[nPerLane*j+i] = sum;\n" -" sum += tmp;\n" -" }\n" -" }\n" -" for(int i=0; i<nPerLane; i++)\n" -" {\n" -" data[nPerLane*0+i] += scanned.x;\n" -" data[nPerLane*1+i] += scanned.y;\n" -" data[nPerLane*2+i] += scanned.z;\n" -" data[nPerLane*3+i] += scanned.w;\n" -" }\n" -" for(int i=0; i<nPerWI; i++)\n" -" {\n" -" int index = nPerWI*lIdx+i;\n" -" if (index < NUM_BUCKET*nWGs)\n" -" wHistogram1[nPerWI*lIdx+i] = data[i];\n" -" }\n" -"}\n" -"// 4 scan, 4 exchange\n" -"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" -"{\n" -" for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n" -" {\n" -" u32 mask = (1<<bitIdx);\n" -" uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" -" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" -" u32 total;\n" -" prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" -" {\n" -" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" -" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n" -" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" -" GROUP_LDS_BARRIER;\n" -" ldsSortData[dstAddr.x] = sortData[0];\n" -" ldsSortData[dstAddr.y] = sortData[1];\n" -" ldsSortData[dstAddr.z] = sortData[2];\n" -" ldsSortData[dstAddr.w] = sortData[3];\n" -" GROUP_LDS_BARRIER;\n" -" sortData[0] = ldsSortData[localAddr.x];\n" -" sortData[1] = ldsSortData[localAddr.y];\n" -" sortData[2] = 
ldsSortData[localAddr.z];\n" -" sortData[3] = ldsSortData[localAddr.w];\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -" }\n" -"}\n" -"// 2 scan, 2 exchange\n" -"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" -"{\n" -" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" -" {\n" -" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" -" (sortData[1]>>(startBit+ibit)) & 0x3, \n" -" (sortData[2]>>(startBit+ibit)) & 0x3, \n" -" (sortData[3]>>(startBit+ibit)) & 0x3);\n" -" u32 key4;\n" -" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" -" {\n" -" sKeyPacked[0] |= 1<<(8*b.x);\n" -" sKeyPacked[1] |= 1<<(8*b.y);\n" -" sKeyPacked[2] |= 1<<(8*b.z);\n" -" sKeyPacked[3] |= 1<<(8*b.w);\n" -" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" -" }\n" -" u32 rankPacked;\n" -" u32 sumPacked;\n" -" {\n" -" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" u32 newOffset[4] = { 0,0,0,0 };\n" -" {\n" -" u32 sumScanned = bit8Scan( sumPacked );\n" -" u32 scannedKeys[4];\n" -" scannedKeys[0] = 1<<(8*b.x);\n" -" scannedKeys[1] = 1<<(8*b.y);\n" -" scannedKeys[2] = 1<<(8*b.z);\n" -" scannedKeys[3] = 1<<(8*b.w);\n" -" { // 4 scans at once\n" -" u32 sum4 = 0;\n" -" for(int ie=0; ie<4; ie++)\n" -" {\n" -" u32 tmp = scannedKeys[ie];\n" -" scannedKeys[ie] = sum4;\n" -" sum4 += tmp;\n" -" }\n" -" }\n" -" {\n" -" u32 sumPlusRank = sumScanned + rankPacked;\n" -" { u32 ie = b.x;\n" -" scannedKeys[0] += sumPlusRank;\n" -" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" -" }\n" -" { u32 ie = b.y;\n" -" scannedKeys[1] += sumPlusRank;\n" -" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" -" }\n" -" { u32 ie = b.z;\n" -" scannedKeys[2] += sumPlusRank;\n" -" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" -" }\n" -" { u32 ie = b.w;\n" -" scannedKeys[3] += sumPlusRank;\n" -" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" -" }\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" {\n" 
-" ldsSortData[newOffset[0]] = sortData[0];\n" -" ldsSortData[newOffset[1]] = sortData[1];\n" -" ldsSortData[newOffset[2]] = sortData[2];\n" -" ldsSortData[newOffset[3]] = sortData[3];\n" -" GROUP_LDS_BARRIER;\n" -" u32 dstAddr = 4*lIdx;\n" -" sortData[0] = ldsSortData[dstAddr+0];\n" -" sortData[1] = ldsSortData[dstAddr+1];\n" -" sortData[2] = ldsSortData[dstAddr+2];\n" -" sortData[3] = ldsSortData[dstAddr+3];\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -" }\n" -"}\n" -"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" -"{\n" -" __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" -" __local u32 localHistogramToCarry[NUM_BUCKET];\n" -" __local u32 localHistogram[NUM_BUCKET*2];\n" -" u32 gIdx = GET_GLOBAL_IDX;\n" -" u32 lIdx = GET_LOCAL_IDX;\n" -" u32 wgIdx = GET_GROUP_IDX;\n" -" u32 wgSize = GET_GROUP_SIZE;\n" -" const int n = cb.m_n;\n" -" const int nWGs = cb.m_nWGs;\n" -" const int startBit = cb.m_startBit;\n" -" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" -" if( lIdx < (NUM_BUCKET) )\n" -" {\n" -" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" -" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" -" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" -" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" -" {\n" -" u32 myHistogram = 0;\n" -" u32 sortData[ELEMENTS_PER_WORK_ITEM];\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -"#if defined(CHECK_BOUNDARY)\n" -" sortData[i] = ( addr+i < n )? 
gSrc[ addr+i ] : 0xffffffff;\n" -"#else\n" -" sortData[i] = gSrc[ addr+i ];\n" -"#endif\n" -" sort4Bits(sortData, startBit, lIdx, ldsSortData);\n" -" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -" keys[i] = (sortData[i]>>startBit) & 0xf;\n" -" { // create histogram\n" -" u32 setIdx = lIdx/16;\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogram[lIdx] = 0;\n" -" }\n" -" ldsSortData[lIdx] = 0;\n" -" GROUP_LDS_BARRIER;\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -"#if defined(CHECK_BOUNDARY)\n" -" if( addr+i < n )\n" -"#endif\n" -"#if defined(NV_GPU)\n" -" SET_HISTOGRAM( setIdx, keys[i] )++;\n" -"#else\n" -" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" -"#endif\n" -" \n" -" GROUP_LDS_BARRIER;\n" -" \n" -" uint hIdx = NUM_BUCKET+lIdx;\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" u32 sum = 0;\n" -" for(int i=0; i<WG_SIZE/16; i++)\n" -" {\n" -" sum += SET_HISTOGRAM( i, lIdx );\n" -" }\n" -" myHistogram = sum;\n" -" localHistogram[hIdx] = sum;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -"#if defined(USE_2LEVEL_REDUCE)\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogram[hIdx] = localHistogram[hIdx-1];\n" -" GROUP_MEM_FENCE;\n" -" u32 u0, u1, u2;\n" -" u0 = localHistogram[hIdx-3];\n" -" u1 = localHistogram[hIdx-2];\n" -" u2 = localHistogram[hIdx-1];\n" -" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" -" GROUP_MEM_FENCE;\n" -" u0 = localHistogram[hIdx-12];\n" -" u1 = localHistogram[hIdx-8];\n" -" u2 = localHistogram[hIdx-4];\n" -" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" -" GROUP_MEM_FENCE;\n" -" }\n" -"#else\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogram[hIdx] = localHistogram[hIdx-1];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-1];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-2];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-4];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-8];\n" 
-" GROUP_MEM_FENCE;\n" -" }\n" -"#endif\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -" {\n" -" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" -" {\n" -" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" -" int binIdx = keys[ie];\n" -" int groupOffset = localHistogramToCarry[binIdx];\n" -" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" -"#if defined(CHECK_BOUNDARY)\n" -" if( addr+ie < n )\n" -"#endif\n" -" gDst[ groupOffset + myIdx ] = sortData[ie];\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogramToCarry[lIdx] += myHistogram;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -"}\n" -"// 2 scan, 2 exchange\n" -"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n" -"{\n" -" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" -" {\n" -" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" -" (sortData[1]>>(startBit+ibit)) & 0x3, \n" -" (sortData[2]>>(startBit+ibit)) & 0x3, \n" -" (sortData[3]>>(startBit+ibit)) & 0x3);\n" -" u32 key4;\n" -" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" -" {\n" -" sKeyPacked[0] |= 1<<(8*b.x);\n" -" sKeyPacked[1] |= 1<<(8*b.y);\n" -" sKeyPacked[2] |= 1<<(8*b.z);\n" -" sKeyPacked[3] |= 1<<(8*b.w);\n" -" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" -" }\n" -" u32 rankPacked;\n" -" u32 sumPacked;\n" -" {\n" -" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" u32 newOffset[4] = { 0,0,0,0 };\n" -" {\n" -" u32 sumScanned = bit8Scan( sumPacked );\n" -" u32 scannedKeys[4];\n" -" scannedKeys[0] = 1<<(8*b.x);\n" -" scannedKeys[1] = 1<<(8*b.y);\n" -" scannedKeys[2] = 1<<(8*b.z);\n" -" scannedKeys[3] = 1<<(8*b.w);\n" -" { // 4 scans at once\n" -" u32 sum4 = 0;\n" -" for(int ie=0; ie<4; ie++)\n" -" {\n" -" u32 tmp = scannedKeys[ie];\n" -" scannedKeys[ie] = sum4;\n" -" sum4 += tmp;\n" -" }\n" -" }\n" -" {\n" -" u32 sumPlusRank 
= sumScanned + rankPacked;\n" -" { u32 ie = b.x;\n" -" scannedKeys[0] += sumPlusRank;\n" -" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" -" }\n" -" { u32 ie = b.y;\n" -" scannedKeys[1] += sumPlusRank;\n" -" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" -" }\n" -" { u32 ie = b.z;\n" -" scannedKeys[2] += sumPlusRank;\n" -" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" -" }\n" -" { u32 ie = b.w;\n" -" scannedKeys[3] += sumPlusRank;\n" -" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" -" }\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" {\n" -" ldsSortData[newOffset[0]] = sortData[0];\n" -" ldsSortData[newOffset[1]] = sortData[1];\n" -" ldsSortData[newOffset[2]] = sortData[2];\n" -" ldsSortData[newOffset[3]] = sortData[3];\n" -" ldsSortVal[newOffset[0]] = sortVal[0];\n" -" ldsSortVal[newOffset[1]] = sortVal[1];\n" -" ldsSortVal[newOffset[2]] = sortVal[2];\n" -" ldsSortVal[newOffset[3]] = sortVal[3];\n" -" GROUP_LDS_BARRIER;\n" -" u32 dstAddr = 4*lIdx;\n" -" sortData[0] = ldsSortData[dstAddr+0];\n" -" sortData[1] = ldsSortData[dstAddr+1];\n" -" sortData[2] = ldsSortData[dstAddr+2];\n" -" sortData[3] = ldsSortData[dstAddr+3];\n" -" sortVal[0] = ldsSortVal[dstAddr+0];\n" -" sortVal[1] = ldsSortVal[dstAddr+1];\n" -" sortVal[2] = ldsSortVal[dstAddr+2];\n" -" sortVal[3] = ldsSortVal[dstAddr+3];\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" -"{\n" -" __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" -" __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" -" __local u32 localHistogramToCarry[NUM_BUCKET];\n" -" __local u32 localHistogram[NUM_BUCKET*2];\n" -" u32 gIdx = GET_GLOBAL_IDX;\n" -" u32 lIdx = GET_LOCAL_IDX;\n" -" u32 wgIdx = GET_GROUP_IDX;\n" -" u32 wgSize = GET_GROUP_SIZE;\n" -" const int n = 
cb.m_n;\n" -" const int nWGs = cb.m_nWGs;\n" -" const int startBit = cb.m_startBit;\n" -" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" -" if( lIdx < (NUM_BUCKET) )\n" -" {\n" -" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" \n" -" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" -" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" -" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" -" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" -" {\n" -" u32 myHistogram = 0;\n" -" int sortData[ELEMENTS_PER_WORK_ITEM];\n" -" int sortVal[ELEMENTS_PER_WORK_ITEM];\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -"#if defined(CHECK_BOUNDARY)\n" -" {\n" -" sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n" -" sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n" -" }\n" -"#else\n" -" {\n" -" sortData[i] = gSrc[ addr+i ].m_key;\n" -" sortVal[i] = gSrc[ addr+i ].m_value;\n" -" }\n" -"#endif\n" -" sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n" -" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -" keys[i] = (sortData[i]>>startBit) & 0xf;\n" -" { // create histogram\n" -" u32 setIdx = lIdx/16;\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogram[lIdx] = 0;\n" -" }\n" -" ldsSortData[lIdx] = 0;\n" -" GROUP_LDS_BARRIER;\n" -" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" -"#if defined(CHECK_BOUNDARY)\n" -" if( addr+i < n )\n" -"#endif\n" -"#if defined(NV_GPU)\n" -" SET_HISTOGRAM( setIdx, keys[i] )++;\n" -"#else\n" -" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" -"#endif\n" -" \n" -" GROUP_LDS_BARRIER;\n" -" \n" -" uint hIdx = NUM_BUCKET+lIdx;\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" u32 sum = 0;\n" -" for(int i=0; i<WG_SIZE/16; i++)\n" -" {\n" -" sum += SET_HISTOGRAM( i, lIdx );\n" -" }\n" -" myHistogram = sum;\n" -" localHistogram[hIdx] = 
sum;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -"#if defined(USE_2LEVEL_REDUCE)\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogram[hIdx] = localHistogram[hIdx-1];\n" -" GROUP_MEM_FENCE;\n" -" u32 u0, u1, u2;\n" -" u0 = localHistogram[hIdx-3];\n" -" u1 = localHistogram[hIdx-2];\n" -" u2 = localHistogram[hIdx-1];\n" -" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" -" GROUP_MEM_FENCE;\n" -" u0 = localHistogram[hIdx-12];\n" -" u1 = localHistogram[hIdx-8];\n" -" u2 = localHistogram[hIdx-4];\n" -" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" -" GROUP_MEM_FENCE;\n" -" }\n" -"#else\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogram[hIdx] = localHistogram[hIdx-1];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-1];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-2];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-4];\n" -" GROUP_MEM_FENCE;\n" -" localHistogram[hIdx] += localHistogram[hIdx-8];\n" -" GROUP_MEM_FENCE;\n" -" }\n" -"#endif\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -" {\n" -" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" -" {\n" -" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" -" int binIdx = keys[ie];\n" -" int groupOffset = localHistogramToCarry[binIdx];\n" -" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" -"#if defined(CHECK_BOUNDARY)\n" -" if( addr+ie < n )\n" -" {\n" -" if ((groupOffset + myIdx)<n)\n" -" {\n" -" if (sortData[ie]==sortVal[ie])\n" -" {\n" -" \n" -" SortDataCL tmp;\n" -" tmp.m_key = sortData[ie];\n" -" tmp.m_value = sortVal[ie];\n" -" if (tmp.m_key == tmp.m_value)\n" -" gDst[groupOffset + myIdx ] = tmp;\n" -" }\n" -" \n" -" }\n" -" }\n" -"#else\n" -" if ((groupOffset + myIdx)<n)\n" -" {\n" -" gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n" -" gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n" -" }\n" -"#endif\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" if( lIdx < NUM_BUCKET )\n" -" {\n" -" localHistogramToCarry[lIdx] += 
myHistogram;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" -"{\n" -" \n" -" u32 gIdx = GET_GLOBAL_IDX;\n" -" u32 realLocalIdx = GET_LOCAL_IDX;\n" -" u32 wgIdx = GET_GROUP_IDX;\n" -" u32 wgSize = GET_GROUP_SIZE;\n" -" const int startBit = cb.m_startBit;\n" -" const int n = cb.m_n;\n" -" const int nWGs = cb.m_nWGs;\n" -" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" -" int counter[NUM_BUCKET];\n" -" \n" -" if (realLocalIdx>0)\n" -" return;\n" -" \n" -" for (int c=0;c<NUM_BUCKET;c++)\n" -" counter[c]=0;\n" -" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" -" \n" -" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" -" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" -" {\n" -" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" -" {\n" -" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" -" \n" -" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" -" {\n" -" int i = addr2+j;\n" -" if( i < n )\n" -" {\n" -" int tableIdx;\n" -" tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" -" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" -" counter[tableIdx] ++;\n" -" }\n" -" }\n" -" }\n" -" }\n" -" \n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" -"{\n" -" \n" -" u32 gIdx = GET_GLOBAL_IDX;\n" -" u32 realLocalIdx = GET_LOCAL_IDX;\n" -" u32 wgIdx = GET_GROUP_IDX;\n" -" u32 wgSize = GET_GROUP_SIZE;\n" -" const int startBit = cb.m_startBit;\n" -" const int n = cb.m_n;\n" -" const int nWGs = cb.m_nWGs;\n" -" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" -" int 
counter[NUM_BUCKET];\n" -" \n" -" if (realLocalIdx>0)\n" -" return;\n" -" \n" -" for (int c=0;c<NUM_BUCKET;c++)\n" -" counter[c]=0;\n" -" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" -" \n" -" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" -" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" -" {\n" -" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" -" {\n" -" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" -" \n" -" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" -" {\n" -" int i = addr2+j;\n" -" if( i < n )\n" -" {\n" -" int tableIdx;\n" -" tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" -" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" -" counter[tableIdx] ++;\n" -" }\n" -" }\n" -" }\n" -" }\n" -" \n" -"}\n" -; +static const char* radixSort32KernelsCL = + "/*\n" + "Bullet Continuous Collision Detection and Physics Library\n" + "Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. 
This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Author Takahiro Harada\n" + "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "typedef unsigned int u32;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_uint4 (uint4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "#define WG_SIZE 64\n" + "#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n" + "#define BITS_PER_PASS 4\n" + "#define NUM_BUCKET (1<<BITS_PER_PASS)\n" + "typedef uchar u8;\n" + "// this isn't optimization for VLIW. But just reducing writes. 
\n" + "#define USE_2LEVEL_REDUCE 1\n" + "//#define CHECK_BOUNDARY 1\n" + "//#define NV_GPU 1\n" + "// Cypress\n" + "#define nPerWI 16\n" + "// Cayman\n" + "//#define nPerWI 20\n" + "#define m_n x\n" + "#define m_nWGs y\n" + "#define m_startBit z\n" + "#define m_nBlocksPerWG w\n" + "/*\n" + "typedef struct\n" + "{\n" + " int m_n;\n" + " int m_nWGs;\n" + " int m_startBit;\n" + " int m_nBlocksPerWG;\n" + "} ConstBuffer;\n" + "*/\n" + "typedef struct\n" + "{\n" + " unsigned int m_key;\n" + " unsigned int m_value;\n" + "} SortDataCL;\n" + "uint prefixScanVectorEx( uint4* data )\n" + "{\n" + " u32 sum = 0;\n" + " u32 tmp = data[0].x;\n" + " data[0].x = sum;\n" + " sum += tmp;\n" + " tmp = data[0].y;\n" + " data[0].y = sum;\n" + " sum += tmp;\n" + " tmp = data[0].z;\n" + " data[0].z = sum;\n" + " sum += tmp;\n" + " tmp = data[0].w;\n" + " data[0].w = sum;\n" + " sum += tmp;\n" + " return sum;\n" + "}\n" + "u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n" + "{\n" + " { // Set data\n" + " sorterSharedMemory[lIdx] = 0;\n" + " sorterSharedMemory[lIdx+wgSize] = pData;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " { // Prefix sum\n" + " int idx = 2*lIdx + (wgSize+1);\n" + "#if defined(USE_2LEVEL_REDUCE)\n" + " if( lIdx < 64 )\n" + " {\n" + " u32 u0, u1, u2;\n" + " u0 = sorterSharedMemory[idx-3];\n" + " u1 = sorterSharedMemory[idx-2];\n" + " u2 = sorterSharedMemory[idx-1];\n" + " AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" + " GROUP_MEM_FENCE;\n" + " u0 = sorterSharedMemory[idx-12];\n" + " u1 = sorterSharedMemory[idx-8];\n" + " u2 = sorterSharedMemory[idx-4];\n" + " AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" + " GROUP_MEM_FENCE;\n" + " u0 = sorterSharedMemory[idx-48];\n" + " u1 = sorterSharedMemory[idx-32];\n" + " u2 = sorterSharedMemory[idx-16];\n" + " AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" + " GROUP_MEM_FENCE;\n" + " if( wgSize > 64 )\n" + " {\n" + " sorterSharedMemory[idx] += 
sorterSharedMemory[idx-64];\n" + " GROUP_MEM_FENCE;\n" + " }\n" + " sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" + " GROUP_MEM_FENCE;\n" + " }\n" + "#else\n" + " if( lIdx < 64 )\n" + " {\n" + " sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n" + " GROUP_MEM_FENCE;\n" + " sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n" + " GROUP_MEM_FENCE;\n" + " sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n" + " GROUP_MEM_FENCE;\n" + " sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n" + " GROUP_MEM_FENCE;\n" + " sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n" + " GROUP_MEM_FENCE;\n" + " sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n" + " GROUP_MEM_FENCE;\n" + " if( wgSize > 64 )\n" + " {\n" + " sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" + " GROUP_MEM_FENCE;\n" + " }\n" + " sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" + " GROUP_MEM_FENCE;\n" + " }\n" + "#endif\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " *totalSum = sorterSharedMemory[wgSize*2-1];\n" + " u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n" + " return addValue;\n" + "}\n" + "//__attribute__((reqd_work_group_size(128,1,1)))\n" + "uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" + "{\n" + " u32 s4 = prefixScanVectorEx( &pData );\n" + " u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n" + " return pData + make_uint4( rank, rank, rank, rank );\n" + "}\n" + "//__attribute__((reqd_work_group_size(64,1,1)))\n" + "uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" + "{\n" + " u32 s4 = prefixScanVectorEx( &pData );\n" + " u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n" + " return pData + make_uint4( rank, rank, rank, rank );\n" + "}\n" + "u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n" + "u32 bit8Scan(u32 v)\n" + "{\n" + " return (v<<8) + (v<<16) + 
(v<<24);\n" + "}\n" + "//===\n" + "#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" + "{\n" + " __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" + " u32 gIdx = GET_GLOBAL_IDX;\n" + " u32 lIdx = GET_LOCAL_IDX;\n" + " u32 wgIdx = GET_GROUP_IDX;\n" + " u32 wgSize = GET_GROUP_SIZE;\n" + " const int startBit = cb.m_startBit;\n" + " const int n = cb.m_n;\n" + " const int nWGs = cb.m_nWGs;\n" + " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" + " for(int i=0; i<NUM_BUCKET; i++)\n" + " {\n" + " MY_HISTOGRAM(i) = 0;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" + " u32 localKey;\n" + " int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" + " int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" + " for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" + " {\n" + " // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" + " // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops\n" + " // AMD: AtomInc performs better while NV prefers ++\n" + " for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" + " {\n" + "#if defined(CHECK_BOUNDARY)\n" + " if( addr+i < n )\n" + "#endif\n" + " {\n" + " localKey = (gSrc[addr+i]>>startBit) & 0xf;\n" + "#if defined(NV_GPU)\n" + " MY_HISTOGRAM( localKey )++;\n" + "#else\n" + " AtomInc( MY_HISTOGRAM( localKey ) );\n" + "#endif\n" + " }\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " \n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " u32 sum = 0;\n" + " for(int i=0; i<GET_GROUP_SIZE; i++)\n" + " {\n" + " sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" + " }\n" + " histogramOut[lIdx*nWGs+wgIdx] = sum;\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )\n" + "{\n" + " __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" + " u32 gIdx = GET_GLOBAL_IDX;\n" + " u32 lIdx = GET_LOCAL_IDX;\n" + " u32 wgIdx = GET_GROUP_IDX;\n" + " u32 wgSize = GET_GROUP_SIZE;\n" + " const int startBit = cb.m_startBit;\n" + " const int n = cb.m_n;\n" + " const int nWGs = cb.m_nWGs;\n" + " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" + " for(int i=0; i<NUM_BUCKET; i++)\n" + " {\n" + " MY_HISTOGRAM(i) = 0;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" + " u32 localKey;\n" + " int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" + " int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" + " for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" + " {\n" + " // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" + " // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops\n" + " // AMD: AtomInc performs better while NV prefers ++\n" + " for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" + " {\n" + "#if defined(CHECK_BOUNDARY)\n" + " if( addr+i < n )\n" + "#endif\n" + " {\n" + " localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n" + "#if defined(NV_GPU)\n" + " MY_HISTOGRAM( localKey )++;\n" + "#else\n" + " AtomInc( MY_HISTOGRAM( localKey ) );\n" + "#endif\n" + " }\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " \n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " u32 sum = 0;\n" + " for(int i=0; i<GET_GROUP_SIZE; i++)\n" + " {\n" + " sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" + " }\n" + " histogramOut[lIdx*nWGs+wgIdx] = sum;\n" + " }\n" + "}\n" + "#define nPerLane (nPerWI/4)\n" + "// NUM_BUCKET*nWGs < 128*nPerWI\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(128,1,1)))\n" + "void PrefixScanKernel( __global u32* wHistogram1, int4 cb )\n" + "{\n" + " __local u32 ldsTopScanData[128*2];\n" + " u32 lIdx = GET_LOCAL_IDX;\n" + " u32 wgIdx = GET_GROUP_IDX;\n" + " const int nWGs = cb.m_nWGs;\n" + " u32 data[nPerWI];\n" + " for(int i=0; i<nPerWI; i++)\n" + " {\n" + " data[i] = 0;\n" + " if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n" + " data[i] = wHistogram1[nPerWI*lIdx+i];\n" + " }\n" + " uint4 myData = make_uint4(0,0,0,0);\n" + " for(int i=0; i<nPerLane; i++)\n" + " {\n" + " myData.x += data[nPerLane*0+i];\n" + " myData.y += data[nPerLane*1+i];\n" + " myData.z += data[nPerLane*2+i];\n" + " myData.w += data[nPerLane*3+i];\n" + " }\n" + " uint totalSum;\n" + " uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n" + "// for(int j=0; j<4; j++) // somehow it introduces a lot of branches\n" + " { int j = 0;\n" + " u32 sum = 0;\n" + " for(int i=0; i<nPerLane; i++)\n" + " {\n" + " u32 tmp = data[nPerLane*j+i];\n" + " data[nPerLane*j+i] = sum;\n" + " sum += tmp;\n" + " }\n" + " }\n" + " { int j = 1;\n" + " u32 sum = 0;\n" + " for(int 
i=0; i<nPerLane; i++)\n" + " {\n" + " u32 tmp = data[nPerLane*j+i];\n" + " data[nPerLane*j+i] = sum;\n" + " sum += tmp;\n" + " }\n" + " }\n" + " { int j = 2;\n" + " u32 sum = 0;\n" + " for(int i=0; i<nPerLane; i++)\n" + " {\n" + " u32 tmp = data[nPerLane*j+i];\n" + " data[nPerLane*j+i] = sum;\n" + " sum += tmp;\n" + " }\n" + " }\n" + " { int j = 3;\n" + " u32 sum = 0;\n" + " for(int i=0; i<nPerLane; i++)\n" + " {\n" + " u32 tmp = data[nPerLane*j+i];\n" + " data[nPerLane*j+i] = sum;\n" + " sum += tmp;\n" + " }\n" + " }\n" + " for(int i=0; i<nPerLane; i++)\n" + " {\n" + " data[nPerLane*0+i] += scanned.x;\n" + " data[nPerLane*1+i] += scanned.y;\n" + " data[nPerLane*2+i] += scanned.z;\n" + " data[nPerLane*3+i] += scanned.w;\n" + " }\n" + " for(int i=0; i<nPerWI; i++)\n" + " {\n" + " int index = nPerWI*lIdx+i;\n" + " if (index < NUM_BUCKET*nWGs)\n" + " wHistogram1[nPerWI*lIdx+i] = data[i];\n" + " }\n" + "}\n" + "// 4 scan, 4 exchange\n" + "void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" + "{\n" + " for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n" + " {\n" + " u32 mask = (1<<bitIdx);\n" + " uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" + " uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" + " u32 total;\n" + " prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" + " {\n" + " uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" + " uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n" + " dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" + " GROUP_LDS_BARRIER;\n" + " ldsSortData[dstAddr.x] = sortData[0];\n" + " ldsSortData[dstAddr.y] = sortData[1];\n" + " ldsSortData[dstAddr.z] = sortData[2];\n" + " ldsSortData[dstAddr.w] = sortData[3];\n" + " 
GROUP_LDS_BARRIER;\n" + " sortData[0] = ldsSortData[localAddr.x];\n" + " sortData[1] = ldsSortData[localAddr.y];\n" + " sortData[2] = ldsSortData[localAddr.z];\n" + " sortData[3] = ldsSortData[localAddr.w];\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + " }\n" + "}\n" + "// 2 scan, 2 exchange\n" + "void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" + "{\n" + " for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" + " {\n" + " uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" + " (sortData[1]>>(startBit+ibit)) & 0x3, \n" + " (sortData[2]>>(startBit+ibit)) & 0x3, \n" + " (sortData[3]>>(startBit+ibit)) & 0x3);\n" + " u32 key4;\n" + " u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" + " {\n" + " sKeyPacked[0] |= 1<<(8*b.x);\n" + " sKeyPacked[1] |= 1<<(8*b.y);\n" + " sKeyPacked[2] |= 1<<(8*b.z);\n" + " sKeyPacked[3] |= 1<<(8*b.w);\n" + " key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" + " }\n" + " u32 rankPacked;\n" + " u32 sumPacked;\n" + " {\n" + " rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " u32 newOffset[4] = { 0,0,0,0 };\n" + " {\n" + " u32 sumScanned = bit8Scan( sumPacked );\n" + " u32 scannedKeys[4];\n" + " scannedKeys[0] = 1<<(8*b.x);\n" + " scannedKeys[1] = 1<<(8*b.y);\n" + " scannedKeys[2] = 1<<(8*b.z);\n" + " scannedKeys[3] = 1<<(8*b.w);\n" + " { // 4 scans at once\n" + " u32 sum4 = 0;\n" + " for(int ie=0; ie<4; ie++)\n" + " {\n" + " u32 tmp = scannedKeys[ie];\n" + " scannedKeys[ie] = sum4;\n" + " sum4 += tmp;\n" + " }\n" + " }\n" + " {\n" + " u32 sumPlusRank = sumScanned + rankPacked;\n" + " { u32 ie = b.x;\n" + " scannedKeys[0] += sumPlusRank;\n" + " newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" + " }\n" + " { u32 ie = b.y;\n" + " scannedKeys[1] += sumPlusRank;\n" + " newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" + " }\n" + " { u32 ie = b.z;\n" + " scannedKeys[2] += sumPlusRank;\n" + " newOffset[2] = unpack4Key( 
scannedKeys[2], ie );\n" + " }\n" + " { u32 ie = b.w;\n" + " scannedKeys[3] += sumPlusRank;\n" + " newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" + " }\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " {\n" + " ldsSortData[newOffset[0]] = sortData[0];\n" + " ldsSortData[newOffset[1]] = sortData[1];\n" + " ldsSortData[newOffset[2]] = sortData[2];\n" + " ldsSortData[newOffset[3]] = sortData[3];\n" + " GROUP_LDS_BARRIER;\n" + " u32 dstAddr = 4*lIdx;\n" + " sortData[0] = ldsSortData[dstAddr+0];\n" + " sortData[1] = ldsSortData[dstAddr+1];\n" + " sortData[2] = ldsSortData[dstAddr+2];\n" + " sortData[3] = ldsSortData[dstAddr+3];\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + " }\n" + "}\n" + "#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" + "{\n" + " __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" + " __local u32 localHistogramToCarry[NUM_BUCKET];\n" + " __local u32 localHistogram[NUM_BUCKET*2];\n" + " u32 gIdx = GET_GLOBAL_IDX;\n" + " u32 lIdx = GET_LOCAL_IDX;\n" + " u32 wgIdx = GET_GROUP_IDX;\n" + " u32 wgSize = GET_GROUP_SIZE;\n" + " const int n = cb.m_n;\n" + " const int nWGs = cb.m_nWGs;\n" + " const int startBit = cb.m_startBit;\n" + " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" + " if( lIdx < (NUM_BUCKET) )\n" + " {\n" + " localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" + " int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" + " int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" + " for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" + " {\n" + " u32 myHistogram = 0;\n" + " u32 sortData[ELEMENTS_PER_WORK_ITEM];\n" + " for(int i=0; 
i<ELEMENTS_PER_WORK_ITEM; i++)\n" + "#if defined(CHECK_BOUNDARY)\n" + " sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n" + "#else\n" + " sortData[i] = gSrc[ addr+i ];\n" + "#endif\n" + " sort4Bits(sortData, startBit, lIdx, ldsSortData);\n" + " u32 keys[ELEMENTS_PER_WORK_ITEM];\n" + " for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" + " keys[i] = (sortData[i]>>startBit) & 0xf;\n" + " { // create histogram\n" + " u32 setIdx = lIdx/16;\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogram[lIdx] = 0;\n" + " }\n" + " ldsSortData[lIdx] = 0;\n" + " GROUP_LDS_BARRIER;\n" + " for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" + "#if defined(CHECK_BOUNDARY)\n" + " if( addr+i < n )\n" + "#endif\n" + "#if defined(NV_GPU)\n" + " SET_HISTOGRAM( setIdx, keys[i] )++;\n" + "#else\n" + " AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" + "#endif\n" + " \n" + " GROUP_LDS_BARRIER;\n" + " \n" + " uint hIdx = NUM_BUCKET+lIdx;\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " u32 sum = 0;\n" + " for(int i=0; i<WG_SIZE/16; i++)\n" + " {\n" + " sum += SET_HISTOGRAM( i, lIdx );\n" + " }\n" + " myHistogram = sum;\n" + " localHistogram[hIdx] = sum;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + "#if defined(USE_2LEVEL_REDUCE)\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogram[hIdx] = localHistogram[hIdx-1];\n" + " GROUP_MEM_FENCE;\n" + " u32 u0, u1, u2;\n" + " u0 = localHistogram[hIdx-3];\n" + " u1 = localHistogram[hIdx-2];\n" + " u2 = localHistogram[hIdx-1];\n" + " AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" + " GROUP_MEM_FENCE;\n" + " u0 = localHistogram[hIdx-12];\n" + " u1 = localHistogram[hIdx-8];\n" + " u2 = localHistogram[hIdx-4];\n" + " AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" + " GROUP_MEM_FENCE;\n" + " }\n" + "#else\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogram[hIdx] = localHistogram[hIdx-1];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += localHistogram[hIdx-1];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += 
localHistogram[hIdx-2];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += localHistogram[hIdx-4];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += localHistogram[hIdx-8];\n" + " GROUP_MEM_FENCE;\n" + " }\n" + "#endif\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + " {\n" + " for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" + " {\n" + " int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" + " int binIdx = keys[ie];\n" + " int groupOffset = localHistogramToCarry[binIdx];\n" + " int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" + "#if defined(CHECK_BOUNDARY)\n" + " if( addr+ie < n )\n" + "#endif\n" + " gDst[ groupOffset + myIdx ] = sortData[ie];\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogramToCarry[lIdx] += myHistogram;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + "}\n" + "// 2 scan, 2 exchange\n" + "void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n" + "{\n" + " for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" + " {\n" + " uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" + " (sortData[1]>>(startBit+ibit)) & 0x3, \n" + " (sortData[2]>>(startBit+ibit)) & 0x3, \n" + " (sortData[3]>>(startBit+ibit)) & 0x3);\n" + " u32 key4;\n" + " u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" + " {\n" + " sKeyPacked[0] |= 1<<(8*b.x);\n" + " sKeyPacked[1] |= 1<<(8*b.y);\n" + " sKeyPacked[2] |= 1<<(8*b.z);\n" + " sKeyPacked[3] |= 1<<(8*b.w);\n" + " key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" + " }\n" + " u32 rankPacked;\n" + " u32 sumPacked;\n" + " {\n" + " rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " u32 newOffset[4] = { 0,0,0,0 };\n" + " {\n" + " u32 sumScanned = bit8Scan( sumPacked );\n" + " u32 scannedKeys[4];\n" + " scannedKeys[0] = 1<<(8*b.x);\n" + " scannedKeys[1] = 1<<(8*b.y);\n" + " scannedKeys[2] = 1<<(8*b.z);\n" + " 
scannedKeys[3] = 1<<(8*b.w);\n" + " { // 4 scans at once\n" + " u32 sum4 = 0;\n" + " for(int ie=0; ie<4; ie++)\n" + " {\n" + " u32 tmp = scannedKeys[ie];\n" + " scannedKeys[ie] = sum4;\n" + " sum4 += tmp;\n" + " }\n" + " }\n" + " {\n" + " u32 sumPlusRank = sumScanned + rankPacked;\n" + " { u32 ie = b.x;\n" + " scannedKeys[0] += sumPlusRank;\n" + " newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" + " }\n" + " { u32 ie = b.y;\n" + " scannedKeys[1] += sumPlusRank;\n" + " newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" + " }\n" + " { u32 ie = b.z;\n" + " scannedKeys[2] += sumPlusRank;\n" + " newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" + " }\n" + " { u32 ie = b.w;\n" + " scannedKeys[3] += sumPlusRank;\n" + " newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" + " }\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " {\n" + " ldsSortData[newOffset[0]] = sortData[0];\n" + " ldsSortData[newOffset[1]] = sortData[1];\n" + " ldsSortData[newOffset[2]] = sortData[2];\n" + " ldsSortData[newOffset[3]] = sortData[3];\n" + " ldsSortVal[newOffset[0]] = sortVal[0];\n" + " ldsSortVal[newOffset[1]] = sortVal[1];\n" + " ldsSortVal[newOffset[2]] = sortVal[2];\n" + " ldsSortVal[newOffset[3]] = sortVal[3];\n" + " GROUP_LDS_BARRIER;\n" + " u32 dstAddr = 4*lIdx;\n" + " sortData[0] = ldsSortData[dstAddr+0];\n" + " sortData[1] = ldsSortData[dstAddr+1];\n" + " sortData[2] = ldsSortData[dstAddr+2];\n" + " sortData[3] = ldsSortData[dstAddr+3];\n" + " sortVal[0] = ldsSortVal[dstAddr+0];\n" + " sortVal[1] = ldsSortVal[dstAddr+1];\n" + " sortVal[2] = ldsSortVal[dstAddr+2];\n" + " sortVal[3] = ldsSortVal[dstAddr+3];\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" + "{\n" + " __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" + " __local int 
ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" + " __local u32 localHistogramToCarry[NUM_BUCKET];\n" + " __local u32 localHistogram[NUM_BUCKET*2];\n" + " u32 gIdx = GET_GLOBAL_IDX;\n" + " u32 lIdx = GET_LOCAL_IDX;\n" + " u32 wgIdx = GET_GROUP_IDX;\n" + " u32 wgSize = GET_GROUP_SIZE;\n" + " const int n = cb.m_n;\n" + " const int nWGs = cb.m_nWGs;\n" + " const int startBit = cb.m_startBit;\n" + " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" + " if( lIdx < (NUM_BUCKET) )\n" + " {\n" + " localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " \n" + " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" + " int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" + " int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" + " for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" + " {\n" + " u32 myHistogram = 0;\n" + " int sortData[ELEMENTS_PER_WORK_ITEM];\n" + " int sortVal[ELEMENTS_PER_WORK_ITEM];\n" + " for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" + "#if defined(CHECK_BOUNDARY)\n" + " {\n" + " sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n" + " sortVal[i] = ( addr+i < n )? 
gSrc[ addr+i ].m_value : 0xffffffff;\n" + " }\n" + "#else\n" + " {\n" + " sortData[i] = gSrc[ addr+i ].m_key;\n" + " sortVal[i] = gSrc[ addr+i ].m_value;\n" + " }\n" + "#endif\n" + " sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n" + " u32 keys[ELEMENTS_PER_WORK_ITEM];\n" + " for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" + " keys[i] = (sortData[i]>>startBit) & 0xf;\n" + " { // create histogram\n" + " u32 setIdx = lIdx/16;\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogram[lIdx] = 0;\n" + " }\n" + " ldsSortData[lIdx] = 0;\n" + " GROUP_LDS_BARRIER;\n" + " for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" + "#if defined(CHECK_BOUNDARY)\n" + " if( addr+i < n )\n" + "#endif\n" + "#if defined(NV_GPU)\n" + " SET_HISTOGRAM( setIdx, keys[i] )++;\n" + "#else\n" + " AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" + "#endif\n" + " \n" + " GROUP_LDS_BARRIER;\n" + " \n" + " uint hIdx = NUM_BUCKET+lIdx;\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " u32 sum = 0;\n" + " for(int i=0; i<WG_SIZE/16; i++)\n" + " {\n" + " sum += SET_HISTOGRAM( i, lIdx );\n" + " }\n" + " myHistogram = sum;\n" + " localHistogram[hIdx] = sum;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + "#if defined(USE_2LEVEL_REDUCE)\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogram[hIdx] = localHistogram[hIdx-1];\n" + " GROUP_MEM_FENCE;\n" + " u32 u0, u1, u2;\n" + " u0 = localHistogram[hIdx-3];\n" + " u1 = localHistogram[hIdx-2];\n" + " u2 = localHistogram[hIdx-1];\n" + " AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" + " GROUP_MEM_FENCE;\n" + " u0 = localHistogram[hIdx-12];\n" + " u1 = localHistogram[hIdx-8];\n" + " u2 = localHistogram[hIdx-4];\n" + " AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" + " GROUP_MEM_FENCE;\n" + " }\n" + "#else\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogram[hIdx] = localHistogram[hIdx-1];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += localHistogram[hIdx-1];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += 
localHistogram[hIdx-2];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += localHistogram[hIdx-4];\n" + " GROUP_MEM_FENCE;\n" + " localHistogram[hIdx] += localHistogram[hIdx-8];\n" + " GROUP_MEM_FENCE;\n" + " }\n" + "#endif\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + " {\n" + " for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" + " {\n" + " int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" + " int binIdx = keys[ie];\n" + " int groupOffset = localHistogramToCarry[binIdx];\n" + " int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" + "#if defined(CHECK_BOUNDARY)\n" + " if( addr+ie < n )\n" + " {\n" + " if ((groupOffset + myIdx)<n)\n" + " {\n" + " if (sortData[ie]==sortVal[ie])\n" + " {\n" + " \n" + " SortDataCL tmp;\n" + " tmp.m_key = sortData[ie];\n" + " tmp.m_value = sortVal[ie];\n" + " if (tmp.m_key == tmp.m_value)\n" + " gDst[groupOffset + myIdx ] = tmp;\n" + " }\n" + " \n" + " }\n" + " }\n" + "#else\n" + " if ((groupOffset + myIdx)<n)\n" + " {\n" + " gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n" + " gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n" + " }\n" + "#endif\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " if( lIdx < NUM_BUCKET )\n" + " {\n" + " localHistogramToCarry[lIdx] += myHistogram;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" + "{\n" + " \n" + " u32 gIdx = GET_GLOBAL_IDX;\n" + " u32 realLocalIdx = GET_LOCAL_IDX;\n" + " u32 wgIdx = GET_GROUP_IDX;\n" + " u32 wgSize = GET_GROUP_SIZE;\n" + " const int startBit = cb.m_startBit;\n" + " const int n = cb.m_n;\n" + " const int nWGs = cb.m_nWGs;\n" + " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" + " int counter[NUM_BUCKET];\n" + " \n" + " if (realLocalIdx>0)\n" + " return;\n" + " \n" + " for (int c=0;c<NUM_BUCKET;c++)\n" + " counter[c]=0;\n" 
+ " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" + " \n" + " int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" + " for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" + " {\n" + " for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" + " {\n" + " int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" + " \n" + " for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" + " {\n" + " int i = addr2+j;\n" + " if( i < n )\n" + " {\n" + " int tableIdx;\n" + " tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" + " gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" + " counter[tableIdx] ++;\n" + " }\n" + " }\n" + " }\n" + " }\n" + " \n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" + "{\n" + " \n" + " u32 gIdx = GET_GLOBAL_IDX;\n" + " u32 realLocalIdx = GET_LOCAL_IDX;\n" + " u32 wgIdx = GET_GROUP_IDX;\n" + " u32 wgSize = GET_GROUP_SIZE;\n" + " const int startBit = cb.m_startBit;\n" + " const int n = cb.m_n;\n" + " const int nWGs = cb.m_nWGs;\n" + " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" + " int counter[NUM_BUCKET];\n" + " \n" + " if (realLocalIdx>0)\n" + " return;\n" + " \n" + " for (int c=0;c<NUM_BUCKET;c++)\n" + " counter[c]=0;\n" + " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" + " \n" + " int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" + " for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" + " {\n" + " for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" + " {\n" + " int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" + " \n" + " for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" + " {\n" + " int i = addr2+j;\n" + " if( i < n )\n" + " {\n" + " int tableIdx;\n" + " tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" + " 
gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" + " counter[tableIdx] ++;\n" + " }\n" + " }\n" + " }\n" + " }\n" + " \n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp b/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp index 161e304f09..6571f30548 100644 --- a/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp @@ -4,7 +4,6 @@ #include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" #include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h" - #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" @@ -15,38 +14,35 @@ #include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h" - #define B3_RAYCAST_PATH "src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl" - - struct b3GpuRaycastInternalData { cl_context m_context; cl_device_id m_device; - cl_command_queue m_q; + cl_command_queue m_q; cl_kernel m_raytraceKernel; cl_kernel m_raytracePairsKernel; cl_kernel m_findRayRigidPairIndexRanges; - + b3GpuParallelLinearBvh* m_plbvh; b3RadixSort32CL* m_radixSorter; b3FillCL* m_fill; - + //1 element per ray b3OpenCLArray<b3RayInfo>* m_gpuRays; b3OpenCLArray<b3RayHit>* m_gpuHitResults; b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay; b3OpenCLArray<int>* m_numRayRigidPairsPerRay; - + //1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB b3OpenCLArray<int>* m_gpuNumRayRigidPairs; - b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index - + b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index + int m_test; }; -b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q) +b3GpuRaycast::b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q) { m_data = new b3GpuRaycastInternalData; 
m_data->m_context = ctx; @@ -59,7 +55,7 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q); m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q); m_data->m_fill = new b3FillCL(ctx, device, q); - + m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q); m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q); m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q); @@ -68,19 +64,17 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q); { - cl_int errNum=0; - cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,rayCastKernelCL,&errNum,"",B3_RAYCAST_PATH); - b3Assert(errNum==CL_SUCCESS); - m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); + cl_int errNum = 0; + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, &errNum, "", B3_RAYCAST_PATH); + b3Assert(errNum == CL_SUCCESS); + m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastPairsKernel", &errNum, 
prog); + b3Assert(errNum == CL_SUCCESS); + m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "findRayRigidPairIndexRanges", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); clReleaseProgram(prog); } - - } b3GpuRaycast::~b3GpuRaycast() @@ -88,78 +82,80 @@ b3GpuRaycast::~b3GpuRaycast() clReleaseKernel(m_data->m_raytraceKernel); clReleaseKernel(m_data->m_raytracePairsKernel); clReleaseKernel(m_data->m_findRayRigidPairIndexRanges); - + delete m_data->m_plbvh; delete m_data->m_radixSorter; delete m_data->m_fill; - + delete m_data->m_gpuRays; delete m_data->m_gpuHitResults; delete m_data->m_firstRayRigidPairIndexPerRay; delete m_data->m_numRayRigidPairsPerRay; delete m_data->m_gpuNumRayRigidPairs; delete m_data->m_gpuRayRigidPairs; - + delete m_data; } -bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction) +bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction) { - b3Vector3 rs = rayFrom - spherePos; - b3Vector3 rayDir = rayTo-rayFrom; - - float A = b3Dot(rayDir,rayDir); - float B = b3Dot(rs, rayDir); - float C = b3Dot(rs, rs) - (radius * radius); - - float D = B * B - A*C; - - if (D > 0.0) - { - float t = (-B - sqrt(D))/A; - - if ( (t >= 0.0f) && (t < hitFraction) ) - { + b3Vector3 rs = rayFrom - spherePos; + b3Vector3 rayDir = rayTo - rayFrom; + + float A = b3Dot(rayDir, rayDir); + float B = b3Dot(rs, rayDir); + float C = b3Dot(rs, rs) - (radius * radius); + + float D = B * B - A * C; + + if (D > 0.0) + { + float t = (-B - sqrt(D)) / A; + + if ((t >= 0.0f) && (t < hitFraction)) + { hitFraction = t; - return true; + return true; } } return false; } bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const b3ConvexPolyhedronData& poly, - const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, 
b3Vector3& hitNormal) + const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal) { float exitFraction = hitFraction; float enterFraction = -0.1f; - b3Vector3 curHitNormal=b3MakeVector3(0,0,0); - for (int i=0;i<poly.m_numFaces;i++) + b3Vector3 curHitNormal = b3MakeVector3(0, 0, 0); + for (int i = 0; i < poly.m_numFaces; i++) { - const b3GpuFace& face = faces[poly.m_faceOffset+i]; - float fromPlaneDist = b3Dot(rayFromLocal,face.m_plane)+face.m_plane.w; - float toPlaneDist = b3Dot(rayToLocal,face.m_plane)+face.m_plane.w; - if (fromPlaneDist<0.f) + const b3GpuFace& face = faces[poly.m_faceOffset + i]; + float fromPlaneDist = b3Dot(rayFromLocal, face.m_plane) + face.m_plane.w; + float toPlaneDist = b3Dot(rayToLocal, face.m_plane) + face.m_plane.w; + if (fromPlaneDist < 0.f) { if (toPlaneDist >= 0.f) { - float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist); - if (exitFraction>fraction) + float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist); + if (exitFraction > fraction) { exitFraction = fraction; } - } - } else + } + } + else { - if (toPlaneDist<0.f) + if (toPlaneDist < 0.f) { - float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist); + float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist); if (enterFraction <= fraction) { enterFraction = fraction; curHitNormal = face.m_plane; curHitNormal.w = 0.f; } - } else + } + else { return false; } @@ -176,44 +172,41 @@ bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const return true; } -void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, - int numBodies,const struct b3RigidBodyData* bodies, int numCollidables,const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData) +void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies, const struct b3RigidBodyData* 
bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData) { - -// return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables); + // return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables); B3_PROFILE("castRaysHost"); - for (int r=0;r<rays.size();r++) + for (int r = 0; r < rays.size(); r++) { b3Vector3 rayFrom = rays[r].m_from; b3Vector3 rayTo = rays[r].m_to; float hitFraction = hitResults[r].m_hitFraction; - int hitBodyIndex= -1; + int hitBodyIndex = -1; b3Vector3 hitNormal; - for (int b=0;b<numBodies;b++) + for (int b = 0; b < numBodies; b++) { - const b3Vector3& pos = bodies[b].m_pos; //const b3Quaternion& orn = bodies[b].m_quat; - + switch (collidables[bodies[b].m_collidableIdx].m_shapeType) { - case SHAPE_SPHERE: + case SHAPE_SPHERE: { b3Scalar radius = collidables[bodies[b].m_collidableIdx].m_radius; - if (sphere_intersect(pos, radius, rayFrom, rayTo,hitFraction)) + if (sphere_intersect(pos, radius, rayFrom, rayTo, hitFraction)) { hitBodyIndex = b; b3Vector3 hitPoint; - hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction); - hitNormal = (hitPoint-bodies[b].m_pos).normalize(); + hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction); + hitNormal = (hitPoint - bodies[b].m_pos).normalize(); } } - case SHAPE_CONVEX_HULL: + case SHAPE_CONVEX_HULL: { - b3Transform convexWorldTransform; convexWorldTransform.setIdentity(); convexWorldTransform.setOrigin(bodies[b].m_pos); @@ -222,72 +215,67 @@ void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3A b3Vector3 rayFromLocal = convexWorld2Local(rayFrom); b3Vector3 rayToLocal = convexWorld2Local(rayTo); - - + int shapeIndex = collidables[bodies[b].m_collidableIdx].m_shapeIndex; const b3ConvexPolyhedronData& poly = narrowphaseData->m_convexPolyhedra[shapeIndex]; - if (rayConvex(rayFromLocal, rayToLocal,poly,narrowphaseData->m_convexFaces, hitFraction, hitNormal)) + if 
(rayConvex(rayFromLocal, rayToLocal, poly, narrowphaseData->m_convexFaces, hitFraction, hitNormal)) { hitBodyIndex = b; } - break; } - default: + default: { - static bool once=true; + static bool once = true; if (once) { - once=false; + once = false; b3Warning("Raytest: unsupported shape type\n"); } } } } - if (hitBodyIndex>=0) + if (hitBodyIndex >= 0) { - hitResults[r].m_hitFraction = hitFraction; - hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction); + hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction); hitResults[r].m_hitNormal = hitNormal; hitResults[r].m_hitBody = hitBodyIndex; } - } } ///todo: add some acceleration structure (AABBs, tree etc) -void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, - int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, - const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase) +void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, + const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase) { //castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData); B3_PROFILE("castRaysGPU"); - + { B3_PROFILE("raycast copyFromHost"); m_data->m_gpuRays->copyFromHost(rays); m_data->m_gpuHitResults->copyFromHost(hitResults); - } - + int numRays = hitResults.size(); { m_data->m_firstRayRigidPairIndexPerRay->resize(numRays); m_data->m_numRayRigidPairsPerRay->resize(numRays); - + m_data->m_gpuNumRayRigidPairs->resize(1); m_data->m_gpuRayRigidPairs->resize(numRays * 16); } - + //run kernel const bool USE_BRUTE_FORCE_RAYCAST = false; - if(USE_BRUTE_FORCE_RAYCAST) + if 
(USE_BRUTE_FORCE_RAYCAST) { B3_PROFILE("raycast launch1D"); - b3LauncherCL launcher(m_data->m_q,m_data->m_raytraceKernel,"m_raytraceKernel"); + b3LauncherCL launcher(m_data->m_q, m_data->m_raytraceKernel, "m_raytraceKernel"); int numRays = rays.size(); launcher.setConst(numRays); @@ -299,93 +287,88 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3Align launcher.setBuffer(narrowphaseData->m_collidablesGPU->getBufferCL()); launcher.setBuffer(narrowphaseData->m_convexFacesGPU->getBufferCL()); launcher.setBuffer(narrowphaseData->m_convexPolyhedraGPU->getBufferCL()); - + launcher.launch1D(numRays); clFinish(m_data->m_q); } else { - m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() ); + m_data->m_plbvh->build(broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU()); m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs); - + int numRayRigidPairs = -1; m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1); - if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() ) + if (numRayRigidPairs > m_data->m_gpuRayRigidPairs->size()) { numRayRigidPairs = m_data->m_gpuRayRigidPairs->size(); m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1); } - - m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct - + + m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct + //Sort ray-rigid pairs by ray index { B3_PROFILE("sort ray-rigid pairs"); - m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) ); + m_data->m_radixSorter->execute(*reinterpret_cast<b3OpenCLArray<b3SortData>*>(m_data->m_gpuRayRigidPairs)); } - + //detect start,count of each ray pair { 
B3_PROFILE("detect ray-rigid pair index ranges"); - + { B3_PROFILE("reset ray-rigid pair index ranges"); - - m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index + + m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays); clFinish(m_data->m_q); } - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ), - - b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ), - b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL()), + + b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()), + b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL())}; + b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numRayRigidPairs); - + launcher.launch1D(numRayRigidPairs); clFinish(m_data->m_q); } - + { B3_PROFILE("ray-rigid intersection"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ), - b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ), - b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ), - b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ), - - b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ), - b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ), - b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ), - b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ), - - b3BufferInfoCL( 
m_data->m_gpuRayRigidPairs->getBufferCL() ) - }; - + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL(m_data->m_gpuRays->getBufferCL()), + b3BufferInfoCL(m_data->m_gpuHitResults->getBufferCL()), + b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()), + b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL()), + + b3BufferInfoCL(narrowphaseData->m_bodyBufferGPU->getBufferCL()), + b3BufferInfoCL(narrowphaseData->m_collidablesGPU->getBufferCL()), + b3BufferInfoCL(narrowphaseData->m_convexFacesGPU->getBufferCL()), + b3BufferInfoCL(narrowphaseData->m_convexPolyhedraGPU->getBufferCL()), + + b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL())}; + b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numRays); - + launcher.launch1D(numRays); clFinish(m_data->m_q); } } - - //copy results { B3_PROFILE("raycast copyToHost"); m_data->m_gpuHitResults->copyToHost(hitResults); } - }
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.h b/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.h index 3a5cf44b79..f1f6ffd402 100644 --- a/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.h +++ b/thirdparty/bullet/Bullet3OpenCL/Raycast/b3GpuRaycast.h @@ -7,26 +7,22 @@ #include "Bullet3Common/b3AlignedObjectArray.h" #include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h" - - class b3GpuRaycast { protected: struct b3GpuRaycastInternalData* m_data; + public: - b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q); + b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q); virtual ~b3GpuRaycast(); - void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults, - int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, - const struct b3GpuNarrowPhaseInternalData* narrowphaseData); - - void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, - int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, - const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase); - + void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, + const struct b3GpuNarrowPhaseInternalData* narrowphaseData); - + void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults, + int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, + const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase); }; -#endif //B3_GPU_RAYCAST_H +#endif //B3_GPU_RAYCAST_H diff 
--git a/thirdparty/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h b/thirdparty/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h index 6257909a4d..94f6a8eb9f 100644 --- a/thirdparty/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h @@ -1,381 +1,380 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* rayCastKernelCL= \ -"#define SHAPE_CONVEX_HULL 3\n" -"#define SHAPE_PLANE 4\n" -"#define SHAPE_CONCAVE_TRIMESH 5\n" -"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" -"#define SHAPE_SPHERE 7\n" -"typedef struct\n" -"{\n" -" float4 m_from;\n" -" float4 m_to;\n" -"} b3RayInfo;\n" -"typedef struct\n" -"{\n" -" float m_hitFraction;\n" -" int m_hitResult0;\n" -" int m_hitResult1;\n" -" int m_hitResult2;\n" -" float4 m_hitPoint;\n" -" float4 m_hitNormal;\n" -"} b3RayHit;\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" float4 m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" unsigned int m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} Body;\n" -"typedef struct Collidable\n" -"{\n" -" union {\n" -" int m_numChildShapes;\n" -" int m_bvhIndex;\n" -" };\n" -" float m_radius;\n" -" int m_shapeType;\n" -" int m_shapeIndex;\n" -"} Collidable;\n" -"typedef struct \n" -"{\n" -" float4 m_localCenter;\n" -" float4 m_extents;\n" -" float4 mC;\n" -" float4 mE;\n" -" float m_radius;\n" -" int m_faceOffset;\n" -" int m_numFaces;\n" -" int m_numVertices;\n" -" int m_vertexOffset;\n" -" int m_uniqueEdgesOffset;\n" -" int m_numUniqueEdges;\n" -" int m_unused;\n" -"} ConvexPolyhedronCL;\n" -"typedef struct\n" -"{\n" -" float4 m_plane;\n" -" int m_indexOffset;\n" -" int m_numIndices;\n" -"} b3GpuFace;\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"__inline\n" -" 
Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -" Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -" Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -" float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = (float4)(a.xyz,0.f);\n" -" float4 b1 = (float4)(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -" Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross( a, b );\n" -" ans += a.w*b+b.w*a;\n" -" // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -" Quaternion qtNormalize(Quaternion in)\n" -"{\n" -" return fast_normalize(in);\n" -" // in /= length( in );\n" -" // return in;\n" -"}\n" -"__inline\n" -" float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(q,vcpy);\n" -" out = qtMul(out,qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -" Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -" float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"void trInverse(float4 translationIn, Quaternion orientationIn,\n" -" float4* translationOut, Quaternion* orientationOut)\n" -"{\n" -" *orientationOut = qtInvert(orientationIn);\n" -" *translationOut = qtRotate(*orientationOut, -translationIn);\n" -"}\n" -"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n" -" __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n" -"{\n" -" rayFromLocal.w = 0.f;\n" -" rayToLocal.w = 0.f;\n" -" bool result = true;\n" -" float exitFraction = hitFraction[0];\n" -" float enterFraction = -0.3f;\n" -" float4 curHitNormal = (float4)(0,0,0,0);\n" -" for (int i=0;i<numFaces && result;i++)\n" -" {\n" -" b3GpuFace face = faces[faceOffset+i];\n" -" float fromPlaneDist = 
dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n" -" float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n" -" if (fromPlaneDist<0.f)\n" -" {\n" -" if (toPlaneDist >= 0.f)\n" -" {\n" -" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n" -" if (exitFraction>fraction)\n" -" {\n" -" exitFraction = fraction;\n" -" }\n" -" } \n" -" } else\n" -" {\n" -" if (toPlaneDist<0.f)\n" -" {\n" -" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n" -" if (enterFraction <= fraction)\n" -" {\n" -" enterFraction = fraction;\n" -" curHitNormal = face.m_plane;\n" -" curHitNormal.w = 0.f;\n" -" }\n" -" } else\n" -" {\n" -" result = false;\n" -" }\n" -" }\n" -" if (exitFraction <= enterFraction)\n" -" result = false;\n" -" }\n" -" if (enterFraction < 0.f)\n" -" {\n" -" result = false;\n" -" }\n" -" if (result)\n" -" { \n" -" hitFraction[0] = enterFraction;\n" -" hitNormal[0] = curHitNormal;\n" -" }\n" -" return result;\n" -"}\n" -"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n" -"{\n" -" float4 rs = rayFrom - spherePos;\n" -" rs.w = 0.f;\n" -" float4 rayDir = rayTo-rayFrom;\n" -" rayDir.w = 0.f;\n" -" float A = dot(rayDir,rayDir);\n" -" float B = dot(rs, rayDir);\n" -" float C = dot(rs, rs) - (radius * radius);\n" -" float D = B * B - A*C;\n" -" if (D > 0.0f)\n" -" {\n" -" float t = (-B - sqrt(D))/A;\n" -" if ( (t >= 0.0f) && (t < (*hitFraction)) )\n" -" {\n" -" *hitFraction = t;\n" -" return true;\n" -" }\n" -" }\n" -" return false;\n" -"}\n" -"float4 setInterpolate3(float4 from, float4 to, float t)\n" -"{\n" -" float s = 1.0f - t;\n" -" float4 result;\n" -" result = s * from + t * to;\n" -" result.w = 0.f; \n" -" return result; \n" -"}\n" -"__kernel void rayCastKernel( \n" -" int numRays, \n" -" const __global b3RayInfo* rays, \n" -" __global b3RayHit* hitResults, \n" -" const int numBodies, \n" -" __global Body* bodies,\n" -" __global Collidable* collidables,\n" -" __global const 
b3GpuFace* faces,\n" -" __global const ConvexPolyhedronCL* convexShapes )\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numRays)\n" -" return;\n" -" hitResults[i].m_hitFraction = 1.f;\n" -" float4 rayFrom = rays[i].m_from;\n" -" float4 rayTo = rays[i].m_to;\n" -" float hitFraction = 1.f;\n" -" float4 hitPoint;\n" -" float4 hitNormal;\n" -" int hitBodyIndex= -1;\n" -" int cachedCollidableIndex = -1;\n" -" Collidable cachedCollidable;\n" -" for (int b=0;b<numBodies;b++)\n" -" {\n" -" if (hitResults[i].m_hitResult2==b)\n" -" continue;\n" -" Body body = bodies[b];\n" -" float4 pos = body.m_pos;\n" -" float4 orn = body.m_quat;\n" -" if (cachedCollidableIndex != body.m_collidableIdx)\n" -" {\n" -" cachedCollidableIndex = body.m_collidableIdx;\n" -" cachedCollidable = collidables[cachedCollidableIndex];\n" -" }\n" -" if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n" -" {\n" -" float4 invPos = (float4)(0,0,0,0);\n" -" float4 invOrn = (float4)(0,0,0,0);\n" -" float4 rayFromLocal = (float4)(0,0,0,0);\n" -" float4 rayToLocal = (float4)(0,0,0,0);\n" -" invOrn = qtInvert(orn);\n" -" invPos = qtRotate(invOrn, -pos);\n" -" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n" -" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n" -" rayFromLocal.w = 0.f;\n" -" rayToLocal.w = 0.f;\n" -" int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n" -" int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n" -" if (numFaces)\n" -" {\n" -" if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n" -" {\n" -" hitBodyIndex = b;\n" -" \n" -" }\n" -" }\n" -" }\n" -" if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n" -" {\n" -" float radius = cachedCollidable.m_radius;\n" -" \n" -" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n" -" {\n" -" hitBodyIndex = b;\n" -" hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n" -" }\n" -" }\n" -" }\n" -" if (hitBodyIndex>=0)\n" -" {\n" -" 
hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n" -" hitResults[i].m_hitFraction = hitFraction;\n" -" hitResults[i].m_hitPoint = hitPoint;\n" -" hitResults[i].m_hitNormal = normalize(hitNormal);\n" -" hitResults[i].m_hitResult0 = hitBodyIndex;\n" -" }\n" -"}\n" -"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n" -" __global int* out_firstRayRigidPairIndexPerRay,\n" -" __global int* out_numRayRigidPairsPerRay,\n" -" int numRayRigidPairs)\n" -"{\n" -" int rayRigidPairIndex = get_global_id(0);\n" -" if (rayRigidPairIndex >= numRayRigidPairs) return;\n" -" \n" -" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n" -" \n" -" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n" -" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n" -"}\n" -"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n" -" __global b3RayHit* hitResults, \n" -" __global int* firstRayRigidPairIndexPerRay,\n" -" __global int* numRayRigidPairsPerRay,\n" -" \n" -" __global Body* bodies,\n" -" __global Collidable* collidables,\n" -" __global const b3GpuFace* faces,\n" -" __global const ConvexPolyhedronCL* convexShapes,\n" -" \n" -" __global int2* rayRigidPairs,\n" -" int numRays)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i >= numRays) return;\n" -" \n" -" float4 rayFrom = rays[i].m_from;\n" -" float4 rayTo = rays[i].m_to;\n" -" \n" -" hitResults[i].m_hitFraction = 1.f;\n" -" \n" -" float hitFraction = 1.f;\n" -" float4 hitPoint;\n" -" float4 hitNormal;\n" -" int hitBodyIndex = -1;\n" -" \n" -" //\n" -" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n" -" {\n" -" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n" -" int b = rayRigidPairs[rayRigidPairIndex].y;\n" -" \n" -" if (hitResults[i].m_hitResult2 == b) continue;\n" -" \n" -" Body body = bodies[b];\n" -" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n" -" \n" -" float4 pos = body.m_pos;\n" -" float4 orn = 
body.m_quat;\n" -" \n" -" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n" -" {\n" -" float4 invPos = (float4)(0,0,0,0);\n" -" float4 invOrn = (float4)(0,0,0,0);\n" -" float4 rayFromLocal = (float4)(0,0,0,0);\n" -" float4 rayToLocal = (float4)(0,0,0,0);\n" -" invOrn = qtInvert(orn);\n" -" invPos = qtRotate(invOrn, -pos);\n" -" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n" -" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n" -" rayFromLocal.w = 0.f;\n" -" rayToLocal.w = 0.f;\n" -" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n" -" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n" -" \n" -" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n" -" {\n" -" hitBodyIndex = b;\n" -" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" -" }\n" -" }\n" -" \n" -" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n" -" {\n" -" float radius = rigidCollidable.m_radius;\n" -" \n" -" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n" -" {\n" -" hitBodyIndex = b;\n" -" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" -" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n" -" }\n" -" }\n" -" }\n" -" \n" -" if (hitBodyIndex >= 0)\n" -" {\n" -" hitResults[i].m_hitFraction = hitFraction;\n" -" hitResults[i].m_hitPoint = hitPoint;\n" -" hitResults[i].m_hitNormal = normalize(hitNormal);\n" -" hitResults[i].m_hitResult0 = hitBodyIndex;\n" -" }\n" -" \n" -"}\n" -; +static const char* rayCastKernelCL = + "#define SHAPE_CONVEX_HULL 3\n" + "#define SHAPE_PLANE 4\n" + "#define SHAPE_CONCAVE_TRIMESH 5\n" + "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n" + "#define SHAPE_SPHERE 7\n" + "typedef struct\n" + "{\n" + " float4 m_from;\n" + " float4 m_to;\n" + "} b3RayInfo;\n" + "typedef struct\n" + "{\n" + " float m_hitFraction;\n" + " int m_hitResult0;\n" + " int m_hitResult1;\n" + " int m_hitResult2;\n" + " float4 m_hitPoint;\n" + " float4 
m_hitNormal;\n" + "} b3RayHit;\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " float4 m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " unsigned int m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} Body;\n" + "typedef struct Collidable\n" + "{\n" + " union {\n" + " int m_numChildShapes;\n" + " int m_bvhIndex;\n" + " };\n" + " float m_radius;\n" + " int m_shapeType;\n" + " int m_shapeIndex;\n" + "} Collidable;\n" + "typedef struct \n" + "{\n" + " float4 m_localCenter;\n" + " float4 m_extents;\n" + " float4 mC;\n" + " float4 mE;\n" + " float m_radius;\n" + " int m_faceOffset;\n" + " int m_numFaces;\n" + " int m_numVertices;\n" + " int m_vertexOffset;\n" + " int m_uniqueEdgesOffset;\n" + " int m_numUniqueEdges;\n" + " int m_unused;\n" + "} ConvexPolyhedronCL;\n" + "typedef struct\n" + "{\n" + " float4 m_plane;\n" + " int m_indexOffset;\n" + " int m_numIndices;\n" + "} b3GpuFace;\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "__inline\n" + " Quaternion qtMul(Quaternion a, Quaternion b);\n" + "__inline\n" + " Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + " Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + " float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = (float4)(a.xyz,0.f);\n" + " float4 b1 = (float4)(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + " Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross( a, b );\n" + " ans += a.w*b+b.w*a;\n" + " // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + " Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fast_normalize(in);\n" + " // in /= length( in );\n" + " // return in;\n" + "}\n" + "__inline\n" + " float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " 
Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(q,vcpy);\n" + " out = qtMul(out,qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + " Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + " float4 qtInvRotate(const Quaternion q, float4 vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "void trInverse(float4 translationIn, Quaternion orientationIn,\n" + " float4* translationOut, Quaternion* orientationOut)\n" + "{\n" + " *orientationOut = qtInvert(orientationIn);\n" + " *translationOut = qtRotate(*orientationOut, -translationIn);\n" + "}\n" + "bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n" + " __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n" + "{\n" + " rayFromLocal.w = 0.f;\n" + " rayToLocal.w = 0.f;\n" + " bool result = true;\n" + " float exitFraction = hitFraction[0];\n" + " float enterFraction = -0.3f;\n" + " float4 curHitNormal = (float4)(0,0,0,0);\n" + " for (int i=0;i<numFaces && result;i++)\n" + " {\n" + " b3GpuFace face = faces[faceOffset+i];\n" + " float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n" + " float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n" + " if (fromPlaneDist<0.f)\n" + " {\n" + " if (toPlaneDist >= 0.f)\n" + " {\n" + " float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n" + " if (exitFraction>fraction)\n" + " {\n" + " exitFraction = fraction;\n" + " }\n" + " } \n" + " } else\n" + " {\n" + " if (toPlaneDist<0.f)\n" + " {\n" + " float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n" + " if (enterFraction <= fraction)\n" + " {\n" + " enterFraction = fraction;\n" + " curHitNormal = face.m_plane;\n" + " curHitNormal.w = 0.f;\n" + " }\n" + " } else\n" + " {\n" + " result = false;\n" + " }\n" + " }\n" + " if (exitFraction <= enterFraction)\n" + " result = false;\n" + " }\n" + " if 
(enterFraction < 0.f)\n" + " {\n" + " result = false;\n" + " }\n" + " if (result)\n" + " { \n" + " hitFraction[0] = enterFraction;\n" + " hitNormal[0] = curHitNormal;\n" + " }\n" + " return result;\n" + "}\n" + "bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n" + "{\n" + " float4 rs = rayFrom - spherePos;\n" + " rs.w = 0.f;\n" + " float4 rayDir = rayTo-rayFrom;\n" + " rayDir.w = 0.f;\n" + " float A = dot(rayDir,rayDir);\n" + " float B = dot(rs, rayDir);\n" + " float C = dot(rs, rs) - (radius * radius);\n" + " float D = B * B - A*C;\n" + " if (D > 0.0f)\n" + " {\n" + " float t = (-B - sqrt(D))/A;\n" + " if ( (t >= 0.0f) && (t < (*hitFraction)) )\n" + " {\n" + " *hitFraction = t;\n" + " return true;\n" + " }\n" + " }\n" + " return false;\n" + "}\n" + "float4 setInterpolate3(float4 from, float4 to, float t)\n" + "{\n" + " float s = 1.0f - t;\n" + " float4 result;\n" + " result = s * from + t * to;\n" + " result.w = 0.f; \n" + " return result; \n" + "}\n" + "__kernel void rayCastKernel( \n" + " int numRays, \n" + " const __global b3RayInfo* rays, \n" + " __global b3RayHit* hitResults, \n" + " const int numBodies, \n" + " __global Body* bodies,\n" + " __global Collidable* collidables,\n" + " __global const b3GpuFace* faces,\n" + " __global const ConvexPolyhedronCL* convexShapes )\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numRays)\n" + " return;\n" + " hitResults[i].m_hitFraction = 1.f;\n" + " float4 rayFrom = rays[i].m_from;\n" + " float4 rayTo = rays[i].m_to;\n" + " float hitFraction = 1.f;\n" + " float4 hitPoint;\n" + " float4 hitNormal;\n" + " int hitBodyIndex= -1;\n" + " int cachedCollidableIndex = -1;\n" + " Collidable cachedCollidable;\n" + " for (int b=0;b<numBodies;b++)\n" + " {\n" + " if (hitResults[i].m_hitResult2==b)\n" + " continue;\n" + " Body body = bodies[b];\n" + " float4 pos = body.m_pos;\n" + " float4 orn = body.m_quat;\n" + " if (cachedCollidableIndex != body.m_collidableIdx)\n" + " 
{\n" + " cachedCollidableIndex = body.m_collidableIdx;\n" + " cachedCollidable = collidables[cachedCollidableIndex];\n" + " }\n" + " if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n" + " {\n" + " float4 invPos = (float4)(0,0,0,0);\n" + " float4 invOrn = (float4)(0,0,0,0);\n" + " float4 rayFromLocal = (float4)(0,0,0,0);\n" + " float4 rayToLocal = (float4)(0,0,0,0);\n" + " invOrn = qtInvert(orn);\n" + " invPos = qtRotate(invOrn, -pos);\n" + " rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n" + " rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n" + " rayFromLocal.w = 0.f;\n" + " rayToLocal.w = 0.f;\n" + " int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n" + " int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n" + " if (numFaces)\n" + " {\n" + " if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n" + " {\n" + " hitBodyIndex = b;\n" + " \n" + " }\n" + " }\n" + " }\n" + " if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n" + " {\n" + " float radius = cachedCollidable.m_radius;\n" + " \n" + " if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n" + " {\n" + " hitBodyIndex = b;\n" + " hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n" + " }\n" + " }\n" + " }\n" + " if (hitBodyIndex>=0)\n" + " {\n" + " hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n" + " hitResults[i].m_hitFraction = hitFraction;\n" + " hitResults[i].m_hitPoint = hitPoint;\n" + " hitResults[i].m_hitNormal = normalize(hitNormal);\n" + " hitResults[i].m_hitResult0 = hitBodyIndex;\n" + " }\n" + "}\n" + "__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n" + " __global int* out_firstRayRigidPairIndexPerRay,\n" + " __global int* out_numRayRigidPairsPerRay,\n" + " int numRayRigidPairs)\n" + "{\n" + " int rayRigidPairIndex = get_global_id(0);\n" + " if (rayRigidPairIndex >= numRayRigidPairs) return;\n" + " \n" + " int rayIndex = 
rayRigidPairs[rayRigidPairIndex].x;\n" + " \n" + " atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n" + " atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n" + "}\n" + "__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n" + " __global b3RayHit* hitResults, \n" + " __global int* firstRayRigidPairIndexPerRay,\n" + " __global int* numRayRigidPairsPerRay,\n" + " \n" + " __global Body* bodies,\n" + " __global Collidable* collidables,\n" + " __global const b3GpuFace* faces,\n" + " __global const ConvexPolyhedronCL* convexShapes,\n" + " \n" + " __global int2* rayRigidPairs,\n" + " int numRays)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i >= numRays) return;\n" + " \n" + " float4 rayFrom = rays[i].m_from;\n" + " float4 rayTo = rays[i].m_to;\n" + " \n" + " hitResults[i].m_hitFraction = 1.f;\n" + " \n" + " float hitFraction = 1.f;\n" + " float4 hitPoint;\n" + " float4 hitNormal;\n" + " int hitBodyIndex = -1;\n" + " \n" + " //\n" + " for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n" + " {\n" + " int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n" + " int b = rayRigidPairs[rayRigidPairIndex].y;\n" + " \n" + " if (hitResults[i].m_hitResult2 == b) continue;\n" + " \n" + " Body body = bodies[b];\n" + " Collidable rigidCollidable = collidables[body.m_collidableIdx];\n" + " \n" + " float4 pos = body.m_pos;\n" + " float4 orn = body.m_quat;\n" + " \n" + " if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n" + " {\n" + " float4 invPos = (float4)(0,0,0,0);\n" + " float4 invOrn = (float4)(0,0,0,0);\n" + " float4 rayFromLocal = (float4)(0,0,0,0);\n" + " float4 rayToLocal = (float4)(0,0,0,0);\n" + " invOrn = qtInvert(orn);\n" + " invPos = qtRotate(invOrn, -pos);\n" + " rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n" + " rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n" + " rayFromLocal.w = 0.f;\n" + " rayToLocal.w = 0.f;\n" + " int numFaces = 
convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n" + " int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n" + " \n" + " if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n" + " {\n" + " hitBodyIndex = b;\n" + " hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" + " }\n" + " }\n" + " \n" + " if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n" + " {\n" + " float radius = rigidCollidable.m_radius;\n" + " \n" + " if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n" + " {\n" + " hitBodyIndex = b;\n" + " hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" + " hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n" + " }\n" + " }\n" + " }\n" + " \n" + " if (hitBodyIndex >= 0)\n" + " {\n" + " hitResults[i].m_hitFraction = hitFraction;\n" + " hitResults[i].m_hitPoint = hitPoint;\n" + " hitResults[i].m_hitNormal = normalize(hitNormal);\n" + " hitResults[i].m_hitResult0 = hitBodyIndex;\n" + " }\n" + " \n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h index c7478f54a1..89c0142ab3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h @@ -5,14 +5,13 @@ #include "Bullet3Dynamics/shared/b3ContactConstraint4.h" - -B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4 : public b3ContactConstraint4 +B3_ATTRIBUTE_ALIGNED16(struct) +b3GpuConstraint4 : public b3ContactConstraint4 { B3_DECLARE_ALIGNED_ALLOCATOR(); - inline void setFrictionCoeff(float value) { m_linear[3] = value; } - inline float getFrictionCoeff() const { return m_linear[3]; } + inline void setFrictionCoeff(float value) { m_linear[3] = value; } + inline float getFrictionCoeff() const { return m_linear[3]; } }; -#endif //B3_CONSTRAINT4_h - +#endif //B3_CONSTRAINT4_h diff --git 
a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp index af687b54e9..a271090af4 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.cpp @@ -19,11 +19,11 @@ subject to the following restrictions: #include <new> #include "Bullet3Common/b3Transform.h" -void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData* bodies) +void b3GpuGenericConstraint::getInfo1(unsigned int* info, const b3RigidBodyData* bodies) { switch (m_constraintType) { - case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: { *info = 3; break; @@ -35,7 +35,7 @@ void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData* }; } -void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies) +void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies) { b3Transform trA; trA.setIdentity(); @@ -47,54 +47,52 @@ void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo trB.setOrigin(bodies[constraint->m_rbB].m_pos); trB.setRotation(bodies[constraint->m_rbB].m_quat); - // anchor points in global coordinates with respect to body PORs. - - // set jacobian - info->m_J1linearAxis[0] = 1; - info->m_J1linearAxis[info->rowskip+1] = 1; - info->m_J1linearAxis[2*info->rowskip+2] = 1; + // anchor points in global coordinates with respect to body PORs. 
- b3Vector3 a1 = trA.getBasis()*constraint->getPivotInA(); + // set jacobian + info->m_J1linearAxis[0] = 1; + info->m_J1linearAxis[info->rowskip + 1] = 1; + info->m_J1linearAxis[2 * info->rowskip + 2] = 1; + + b3Vector3 a1 = trA.getBasis() * constraint->getPivotInA(); //b3Vector3 a1a = b3QuatRotate(trA.getRotation(),constraint->getPivotInA()); { b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis); - b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis+info->rowskip); - b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis+2*info->rowskip); + b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis + info->rowskip); + b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis + 2 * info->rowskip); b3Vector3 a1neg = -a1; - a1neg.getSkewSymmetricMatrix(angular0,angular1,angular2); + a1neg.getSkewSymmetricMatrix(angular0, angular1, angular2); } - + if (info->m_J2linearAxis) { info->m_J2linearAxis[0] = -1; - info->m_J2linearAxis[info->rowskip+1] = -1; - info->m_J2linearAxis[2*info->rowskip+2] = -1; + info->m_J2linearAxis[info->rowskip + 1] = -1; + info->m_J2linearAxis[2 * info->rowskip + 2] = -1; } - - b3Vector3 a2 = trB.getBasis()*constraint->getPivotInB(); - + + b3Vector3 a2 = trB.getBasis() * constraint->getPivotInB(); + { - // b3Vector3 a2n = -a2; + // b3Vector3 a2n = -a2; b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis); - b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis+info->rowskip); - b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis+2*info->rowskip); - a2.getSkewSymmetricMatrix(angular0,angular1,angular2); + b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis + info->rowskip); + b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis + 2 * info->rowskip); + a2.getSkewSymmetricMatrix(angular0, angular1, angular2); } - - - // set right hand side -// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp; + // set right hand side + // b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? 
m_erp : info->erp; b3Scalar currERP = info->erp; b3Scalar k = info->fps * currERP; - int j; - for (j=0; j<3; j++) - { - info->m_constraintError[j*info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]); + int j; + for (j = 0; j < 3; j++) + { + info->m_constraintError[j * info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]); //printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]); - } + } #if 0 if(m_flags & B3_P2P_FLAGS_CFM) { @@ -117,21 +115,20 @@ void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo } info->m_damping = m_setting.m_damping; #endif - } -void b3GpuGenericConstraint::getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies) +void b3GpuGenericConstraint::getInfo2(b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies) { switch (m_constraintType) { - case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: { - getInfo2Point2Point(this,info,bodies); + getInfo2Point2Point(this, info, bodies); break; }; default: - { - b3Assert(0); - } + { + b3Assert(0); + } }; } diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h index 14b3ba7fec..1f163ba7d5 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h @@ -20,37 +20,35 @@ subject to the following restrictions: struct b3RigidBodyData; enum B3_CONSTRAINT_FLAGS { - B3_CONSTRAINT_FLAG_ENABLED=1, + B3_CONSTRAINT_FLAG_ENABLED = 1, }; enum b3GpuGenericConstraintType { - B3_GPU_POINT2POINT_CONSTRAINT_TYPE=3, - B3_GPU_FIXED_CONSTRAINT_TYPE=4, -// B3_HINGE_CONSTRAINT_TYPE, -// B3_CONETWIST_CONSTRAINT_TYPE, -// B3_D6_CONSTRAINT_TYPE, -// B3_SLIDER_CONSTRAINT_TYPE, -// B3_CONTACT_CONSTRAINT_TYPE, -// B3_D6_SPRING_CONSTRAINT_TYPE, -// B3_GEAR_CONSTRAINT_TYPE, - + B3_GPU_POINT2POINT_CONSTRAINT_TYPE = 3, 
+ B3_GPU_FIXED_CONSTRAINT_TYPE = 4, + // B3_HINGE_CONSTRAINT_TYPE, + // B3_CONETWIST_CONSTRAINT_TYPE, + // B3_D6_CONSTRAINT_TYPE, + // B3_SLIDER_CONSTRAINT_TYPE, + // B3_CONTACT_CONSTRAINT_TYPE, + // B3_D6_SPRING_CONSTRAINT_TYPE, + // B3_GEAR_CONSTRAINT_TYPE, + B3_GPU_MAX_CONSTRAINT_TYPE }; - - -struct b3GpuConstraintInfo2 +struct b3GpuConstraintInfo2 { // integrator parameters: frames per second (1/stepsize), default error // reduction parameter (0..1). - b3Scalar fps,erp; + b3Scalar fps, erp; // for the first and second body, pointers to two (linear and angular) // n*3 jacobian sub matrices, stored by rows. these matrices will have // been initialized to 0 on entry. if the second body is zero then the // J2xx pointers may be 0. - b3Scalar *m_J1linearAxis,*m_J1angularAxis,*m_J2linearAxis,*m_J2angularAxis; + b3Scalar *m_J1linearAxis, *m_J1angularAxis, *m_J2linearAxis, *m_J2angularAxis; // elements to jump from one row to the next in J's int rowskip; @@ -58,44 +56,44 @@ struct b3GpuConstraintInfo2 // right hand sides of the equation J*v = c + cfm * lambda. cfm is the // "constraint force mixing" vector. c is set to zero on entry, cfm is // set to a constant value (typically very small or zero) value on entry. - b3Scalar *m_constraintError,*cfm; + b3Scalar *m_constraintError, *cfm; // lo and hi limits for variables (set to -/+ infinity on entry). - b3Scalar *m_lowerLimit,*m_upperLimit; + b3Scalar *m_lowerLimit, *m_upperLimit; // findex vector for variables. see the LCP solver interface for a // description of what this does. this is set to -1 on entry. // note that the returned indexes are relative to the first index of // the constraint. 
- int *findex; + int* findex; // number of solver iterations int m_numIterations; //damping of the velocity - b3Scalar m_damping; + b3Scalar m_damping; }; - -B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint +B3_ATTRIBUTE_ALIGNED16(struct) +b3GpuGenericConstraint { - int m_constraintType; - int m_rbA; - int m_rbB; - float m_breakingImpulseThreshold; + int m_constraintType; + int m_rbA; + int m_rbB; + float m_breakingImpulseThreshold; b3Vector3 m_pivotInA; b3Vector3 m_pivotInB; b3Quaternion m_relTargetAB; - int m_flags; + int m_flags; int m_uid; int m_padding[2]; - int getRigidBodyA() const + int getRigidBodyA() const { return m_rbA; } - int getRigidBodyB() const + int getRigidBodyB() const { return m_rbB; } @@ -121,12 +119,10 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint } ///internal method used by the constraint solver, don't use them directly - void getInfo1 (unsigned int* info,const b3RigidBodyData* bodies); + void getInfo1(unsigned int* info, const b3RigidBodyData* bodies); ///internal method used by the constraint solver, don't use them directly - void getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies); - - + void getInfo2(b3GpuConstraintInfo2 * info, const b3RigidBodyData* bodies); }; -#endif //B3_GPU_GENERIC_CONSTRAINT_H
\ No newline at end of file +#endif //B3_GPU_GENERIC_CONSTRAINT_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp index 179dfc4f26..089fb1f6a6 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.cpp @@ -2,7 +2,7 @@ #include "b3GpuJacobiContactSolver.h" #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" #include "Bullet3Common/b3AlignedObjectArray.h" -#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 class b3Vector3; #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" @@ -15,89 +15,78 @@ class b3Vector3; #include "Bullet3Common/shared/b3Int4.h" #define SOLVER_UTILS_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl" - struct b3GpuJacobiSolverInternalData { - //btRadixSort32CL* m_sort32; - //btBoundSearchCL* m_search; - b3PrefixScanCL* m_scan; - - b3OpenCLArray<unsigned int>* m_bodyCount; - b3OpenCLArray<b3Int2>* m_contactConstraintOffsets; - b3OpenCLArray<unsigned int>* m_offsetSplitBodies; - - b3OpenCLArray<b3Vector3>* m_deltaLinearVelocities; - b3OpenCLArray<b3Vector3>* m_deltaAngularVelocities; - - b3AlignedObjectArray<b3Vector3> m_deltaLinearVelocitiesCPU; - b3AlignedObjectArray<b3Vector3> m_deltaAngularVelocitiesCPU; + //btRadixSort32CL* m_sort32; + //btBoundSearchCL* m_search; + b3PrefixScanCL* m_scan; + b3OpenCLArray<unsigned int>* m_bodyCount; + b3OpenCLArray<b3Int2>* m_contactConstraintOffsets; + b3OpenCLArray<unsigned int>* m_offsetSplitBodies; + b3OpenCLArray<b3Vector3>* m_deltaLinearVelocities; + b3OpenCLArray<b3Vector3>* m_deltaAngularVelocities; - b3OpenCLArray<b3GpuConstraint4>* m_contactConstraints; - - b3FillCL* m_filler; - - - cl_kernel m_countBodiesKernel; - cl_kernel m_contactToConstraintSplitKernel; - 
cl_kernel m_clearVelocitiesKernel; - cl_kernel m_averageVelocitiesKernel; - cl_kernel m_updateBodyVelocitiesKernel; - cl_kernel m_solveContactKernel; - cl_kernel m_solveFrictionKernel; + b3AlignedObjectArray<b3Vector3> m_deltaLinearVelocitiesCPU; + b3AlignedObjectArray<b3Vector3> m_deltaAngularVelocitiesCPU; + b3OpenCLArray<b3GpuConstraint4>* m_contactConstraints; + b3FillCL* m_filler; + cl_kernel m_countBodiesKernel; + cl_kernel m_contactToConstraintSplitKernel; + cl_kernel m_clearVelocitiesKernel; + cl_kernel m_averageVelocitiesKernel; + cl_kernel m_updateBodyVelocitiesKernel; + cl_kernel m_solveContactKernel; + cl_kernel m_solveFrictionKernel; }; - b3GpuJacobiContactSolver::b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity) - :m_context(ctx), - m_device(device), - m_queue(queue) + : m_context(ctx), + m_device(device), + m_queue(queue) { m_data = new b3GpuJacobiSolverInternalData; - m_data->m_scan = new b3PrefixScanCL(m_context,m_device,m_queue); - m_data->m_bodyCount = new b3OpenCLArray<unsigned int>(m_context,m_queue); - m_data->m_filler = new b3FillCL(m_context,m_device,m_queue); - m_data->m_contactConstraintOffsets = new b3OpenCLArray<b3Int2>(m_context,m_queue); - m_data->m_offsetSplitBodies = new b3OpenCLArray<unsigned int>(m_context,m_queue); - m_data->m_contactConstraints = new b3OpenCLArray<b3GpuConstraint4>(m_context,m_queue); - m_data->m_deltaLinearVelocities = new b3OpenCLArray<b3Vector3>(m_context,m_queue); - m_data->m_deltaAngularVelocities = new b3OpenCLArray<b3Vector3>(m_context,m_queue); + m_data->m_scan = new b3PrefixScanCL(m_context, m_device, m_queue); + m_data->m_bodyCount = new b3OpenCLArray<unsigned int>(m_context, m_queue); + m_data->m_filler = new b3FillCL(m_context, m_device, m_queue); + m_data->m_contactConstraintOffsets = new b3OpenCLArray<b3Int2>(m_context, m_queue); + m_data->m_offsetSplitBodies = new b3OpenCLArray<unsigned int>(m_context, m_queue); + m_data->m_contactConstraints = 
new b3OpenCLArray<b3GpuConstraint4>(m_context, m_queue); + m_data->m_deltaLinearVelocities = new b3OpenCLArray<b3Vector3>(m_context, m_queue); + m_data->m_deltaAngularVelocities = new b3OpenCLArray<b3Vector3>(m_context, m_queue); cl_int pErrNum; - const char* additionalMacros=""; + const char* additionalMacros = ""; const char* solverUtilsSource = solverUtilsCL; { - cl_program solverUtilsProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverUtilsSource, &pErrNum,additionalMacros, SOLVER_UTILS_KERNEL_PATH); + cl_program solverUtilsProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverUtilsSource, &pErrNum, additionalMacros, SOLVER_UTILS_KERNEL_PATH); b3Assert(solverUtilsProg); - m_data->m_countBodiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "CountBodiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + m_data->m_countBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "CountBodiesKernel", &pErrNum, solverUtilsProg, additionalMacros); b3Assert(m_data->m_countBodiesKernel); - m_data->m_contactToConstraintSplitKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "ContactToConstraintSplitKernel", &pErrNum, solverUtilsProg,additionalMacros ); + m_data->m_contactToConstraintSplitKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "ContactToConstraintSplitKernel", &pErrNum, solverUtilsProg, additionalMacros); b3Assert(m_data->m_contactToConstraintSplitKernel); - m_data->m_clearVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "ClearVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + m_data->m_clearVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "ClearVelocitiesKernel", &pErrNum, solverUtilsProg, additionalMacros); b3Assert(m_data->m_clearVelocitiesKernel); - m_data->m_averageVelocitiesKernel = 
b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "AverageVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + m_data->m_averageVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "AverageVelocitiesKernel", &pErrNum, solverUtilsProg, additionalMacros); b3Assert(m_data->m_averageVelocitiesKernel); - m_data->m_updateBodyVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "UpdateBodyVelocitiesKernel", &pErrNum, solverUtilsProg,additionalMacros ); + m_data->m_updateBodyVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "UpdateBodyVelocitiesKernel", &pErrNum, solverUtilsProg, additionalMacros); b3Assert(m_data->m_updateBodyVelocitiesKernel); - - m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "SolveContactJacobiKernel", &pErrNum, solverUtilsProg,additionalMacros ); - b3Assert(m_data->m_solveContactKernel ); + m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "SolveContactJacobiKernel", &pErrNum, solverUtilsProg, additionalMacros); + b3Assert(m_data->m_solveContactKernel); - m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverUtilsSource, "SolveFrictionJacobiKernel", &pErrNum, solverUtilsProg,additionalMacros ); + m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverUtilsSource, "SolveFrictionJacobiKernel", &pErrNum, solverUtilsProg, additionalMacros); b3Assert(m_data->m_solveFrictionKernel); } - } - b3GpuJacobiContactSolver::~b3GpuJacobiContactSolver() { clReleaseKernel(m_data->m_solveContactKernel); @@ -106,7 +95,7 @@ b3GpuJacobiContactSolver::~b3GpuJacobiContactSolver() clReleaseKernel(m_data->m_contactToConstraintSplitKernel); clReleaseKernel(m_data->m_averageVelocitiesKernel); 
clReleaseKernel(m_data->m_updateBodyVelocitiesKernel); - clReleaseKernel(m_data->m_clearVelocitiesKernel ); + clReleaseKernel(m_data->m_clearVelocitiesKernel); delete m_data->m_deltaLinearVelocities; delete m_data->m_deltaAngularVelocities; @@ -119,80 +108,70 @@ b3GpuJacobiContactSolver::~b3GpuJacobiContactSolver() delete m_data; } - - b3Vector3 make_float4(float v) { - return b3MakeVector3 (v,v,v); + return b3MakeVector3(v, v, v); } -b3Vector4 make_float4(float x,float y, float z, float w) +b3Vector4 make_float4(float x, float y, float z, float w) { - return b3MakeVector4 (x,y,z,w); + return b3MakeVector4(x, y, z, w); } - - static - inline - float calcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1, - const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1) - { - return b3Dot(l0, linVel0) + b3Dot(a0, angVel0) + b3Dot(l1, linVel1) + b3Dot(a1, angVel1); - } - - - static - inline - void setLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1, - b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1) - { - linear = n; - angular0 = b3Cross(r0, n); - angular1 = -b3Cross(r1, n); - } - - -static __inline void solveContact(b3GpuConstraint4& cs, - const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA, - const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB, - float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB) +static inline float calcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1, + const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1) { + return b3Dot(l0, linVel0) + b3Dot(a0, angVel0) + b3Dot(l1, linVel1) + b3Dot(a1, angVel1); +} +static inline 
void setLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1, + b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1) +{ + linear = n; + angular0 = b3Cross(r0, n); + angular1 = -b3Cross(r1, n); +} - for(int ic=0; ic<4; ic++) +static __inline void solveContact(b3GpuConstraint4& cs, + const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB) +{ + for (int ic = 0; ic < 4; ic++) { // dont necessary because this makes change to 0 - if( cs.m_jacCoeffInv[ic] == 0.f ) continue; + if (cs.m_jacCoeffInv[ic] == 0.f) continue; { b3Vector3 angular0, angular1, linear; b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA; b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB; - setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, linear, angular0, angular1 ); + setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, linear, angular0, angular1); - float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1, - linVelARO+dLinVelA, angVelARO+dAngVelA, linVelBRO+dLinVelB, angVelBRO+dAngVelB ) + cs.m_b[ic]; + float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1, + linVelARO + dLinVelA, angVelARO + dAngVelA, linVelBRO + dLinVelB, angVelBRO + dAngVelB) + + cs.m_b[ic]; rambdaDt *= cs.m_jacCoeffInv[ic]; { float prevSum = cs.m_appliedRambdaDt[ic]; float updated = prevSum; updated += rambdaDt; - updated = b3Max( updated, minRambdaDt[ic] ); - updated = b3Min( updated, maxRambdaDt[ic] ); + updated = b3Max(updated, minRambdaDt[ic]); + updated = 
b3Min(updated, maxRambdaDt[ic]); rambdaDt = updated - prevSum; cs.m_appliedRambdaDt[ic] = updated; } - b3Vector3 linImp0 = invMassA*linear*rambdaDt; - b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; - b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; - b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; + b3Vector3 linImp0 = invMassA * linear * rambdaDt; + b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt; + b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt; + b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt; #ifdef _WIN32 - b3Assert(_finite(linImp0.getX())); + b3Assert(_finite(linImp0.getX())); b3Assert(_finite(linImp1.getX())); #endif - + if (invMassA) { dLinVelA += linImp0; @@ -207,43 +186,42 @@ static __inline void solveContact(b3GpuConstraint4& cs, } } - - void solveContact3(b3GpuConstraint4* cs, - b3Vector3* posAPtr, b3Vector3* linVelA, b3Vector3* angVelA, float invMassA, const b3Matrix3x3& invInertiaA, - b3Vector3* posBPtr, b3Vector3* linVelB, b3Vector3* angVelB, float invMassB, const b3Matrix3x3& invInertiaB, - b3Vector3* dLinVelA, b3Vector3* dAngVelA, b3Vector3* dLinVelB, b3Vector3* dAngVelB) + b3Vector3* posAPtr, b3Vector3* linVelA, b3Vector3* angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + b3Vector3* posBPtr, b3Vector3* linVelB, b3Vector3* angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + b3Vector3* dLinVelA, b3Vector3* dAngVelA, b3Vector3* dLinVelB, b3Vector3* dAngVelB) { float minRambdaDt = 0; float maxRambdaDt = FLT_MAX; - for(int ic=0; ic<4; ic++) + for (int ic = 0; ic < 4; ic++) { - if( cs->m_jacCoeffInv[ic] == 0.f ) continue; + if (cs->m_jacCoeffInv[ic] == 0.f) continue; b3Vector3 angular0, angular1, linear; b3Vector3 r0 = cs->m_worldPos[ic] - *posAPtr; b3Vector3 r1 = cs->m_worldPos[ic] - *posBPtr; - setLinearAndAngular( cs->m_linear, r0, r1, linear, angular0, angular1 ); + setLinearAndAngular(cs->m_linear, r0, r1, linear, angular0, angular1); - float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, 
angular0, angular1, - *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic]; + float rambdaDt = calcRelVel(cs->m_linear, -cs->m_linear, angular0, angular1, + *linVelA + *dLinVelA, *angVelA + *dAngVelA, *linVelB + *dLinVelB, *angVelB + *dAngVelB) + + cs->m_b[ic]; rambdaDt *= cs->m_jacCoeffInv[ic]; { float prevSum = cs->m_appliedRambdaDt[ic]; float updated = prevSum; updated += rambdaDt; - updated = b3Max( updated, minRambdaDt ); - updated = b3Min( updated, maxRambdaDt ); + updated = b3Max(updated, minRambdaDt); + updated = b3Min(updated, maxRambdaDt); rambdaDt = updated - prevSum; cs->m_appliedRambdaDt[ic] = updated; } - b3Vector3 linImp0 = invMassA*linear*rambdaDt; - b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; - b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; - b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; + b3Vector3 linImp0 = invMassA * linear * rambdaDt; + b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt; + b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt; + b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt; if (invMassA) { @@ -258,58 +236,56 @@ void solveContact3(b3GpuConstraint4* cs, } } - -static inline void solveFriction(b3GpuConstraint4& cs, - const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA, - const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB, - float maxRambdaDt[4], float minRambdaDt[4], b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB) +static inline void solveFriction(b3GpuConstraint4& cs, + const b3Vector3& posA, const b3Vector3& linVelARO, const b3Vector3& angVelARO, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, const b3Vector3& linVelBRO, const b3Vector3& angVelBRO, float invMassB, const b3Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4], 
b3Vector3& dLinVelA, b3Vector3& dAngVelA, b3Vector3& dLinVelB, b3Vector3& dAngVelB) { + b3Vector3 linVelA = linVelARO + dLinVelA; + b3Vector3 linVelB = linVelBRO + dLinVelB; + b3Vector3 angVelA = angVelARO + dAngVelA; + b3Vector3 angVelB = angVelBRO + dAngVelB; - b3Vector3 linVelA = linVelARO+dLinVelA; - b3Vector3 linVelB = linVelBRO+dLinVelB; - b3Vector3 angVelA = angVelARO+dAngVelA; - b3Vector3 angVelB = angVelBRO+dAngVelB; - - if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return; + if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return; const b3Vector3& center = (const b3Vector3&)cs.m_center; b3Vector3 n = -(const b3Vector3&)cs.m_linear; b3Vector3 tangent[2]; -#if 1 - b3PlaneSpace1 (n, tangent[0],tangent[1]); +#if 1 + b3PlaneSpace1(n, tangent[0], tangent[1]); #else - b3Vector3 r = cs.m_worldPos[0]-center; - tangent[0] = cross3( n, r ); - tangent[1] = cross3( tangent[0], n ); - tangent[0] = normalize3( tangent[0] ); - tangent[1] = normalize3( tangent[1] ); + b3Vector3 r = cs.m_worldPos[0] - center; + tangent[0] = cross3(n, r); + tangent[1] = cross3(tangent[0], n); + tangent[0] = normalize3(tangent[0]); + tangent[1] = normalize3(tangent[1]); #endif b3Vector3 angular0, angular1, linear; b3Vector3 r0 = center - posA; b3Vector3 r1 = center - posB; - for(int i=0; i<2; i++) + for (int i = 0; i < 2; i++) { - setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 ); + setLinearAndAngular(tangent[i], r0, r1, linear, angular0, angular1); float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, - linVelA, angVelA, linVelB, angVelB ); + linVelA, angVelA, linVelB, angVelB); rambdaDt *= cs.m_fJacCoeffInv[i]; - { - float prevSum = cs.m_fAppliedRambdaDt[i]; - float updated = prevSum; - updated += rambdaDt; - updated = b3Max( updated, minRambdaDt[i] ); - updated = b3Min( updated, maxRambdaDt[i] ); - rambdaDt = updated - prevSum; - cs.m_fAppliedRambdaDt[i] = updated; - } + { + float prevSum = cs.m_fAppliedRambdaDt[i]; + float updated 
= prevSum; + updated += rambdaDt; + updated = b3Max(updated, minRambdaDt[i]); + updated = b3Min(updated, maxRambdaDt[i]); + rambdaDt = updated - prevSum; + cs.m_fAppliedRambdaDt[i] = updated; + } - b3Vector3 linImp0 = invMassA*linear*rambdaDt; - b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; - b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; - b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; + b3Vector3 linImp0 = invMassA * linear * rambdaDt; + b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt; + b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt; + b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt; #ifdef _WIN32 b3Assert(_finite(linImp0.getX())); b3Assert(_finite(linImp1.getX())); @@ -326,65 +302,58 @@ static inline void solveFriction(b3GpuConstraint4& cs, } } - { // angular damping for point constraint - b3Vector3 ab = ( posB - posA ).normalized(); - b3Vector3 ac = ( center - posA ).normalized(); - if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { // angular damping for point constraint + b3Vector3 ab = (posB - posA).normalized(); + b3Vector3 ac = (center - posA).normalized(); + if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) { - float angNA = b3Dot( n, angVelA ); - float angNB = b3Dot( n, angVelB ); + float angNA = b3Dot(n, angVelA); + float angNB = b3Dot(n, angVelB); if (invMassA) - dAngVelA -= (angNA*0.1f)*n; + dAngVelA -= (angNA * 0.1f) * n; if (invMassB) - dAngVelB -= (angNB*0.1f)*n; + dAngVelB -= (angNB * 0.1f) * n; } } - } - - - float calcJacCoeff(const b3Vector3& linear0, const b3Vector3& linear1, const b3Vector3& angular0, const b3Vector3& angular1, - float invMass0, const b3Matrix3x3* invInertia0, float invMass1, const b3Matrix3x3* invInertia1, float countA, float countB) + float invMass0, const b3Matrix3x3* invInertia0, float invMass1, const b3Matrix3x3* invInertia1, float countA, float countB) { // linear0,1 are normlized - float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; - - 
float jmj1 = b3Dot(mtMul3(angular0,*invInertia0), angular0); - float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; - float jmj3 = b3Dot(mtMul3(angular1,*invInertia1), angular1); - return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB); -// return -1.f/((jmj0+jmj1)+(jmj2+jmj3)); + float jmj0 = invMass0; //dot3F4(linear0, linear0)*invMass0; + float jmj1 = b3Dot(mtMul3(angular0, *invInertia0), angular0); + float jmj2 = invMass1; //dot3F4(linear1, linear1)*invMass1; + float jmj3 = b3Dot(mtMul3(angular1, *invInertia1), angular1); + return -1.f / ((jmj0 + jmj1) * countA + (jmj2 + jmj3) * countB); + // return -1.f/((jmj0+jmj1)+(jmj2+jmj3)); } - -void setConstraint4( const b3Vector3& posA, const b3Vector3& linVelA, const b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, - const b3Vector3& posB, const b3Vector3& linVelB, const b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, - b3Contact4* src, float dt, float positionDrift, float positionConstraintCoeff, float countA, float countB, - b3GpuConstraint4* dstC ) +void setConstraint4(const b3Vector3& posA, const b3Vector3& linVelA, const b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, const b3Vector3& linVelB, const b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + b3Contact4* src, float dt, float positionDrift, float positionConstraintCoeff, float countA, float countB, + b3GpuConstraint4* dstC) { dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit); - float dtInv = 1.f/dt; - for(int ic=0; ic<4; ic++) + float dtInv = 1.f / dt; + for (int ic = 0; ic < 4; ic++) { dstC->m_appliedRambdaDt[ic] = 0.f; } dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; - dstC->m_linear = src->m_worldNormalOnB; - dstC->m_linear[3] = 0.7f ;//src->getFrictionCoeff() ); - for(int ic=0; ic<4; ic++) + dstC->m_linear[3] = 0.7f; //src->getFrictionCoeff() ); + for (int ic = 0; ic < 4; ic++) { b3Vector3 r0 = 
src->m_worldPosB[ic] - posA; b3Vector3 r1 = src->m_worldPosB[ic] - posB; - if( ic >= src->m_worldNormalOnB[3] )//npoints + if (ic >= src->m_worldNormalOnB[3]) //npoints { dstC->m_jacCoeffInv[ic] = 0.f; continue; @@ -396,53 +365,53 @@ void setConstraint4( const b3Vector3& posA, const b3Vector3& linVelA, const b3Ve setLinearAndAngular(src->m_worldNormalOnB, r0, r1, linear, angular0, angular1); dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1, - invMassA, &invInertiaA, invMassB, &invInertiaB ,countA,countB); + invMassA, &invInertiaA, invMassB, &invInertiaB, countA, countB); relVelN = calcRelVel(linear, -linear, angular0, angular1, - linVelA, angVelA, linVelB, angVelB); + linVelA, angVelA, linVelB, angVelB); - float e = 0.f;//src->getRestituitionCoeff(); - if( relVelN*relVelN < 0.004f ) + float e = 0.f; //src->getRestituitionCoeff(); + if (relVelN * relVelN < 0.004f) { e = 0.f; } - dstC->m_b[ic] = e*relVelN; + dstC->m_b[ic] = e * relVelN; //float penetration = src->m_worldPos[ic].w; - dstC->m_b[ic] += (src->m_worldPosB[ic][3] + positionDrift)*positionConstraintCoeff*dtInv; + dstC->m_b[ic] += (src->m_worldPosB[ic][3] + positionDrift) * positionConstraintCoeff * dtInv; dstC->m_appliedRambdaDt[ic] = 0.f; } } - if( src->m_worldNormalOnB[3] > 0 )//npoints - { // prepare friction + if (src->m_worldNormalOnB[3] > 0) //npoints + { // prepare friction b3Vector3 center = make_float4(0.f); - for(int i=0; i<src->m_worldNormalOnB[3]; i++) + for (int i = 0; i < src->m_worldNormalOnB[3]; i++) center += src->m_worldPosB[i]; center /= (float)src->m_worldNormalOnB[3]; b3Vector3 tangent[2]; - b3PlaneSpace1(src->m_worldNormalOnB,tangent[0],tangent[1]); - + b3PlaneSpace1(src->m_worldNormalOnB, tangent[0], tangent[1]); + b3Vector3 r[2]; r[0] = center - posA; r[1] = center - posB; - for(int i=0; i<2; i++) + for (int i = 0; i < 2; i++) { b3Vector3 linear, angular0, angular1; setLinearAndAngular(tangent[i], r[0], r[1], linear, angular0, angular1); 
dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1, - invMassA, &invInertiaA, invMassB, &invInertiaB ,countA,countB); + invMassA, &invInertiaA, invMassB, &invInertiaB, countA, countB); dstC->m_fAppliedRambdaDt[i] = 0.f; } dstC->m_center = center; } - for(int i=0; i<4; i++) + for (int i = 0; i < 4; i++) { - if( i<src->m_worldNormalOnB[3] ) + if (i < src->m_worldNormalOnB[3]) { dstC->m_worldPos[i] = src->m_worldPosB[i]; } @@ -453,17 +422,14 @@ void setConstraint4( const b3Vector3& posA, const b3Vector3& linVelA, const b3Ve } } - - void ContactToConstraintKernel(b3Contact4* gContact, b3RigidBodyData* gBodies, b3InertiaData* gShapes, b3GpuConstraint4* gConstraintOut, int nContacts, -float dt, -float positionDrift, -float positionConstraintCoeff, int gIdx, b3AlignedObjectArray<unsigned int>& bodyCount -) + float dt, + float positionDrift, + float positionConstraintCoeff, int gIdx, b3AlignedObjectArray<unsigned int>& bodyCount) { //int gIdx = 0;//GET_GLOBAL_IDX; - - if( gIdx < nContacts ) + + if (gIdx < nContacts) { int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); @@ -472,50 +438,46 @@ float positionConstraintCoeff, int gIdx, b3AlignedObjectArray<unsigned int>& bod b3Vector3 linVelA = gBodies[aIdx].m_linVel; b3Vector3 angVelA = gBodies[aIdx].m_angVel; float invMassA = gBodies[aIdx].m_invMass; - b3Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertiaWorld;//.m_invInertia; + b3Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertiaWorld; //.m_invInertia; b3Vector3 posB = gBodies[bIdx].m_pos; b3Vector3 linVelB = gBodies[bIdx].m_linVel; b3Vector3 angVelB = gBodies[bIdx].m_angVel; float invMassB = gBodies[bIdx].m_invMass; - b3Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertiaWorld;//m_invInertia; + b3Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertiaWorld; //m_invInertia; b3GpuConstraint4 cs; float countA = invMassA ? (float)(bodyCount[aIdx]) : 1; float countB = invMassB ? 
(float)(bodyCount[bIdx]) : 1; - setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, - &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB, - &cs ); - + setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], dt, positionDrift, positionConstraintCoeff, countA, countB, + &cs); - cs.m_batchIdx = gContact[gIdx].m_batchIdx; gConstraintOut[gIdx] = cs; } } - -void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo) +void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, b3Contact4* manifoldPtr, int numManifolds, const b3JacobiSolverInfo& solverInfo) { B3_PROFILE("b3GpuJacobiContactSolver::solveGroup"); b3AlignedObjectArray<unsigned int> bodyCount; bodyCount.resize(numBodies); - for (int i=0;i<numBodies;i++) + for (int i = 0; i < numBodies; i++) bodyCount[i] = 0; b3AlignedObjectArray<b3Int2> contactConstraintOffsets; contactConstraintOffsets.resize(numManifolds); - - for (int i=0;i<numManifolds;i++) + for (int i = 0; i < numManifolds; i++) { int pa = manifoldPtr[i].m_bodyAPtrAndSignBit; int pb = manifoldPtr[i].m_bodyBPtrAndSignBit; - bool isFixedA = (pa <0) || (pa == solverInfo.m_fixedBodyIndex); - bool isFixedB = (pb <0) || (pb == solverInfo.m_fixedBodyIndex); + bool isFixedA = (pa < 0) || (pa == solverInfo.m_fixedBodyIndex); + bool isFixedB = (pb < 0) || (pb == solverInfo.m_fixedBodyIndex); int bodyIndexA = manifoldPtr[i].getBodyA(); int bodyIndexB = manifoldPtr[i].getBodyB(); @@ -529,71 +491,63 @@ void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaD { contactConstraintOffsets[i].y = bodyCount[bodyIndexB]; bodyCount[bodyIndexB]++; - } + } } b3AlignedObjectArray<unsigned int> offsetSplitBodies; 
offsetSplitBodies.resize(numBodies); unsigned int totalNumSplitBodies; - m_data->m_scan->executeHost(bodyCount,offsetSplitBodies,numBodies,&totalNumSplitBodies); - int numlastBody = bodyCount[numBodies-1]; + m_data->m_scan->executeHost(bodyCount, offsetSplitBodies, numBodies, &totalNumSplitBodies); + int numlastBody = bodyCount[numBodies - 1]; totalNumSplitBodies += numlastBody; - printf("totalNumSplitBodies = %d\n",totalNumSplitBodies); - - - - + printf("totalNumSplitBodies = %d\n", totalNumSplitBodies); b3AlignedObjectArray<b3GpuConstraint4> contactConstraints; contactConstraints.resize(numManifolds); - for (int i=0;i<numManifolds;i++) + for (int i = 0; i < numManifolds; i++) { - ContactToConstraintKernel(&manifoldPtr[0],bodies,inertias,&contactConstraints[0],numManifolds, - solverInfo.m_deltaTime, - solverInfo.m_positionDrift, - solverInfo.m_positionConstraintCoeff, - i, bodyCount); + ContactToConstraintKernel(&manifoldPtr[0], bodies, inertias, &contactConstraints[0], numManifolds, + solverInfo.m_deltaTime, + solverInfo.m_positionDrift, + solverInfo.m_positionConstraintCoeff, + i, bodyCount); } int maxIter = solverInfo.m_numIterations; - b3AlignedObjectArray<b3Vector3> deltaLinearVelocities; b3AlignedObjectArray<b3Vector3> deltaAngularVelocities; deltaLinearVelocities.resize(totalNumSplitBodies); deltaAngularVelocities.resize(totalNumSplitBodies); - for (unsigned int i=0;i<totalNumSplitBodies;i++) + for (unsigned int i = 0; i < totalNumSplitBodies; i++) { deltaLinearVelocities[i].setZero(); deltaAngularVelocities[i].setZero(); } - - - for (int iter = 0;iter<maxIter;iter++) + for (int iter = 0; iter < maxIter; iter++) { - int i=0; - for( i=0; i<numManifolds; i++) + int i = 0; + for (i = 0; i < numManifolds; i++) { - //float frictionCoeff = contactConstraints[i].getFrictionCoeff(); int aIdx = (int)contactConstraints[i].m_bodyA; int bIdx = (int)contactConstraints[i].m_bodyB; b3RigidBodyData& bodyA = bodies[aIdx]; b3RigidBodyData& bodyB = bodies[bIdx]; - b3Vector3 
zero = b3MakeVector3(0,0,0); - - b3Vector3* dlvAPtr=&zero; - b3Vector3* davAPtr=&zero; - b3Vector3* dlvBPtr=&zero; - b3Vector3* davBPtr=&zero; - + b3Vector3 zero = b3MakeVector3(0, 0, 0); + + b3Vector3* dlvAPtr = &zero; + b3Vector3* davAPtr = &zero; + b3Vector3* dlvBPtr = &zero; + b3Vector3* davBPtr = &zero; + if (bodyA.m_invMass) { int bodyOffsetA = offsetSplitBodies[aIdx]; int constraintOffsetA = contactConstraintOffsets[i].x; - int splitIndexA = bodyOffsetA+constraintOffsetA; + int splitIndexA = bodyOffsetA + constraintOffsetA; dlvAPtr = &deltaLinearVelocities[splitIndexA]; davAPtr = &deltaAngularVelocities[splitIndexA]; } @@ -602,67 +556,61 @@ void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaD { int bodyOffsetB = offsetSplitBodies[bIdx]; int constraintOffsetB = contactConstraintOffsets[i].y; - int splitIndexB= bodyOffsetB+constraintOffsetB; - dlvBPtr =&deltaLinearVelocities[splitIndexB]; + int splitIndexB = bodyOffsetB + constraintOffsetB; + dlvBPtr = &deltaLinearVelocities[splitIndexB]; davBPtr = &deltaAngularVelocities[splitIndexB]; } - - { - float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; - - solveContact( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertiaWorld, - (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld, - maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr ); - + float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; + float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f}; + solveContact(contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld, + 
maxRambdaDt, minRambdaDt, *dlvAPtr, *davAPtr, *dlvBPtr, *davBPtr); } - } - //easy - for (int i=0;i<numBodies;i++) + for (int i = 0; i < numBodies; i++) { if (bodies[i].m_invMass) { int bodyOffset = offsetSplitBodies[i]; int count = bodyCount[i]; - float factor = 1.f/float(count); + float factor = 1.f / float(count); b3Vector3 averageLinVel; averageLinVel.setZero(); b3Vector3 averageAngVel; averageAngVel.setZero(); - for (int j=0;j<count;j++) + for (int j = 0; j < count; j++) { - averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; - averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + averageLinVel += deltaLinearVelocities[bodyOffset + j] * factor; + averageAngVel += deltaAngularVelocities[bodyOffset + j] * factor; } - for (int j=0;j<count;j++) + for (int j = 0; j < count; j++) { - deltaLinearVelocities[bodyOffset+j] = averageLinVel; - deltaAngularVelocities[bodyOffset+j] = averageAngVel; + deltaLinearVelocities[bodyOffset + j] = averageLinVel; + deltaAngularVelocities[bodyOffset + j] = averageAngVel; } } } } - for (int iter = 0;iter<maxIter;iter++) + for (int iter = 0; iter < maxIter; iter++) { //int i=0; - + //solve friction - for(int i=0; i<numManifolds; i++) + for (int i = 0; i < numManifolds; i++) { - float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; + float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f}; float sum = 0; - for(int j=0; j<4; j++) + for (int j = 0; j < 4; j++) { - sum +=contactConstraints[i].m_appliedRambdaDt[j]; + sum += contactConstraints[i].m_appliedRambdaDt[j]; } float frictionCoeff = contactConstraints[i].getFrictionCoeff(); int aIdx = (int)contactConstraints[i].m_bodyA; @@ -670,18 +618,18 @@ void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaD b3RigidBodyData& bodyA = bodies[aIdx]; b3RigidBodyData& bodyB = bodies[bIdx]; - b3Vector3 zero = b3MakeVector3(0,0,0); - - b3Vector3* dlvAPtr=&zero; 
- b3Vector3* davAPtr=&zero; - b3Vector3* dlvBPtr=&zero; - b3Vector3* davBPtr=&zero; - + b3Vector3 zero = b3MakeVector3(0, 0, 0); + + b3Vector3* dlvAPtr = &zero; + b3Vector3* davAPtr = &zero; + b3Vector3* dlvBPtr = &zero; + b3Vector3* davBPtr = &zero; + if (bodyA.m_invMass) { int bodyOffsetA = offsetSplitBodies[aIdx]; int constraintOffsetA = contactConstraintOffsets[i].x; - int splitIndexA = bodyOffsetA+constraintOffsetA; + int splitIndexA = bodyOffsetA + constraintOffsetA; dlvAPtr = &deltaLinearVelocities[splitIndexA]; davAPtr = &deltaAngularVelocities[splitIndexA]; } @@ -690,55 +638,50 @@ void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaD { int bodyOffsetB = offsetSplitBodies[bIdx]; int constraintOffsetB = contactConstraintOffsets[i].y; - int splitIndexB= bodyOffsetB+constraintOffsetB; - dlvBPtr =&deltaLinearVelocities[splitIndexB]; + int splitIndexB = bodyOffsetB + constraintOffsetB; + dlvBPtr = &deltaLinearVelocities[splitIndexB]; davBPtr = &deltaAngularVelocities[splitIndexB]; } - for(int j=0; j<4; j++) + for (int j = 0; j < 4; j++) { - maxRambdaDt[j] = frictionCoeff*sum; + maxRambdaDt[j] = frictionCoeff * sum; minRambdaDt[j] = -maxRambdaDt[j]; } - solveFriction( contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,inertias[aIdx].m_invInertiaWorld, - (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld, - maxRambdaDt, minRambdaDt , *dlvAPtr,*davAPtr,*dlvBPtr,*davBPtr); - + solveFriction(contactConstraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt, *dlvAPtr, *davAPtr, *dlvBPtr, *davBPtr); } //easy - for (int i=0;i<numBodies;i++) + for (int 
i = 0; i < numBodies; i++) { if (bodies[i].m_invMass) { int bodyOffset = offsetSplitBodies[i]; int count = bodyCount[i]; - float factor = 1.f/float(count); + float factor = 1.f / float(count); b3Vector3 averageLinVel; averageLinVel.setZero(); b3Vector3 averageAngVel; averageAngVel.setZero(); - for (int j=0;j<count;j++) + for (int j = 0; j < count; j++) { - averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; - averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + averageLinVel += deltaLinearVelocities[bodyOffset + j] * factor; + averageAngVel += deltaAngularVelocities[bodyOffset + j] * factor; } - for (int j=0;j<count;j++) + for (int j = 0; j < count; j++) { - deltaLinearVelocities[bodyOffset+j] = averageLinVel; - deltaAngularVelocities[bodyOffset+j] = averageAngVel; + deltaLinearVelocities[bodyOffset + j] = averageLinVel; + deltaAngularVelocities[bodyOffset + j] = averageAngVel; } } } - - - } - //easy - for (int i=0;i<numBodies;i++) + for (int i = 0; i < numBodies; i++) { if (bodies[i].m_invMass) { @@ -753,8 +696,6 @@ void b3GpuJacobiContactSolver::solveGroupHost(b3RigidBodyData* bodies,b3InertiaD } } - - void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index) // // @@ -762,49 +703,47 @@ void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_m { b3JacobiSolverInfo solverInfo; solverInfo.m_fixedBodyIndex = static0Index; - B3_PROFILE("b3GpuJacobiContactSolver::solveGroup"); //int numBodies = bodies->size(); - int numManifolds = numContacts;//manifoldPtr->size(); + int numManifolds = numContacts; //manifoldPtr->size(); { B3_PROFILE("resize"); m_data->m_bodyCount->resize(numBodies); } - - unsigned int val=0; + + unsigned int val = 0; b3Int2 val2; - val2.x=0; - val2.y=0; + val2.x = 0; + val2.y = 0; - { + { B3_PROFILE("m_filler"); m_data->m_contactConstraintOffsets->resize(numManifolds); - 
m_data->m_filler->execute(*m_data->m_bodyCount,val,numBodies); - - - m_data->m_filler->execute(*m_data->m_contactConstraintOffsets,val2,numManifolds); + m_data->m_filler->execute(*m_data->m_bodyCount, val, numBodies); + + m_data->m_filler->execute(*m_data->m_contactConstraintOffsets, val2, numManifolds); } { B3_PROFILE("m_countBodiesKernel"); - b3LauncherCL launcher(this->m_queue,m_data->m_countBodiesKernel,"m_countBodiesKernel"); - launcher.setBuffer(contactBuf);//manifoldPtr->getBufferCL()); + b3LauncherCL launcher(this->m_queue, m_data->m_countBodiesKernel, "m_countBodiesKernel"); + launcher.setBuffer(contactBuf); //manifoldPtr->getBufferCL()); launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); launcher.setBuffer(m_data->m_contactConstraintOffsets->getBufferCL()); launcher.setConst(numManifolds); launcher.setConst(solverInfo.m_fixedBodyIndex); launcher.launch1D(numManifolds); } - unsigned int totalNumSplitBodies=0; + unsigned int totalNumSplitBodies = 0; { B3_PROFILE("m_scan->execute"); - + m_data->m_offsetSplitBodies->resize(numBodies); - m_data->m_scan->execute(*m_data->m_bodyCount,*m_data->m_offsetSplitBodies,numBodies,&totalNumSplitBodies); - totalNumSplitBodies+=m_data->m_bodyCount->at(numBodies-1); + m_data->m_scan->execute(*m_data->m_bodyCount, *m_data->m_offsetSplitBodies, numBodies, &totalNumSplitBodies); + totalNumSplitBodies += m_data->m_bodyCount->at(numBodies - 1); } { @@ -812,50 +751,45 @@ void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_m //int numContacts = manifoldPtr->size(); m_data->m_contactConstraints->resize(numContacts); } - + { B3_PROFILE("contactToConstraintSplitKernel"); - b3LauncherCL launcher( m_queue, m_data->m_contactToConstraintSplitKernel,"m_contactToConstraintSplitKernel"); + b3LauncherCL launcher(m_queue, m_data->m_contactToConstraintSplitKernel, "m_contactToConstraintSplitKernel"); launcher.setBuffer(contactBuf); launcher.setBuffer(bodyBuf); launcher.setBuffer(inertiaBuf); 
launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); - launcher.setConst(numContacts); + launcher.setConst(numContacts); launcher.setConst(solverInfo.m_deltaTime); launcher.setConst(solverInfo.m_positionDrift); launcher.setConst(solverInfo.m_positionConstraintCoeff); - launcher.launch1D( numContacts, 64 ); - + launcher.launch1D(numContacts, 64); } - { B3_PROFILE("m_data->m_deltaLinearVelocities->resize"); m_data->m_deltaLinearVelocities->resize(totalNumSplitBodies); m_data->m_deltaAngularVelocities->resize(totalNumSplitBodies); } - - { B3_PROFILE("m_clearVelocitiesKernel"); - b3LauncherCL launch(m_queue,m_data->m_clearVelocitiesKernel,"m_clearVelocitiesKernel"); + b3LauncherCL launch(m_queue, m_data->m_clearVelocitiesKernel, "m_clearVelocitiesKernel"); launch.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); launch.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); launch.setConst(totalNumSplitBodies); launch.launch1D(totalNumSplitBodies); clFinish(m_queue); } - - + int maxIter = solverInfo.m_numIterations; - for (int iter = 0;iter<maxIter;iter++) + for (int iter = 0; iter < maxIter; iter++) { { B3_PROFILE("m_solveContactKernel"); - b3LauncherCL launcher( m_queue, m_data->m_solveContactKernel,"m_solveContactKernel" ); + b3LauncherCL launcher(m_queue, m_data->m_solveContactKernel, "m_solveContactKernel"); launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); launcher.setBuffer(bodyBuf); launcher.setBuffer(inertiaBuf); @@ -873,11 +807,9 @@ void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_m clFinish(m_queue); } - - { B3_PROFILE("average velocities"); - b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel,"m_averageVelocitiesKernel"); + b3LauncherCL launcher(m_queue, m_data->m_averageVelocitiesKernel, "m_averageVelocitiesKernel"); launcher.setBuffer(bodyBuf); launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); 
launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); @@ -888,10 +820,9 @@ void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_m clFinish(m_queue); } - { B3_PROFILE("m_solveFrictionKernel"); - b3LauncherCL launcher( m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel"); + b3LauncherCL launcher(m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel"); launcher.setBuffer(m_data->m_contactConstraints->getBufferCL()); launcher.setBuffer(bodyBuf); launcher.setBuffer(inertiaBuf); @@ -909,10 +840,9 @@ void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_m clFinish(m_queue); } - { B3_PROFILE("average velocities"); - b3LauncherCL launcher( m_queue, m_data->m_averageVelocitiesKernel,"m_averageVelocitiesKernel"); + b3LauncherCL launcher(m_queue, m_data->m_averageVelocitiesKernel, "m_averageVelocitiesKernel"); launcher.setBuffer(bodyBuf); launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); @@ -922,27 +852,20 @@ void b3GpuJacobiContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_m launcher.launch1D(numBodies); clFinish(m_queue); } - - - } - { - B3_PROFILE("update body velocities"); - b3LauncherCL launcher( m_queue, m_data->m_updateBodyVelocitiesKernel,"m_updateBodyVelocitiesKernel"); - launcher.setBuffer(bodyBuf); - launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); - launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); - launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); - launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); - launcher.setConst(numBodies); - launcher.launch1D(numBodies); - clFinish(m_queue); - } - - - + B3_PROFILE("update body velocities"); + b3LauncherCL launcher(m_queue, m_data->m_updateBodyVelocitiesKernel, "m_updateBodyVelocitiesKernel"); + launcher.setBuffer(bodyBuf); + launcher.setBuffer(m_data->m_offsetSplitBodies->getBufferCL()); + 
launcher.setBuffer(m_data->m_bodyCount->getBufferCL()); + launcher.setBuffer(m_data->m_deltaLinearVelocities->getBufferCL()); + launcher.setBuffer(m_data->m_deltaAngularVelocities->getBufferCL()); + launcher.setConst(numBodies); + launcher.launch1D(numBodies); + clFinish(m_queue); + } } #if 0 diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h index b418f29ec4..8281aee05d 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuJacobiContactSolver.h @@ -8,7 +8,6 @@ #include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" - //struct b3InertiaData; //b3InertiaData @@ -21,21 +20,20 @@ struct b3JacobiSolverInfo float m_deltaTime; float m_positionDrift; float m_positionConstraintCoeff; - int m_numIterations; + int m_numIterations; b3JacobiSolverInfo() - :m_fixedBodyIndex(0), - m_deltaTime(1./60.f), - m_positionDrift( 0.005f ), - m_positionConstraintCoeff( 0.99f ), - m_numIterations(7) + : m_fixedBodyIndex(0), + m_deltaTime(1. 
/ 60.f), + m_positionDrift(0.005f), + m_positionConstraintCoeff(0.99f), + m_numIterations(7) { } }; class b3GpuJacobiContactSolver { protected: - struct b3GpuJacobiSolverInternalData* m_data; cl_context m_context; @@ -43,20 +41,16 @@ protected: cl_command_queue m_queue; public: - b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity); virtual ~b3GpuJacobiContactSolver(); - void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index); - void solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,struct b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo); + void solveGroupHost(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, struct b3Contact4* manifoldPtr, int numManifolds, const b3JacobiSolverInfo& solverInfo); //void solveGroupHost(btRigidBodyCL* bodies,b3InertiaData* inertias,int numBodies,btContact4* manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btJacobiSolverInfo& solverInfo); //b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); //void solveGroup(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo); //void solveGroupMixed(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo); - }; -#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H - +#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp index 698fa15f96..2e4f6c1572 100644 --- 
a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.cpp @@ -1,6 +1,5 @@ #include "b3GpuNarrowPhase.h" - #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h" #include "Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h" @@ -16,107 +15,87 @@ #include "Bullet3OpenCL/NarrowphaseCollision/b3QuantizedBvh.h" #include "Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h" - - - b3GpuNarrowPhase::b3GpuNarrowPhase(cl_context ctx, cl_device_id device, cl_command_queue queue, const b3Config& config) -:m_data(0) ,m_planeBodyIndex(-1),m_static0Index(-1), -m_context(ctx), -m_device(device), -m_queue(queue) + : m_data(0), m_planeBodyIndex(-1), m_static0Index(-1), m_context(ctx), m_device(device), m_queue(queue) { - m_data = new b3GpuNarrowPhaseInternalData(); m_data->m_currentContactBuffer = 0; - memset(m_data,0,sizeof(b3GpuNarrowPhaseInternalData)); - + memset(m_data, 0, sizeof(b3GpuNarrowPhaseInternalData)); m_data->m_config = config; - - m_data->m_gpuSatCollision = new GpuSatCollision(ctx,device,queue); - - - m_data->m_triangleConvexPairs = new b3OpenCLArray<b3Int4>(m_context,m_queue, config.m_maxTriConvexPairCapacity); + m_data->m_gpuSatCollision = new GpuSatCollision(ctx, device, queue); + + m_data->m_triangleConvexPairs = new b3OpenCLArray<b3Int4>(m_context, m_queue, config.m_maxTriConvexPairCapacity); //m_data->m_convexPairsOutGPU = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false); //m_data->m_planePairs = new b3OpenCLArray<b3Int2>(ctx,queue,config.m_maxBroadphasePairs,false); - + m_data->m_pBufContactOutCPU = new b3AlignedObjectArray<b3Contact4>(); m_data->m_pBufContactOutCPU->resize(config.m_maxBroadphasePairs); m_data->m_bodyBufferCPU = new b3AlignedObjectArray<b3RigidBodyData>(); m_data->m_bodyBufferCPU->resize(config.m_maxConvexBodies); - + m_data->m_inertiaBufferCPU = new 
b3AlignedObjectArray<b3InertiaData>(); m_data->m_inertiaBufferCPU->resize(config.m_maxConvexBodies); - - m_data->m_pBufContactBuffersGPU[0] = new b3OpenCLArray<b3Contact4>(ctx,queue, config.m_maxContactCapacity,true); - m_data->m_pBufContactBuffersGPU[1] = new b3OpenCLArray<b3Contact4>(ctx,queue, config.m_maxContactCapacity,true); - - m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx,queue,config.m_maxConvexBodies,false); - m_data->m_collidablesGPU = new b3OpenCLArray<b3Collidable>(ctx,queue,config.m_maxConvexShapes); + + m_data->m_pBufContactBuffersGPU[0] = new b3OpenCLArray<b3Contact4>(ctx, queue, config.m_maxContactCapacity, true); + m_data->m_pBufContactBuffersGPU[1] = new b3OpenCLArray<b3Contact4>(ctx, queue, config.m_maxContactCapacity, true); + + m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, queue, config.m_maxConvexBodies, false); + m_data->m_collidablesGPU = new b3OpenCLArray<b3Collidable>(ctx, queue, config.m_maxConvexShapes); m_data->m_collidablesCPU.reserve(config.m_maxConvexShapes); m_data->m_localShapeAABBCPU = new b3AlignedObjectArray<b3SapAabb>; - m_data->m_localShapeAABBGPU = new b3OpenCLArray<b3SapAabb>(ctx,queue,config.m_maxConvexShapes); - - + m_data->m_localShapeAABBGPU = new b3OpenCLArray<b3SapAabb>(ctx, queue, config.m_maxConvexShapes); + //m_data->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate(ctx,queue, config.m_maxBroadphasePairs,false); - m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx,queue, config.m_maxConvexBodies,false); + m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, queue, config.m_maxConvexBodies, false); - m_data->m_convexFacesGPU = new b3OpenCLArray<b3GpuFace>(ctx,queue,config.m_maxConvexShapes*config.m_maxFacesPerShape,false); - m_data->m_convexFaces.reserve(config.m_maxConvexShapes*config.m_maxFacesPerShape); + m_data->m_convexFacesGPU = new b3OpenCLArray<b3GpuFace>(ctx, queue, config.m_maxConvexShapes * config.m_maxFacesPerShape, false); + 
m_data->m_convexFaces.reserve(config.m_maxConvexShapes * config.m_maxFacesPerShape); - m_data->m_gpuChildShapes = new b3OpenCLArray<b3GpuChildShape>(ctx,queue,config.m_maxCompoundChildShapes,false); - - m_data->m_convexPolyhedraGPU = new b3OpenCLArray<b3ConvexPolyhedronData>(ctx,queue,config.m_maxConvexShapes,false); + m_data->m_gpuChildShapes = new b3OpenCLArray<b3GpuChildShape>(ctx, queue, config.m_maxCompoundChildShapes, false); + + m_data->m_convexPolyhedraGPU = new b3OpenCLArray<b3ConvexPolyhedronData>(ctx, queue, config.m_maxConvexShapes, false); m_data->m_convexPolyhedra.reserve(config.m_maxConvexShapes); - m_data->m_uniqueEdgesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexUniqueEdges,true); + m_data->m_uniqueEdgesGPU = new b3OpenCLArray<b3Vector3>(ctx, queue, config.m_maxConvexUniqueEdges, true); m_data->m_uniqueEdges.reserve(config.m_maxConvexUniqueEdges); - - - m_data->m_convexVerticesGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexVertices,true); + m_data->m_convexVerticesGPU = new b3OpenCLArray<b3Vector3>(ctx, queue, config.m_maxConvexVertices, true); m_data->m_convexVertices.reserve(config.m_maxConvexVertices); - m_data->m_convexIndicesGPU = new b3OpenCLArray<int>(ctx,queue,config.m_maxConvexIndices,true); - m_data->m_convexIndices.reserve(config.m_maxConvexIndices); - - m_data->m_worldVertsB1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace); - m_data->m_clippingFacesOutGPU = new b3OpenCLArray<b3Int4>(ctx,queue,config.m_maxConvexBodies); - m_data->m_worldNormalsAGPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies); - m_data->m_worldVertsA1GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace); - m_data->m_worldVertsB2GPU = new b3OpenCLArray<b3Vector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace); - - + m_data->m_convexIndicesGPU = new b3OpenCLArray<int>(ctx, queue, config.m_maxConvexIndices, 
true); + m_data->m_convexIndices.reserve(config.m_maxConvexIndices); + + m_data->m_worldVertsB1GPU = new b3OpenCLArray<b3Vector3>(ctx, queue, config.m_maxConvexBodies * config.m_maxVerticesPerFace); + m_data->m_clippingFacesOutGPU = new b3OpenCLArray<b3Int4>(ctx, queue, config.m_maxConvexBodies); + m_data->m_worldNormalsAGPU = new b3OpenCLArray<b3Vector3>(ctx, queue, config.m_maxConvexBodies); + m_data->m_worldVertsA1GPU = new b3OpenCLArray<b3Vector3>(ctx, queue, config.m_maxConvexBodies * config.m_maxVerticesPerFace); + m_data->m_worldVertsB2GPU = new b3OpenCLArray<b3Vector3>(ctx, queue, config.m_maxConvexBodies * config.m_maxVerticesPerFace); - m_data->m_convexData = new b3AlignedObjectArray<b3ConvexUtility* >(); + m_data->m_convexData = new b3AlignedObjectArray<b3ConvexUtility*>(); m_data->m_convexData->resize(config.m_maxConvexShapes); m_data->m_convexPolyhedra.resize(config.m_maxConvexShapes); - + m_data->m_numAcceleratedShapes = 0; m_data->m_numAcceleratedRigidBodies = 0; - - - m_data->m_subTreesGPU = new b3OpenCLArray<b3BvhSubtreeInfo>(this->m_context,this->m_queue); - m_data->m_treeNodesGPU = new b3OpenCLArray<b3QuantizedBvhNode>(this->m_context,this->m_queue); - m_data->m_bvhInfoGPU = new b3OpenCLArray<b3BvhInfo>(this->m_context,this->m_queue); + + m_data->m_subTreesGPU = new b3OpenCLArray<b3BvhSubtreeInfo>(this->m_context, this->m_queue); + m_data->m_treeNodesGPU = new b3OpenCLArray<b3QuantizedBvhNode>(this->m_context, this->m_queue); + m_data->m_bvhInfoGPU = new b3OpenCLArray<b3BvhInfo>(this->m_context, this->m_queue); //m_data->m_contactCGPU = new b3OpenCLArray<Constraint4>(ctx,queue,config.m_maxBroadphasePairs,false); //m_data->m_frictionCGPU = new b3OpenCLArray<adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( m_data->m_deviceCL, config.m_maxBroadphasePairs); - - - } - b3GpuNarrowPhase::~b3GpuNarrowPhase() { delete m_data->m_gpuSatCollision; - + delete m_data->m_triangleConvexPairs; //delete m_data->m_convexPairsOutGPU; //delete 
m_data->m_planePairs; @@ -126,7 +105,6 @@ b3GpuNarrowPhase::~b3GpuNarrowPhase() delete m_data->m_pBufContactBuffersGPU[0]; delete m_data->m_pBufContactBuffersGPU[1]; - delete m_data->m_inertiaBufferGPU; delete m_data->m_collidablesGPU; delete m_data->m_localShapeAABBCPU; @@ -139,18 +117,18 @@ b3GpuNarrowPhase::~b3GpuNarrowPhase() delete m_data->m_convexVerticesGPU; delete m_data->m_convexIndicesGPU; delete m_data->m_worldVertsB1GPU; - delete m_data->m_clippingFacesOutGPU; - delete m_data->m_worldNormalsAGPU; + delete m_data->m_clippingFacesOutGPU; + delete m_data->m_worldNormalsAGPU; delete m_data->m_worldVertsA1GPU; - delete m_data->m_worldVertsB2GPU; - + delete m_data->m_worldVertsB2GPU; + delete m_data->m_bvhInfoGPU; - for (int i=0;i<m_data->m_bvhData.size();i++) + for (int i = 0; i < m_data->m_bvhData.size(); i++) { delete m_data->m_bvhData[i]; } - for (int i=0;i<m_data->m_meshInterfaces.size();i++) + for (int i = 0; i < m_data->m_meshInterfaces.size(); i++) { delete m_data->m_meshInterfaces[i]; } @@ -159,198 +137,180 @@ b3GpuNarrowPhase::~b3GpuNarrowPhase() delete m_data->m_treeNodesGPU; delete m_data->m_subTreesGPU; - - delete m_data->m_convexData; + delete m_data->m_convexData; delete m_data; } - -int b3GpuNarrowPhase::allocateCollidable() +int b3GpuNarrowPhase::allocateCollidable() { int curSize = m_data->m_collidablesCPU.size(); - if (curSize<m_data->m_config.m_maxConvexShapes) + if (curSize < m_data->m_config.m_maxConvexShapes) { m_data->m_collidablesCPU.expand(); return curSize; } else { - b3Error("allocateCollidable out-of-range %d\n",m_data->m_config.m_maxConvexShapes); + b3Error("allocateCollidable out-of-range %d\n", m_data->m_config.m_maxConvexShapes); } return -1; - } - - - - -int b3GpuNarrowPhase::registerSphereShape(float radius) +int b3GpuNarrowPhase::registerSphereShape(float radius) { int collidableIndex = allocateCollidable(); - if (collidableIndex<0) + if (collidableIndex < 0) return collidableIndex; - b3Collidable& col = 
getCollidableCpu(collidableIndex); col.m_shapeType = SHAPE_SPHERE; col.m_shapeIndex = 0; col.m_radius = radius; - - if (col.m_shapeIndex>=0) + + if (col.m_shapeIndex >= 0) { b3SapAabb aabb; - b3Vector3 myAabbMin=b3MakeVector3(-radius,-radius,-radius); - b3Vector3 myAabbMax=b3MakeVector3(radius,radius,radius); + b3Vector3 myAabbMin = b3MakeVector3(-radius, -radius, -radius); + b3Vector3 myAabbMax = b3MakeVector3(radius, radius, radius); - aabb.m_min[0] = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x; - aabb.m_min[1] = myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y; - aabb.m_min[2] = myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z; + aabb.m_min[0] = myAabbMin[0]; //s_convexHeightField->m_aabb.m_min.x; + aabb.m_min[1] = myAabbMin[1]; //s_convexHeightField->m_aabb.m_min.y; + aabb.m_min[2] = myAabbMin[2]; //s_convexHeightField->m_aabb.m_min.z; aabb.m_minIndices[3] = 0; - aabb.m_max[0] = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x; - aabb.m_max[1] = myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y; - aabb.m_max[2] = myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z; + aabb.m_max[0] = myAabbMax[0]; //s_convexHeightField->m_aabb.m_max.x; + aabb.m_max[1] = myAabbMax[1]; //s_convexHeightField->m_aabb.m_max.y; + aabb.m_max[2] = myAabbMax[2]; //s_convexHeightField->m_aabb.m_max.z; aabb.m_signedMaxIndices[3] = 0; m_data->m_localShapeAABBCPU->push_back(aabb); -// m_data->m_localShapeAABBGPU->push_back(aabb); + // m_data->m_localShapeAABBGPU->push_back(aabb); clFinish(m_queue); } - + return collidableIndex; } - int b3GpuNarrowPhase::registerFace(const b3Vector3& faceNormal, float faceConstant) { int faceOffset = m_data->m_convexFaces.size(); b3GpuFace& face = m_data->m_convexFaces.expand(); - face.m_plane = b3MakeVector3(faceNormal.x,faceNormal.y,faceNormal.z,faceConstant); + face.m_plane = b3MakeVector3(faceNormal.x, faceNormal.y, faceNormal.z, faceConstant); return faceOffset; } -int b3GpuNarrowPhase::registerPlaneShape(const b3Vector3& planeNormal, float 
planeConstant) +int b3GpuNarrowPhase::registerPlaneShape(const b3Vector3& planeNormal, float planeConstant) { int collidableIndex = allocateCollidable(); - if (collidableIndex<0) + if (collidableIndex < 0) return collidableIndex; - b3Collidable& col = getCollidableCpu(collidableIndex); col.m_shapeType = SHAPE_PLANE; - col.m_shapeIndex = registerFace(planeNormal,planeConstant); + col.m_shapeIndex = registerFace(planeNormal, planeConstant); col.m_radius = planeConstant; - - if (col.m_shapeIndex>=0) + + if (col.m_shapeIndex >= 0) { b3SapAabb aabb; aabb.m_min[0] = -1e30f; aabb.m_min[1] = -1e30f; aabb.m_min[2] = -1e30f; aabb.m_minIndices[3] = 0; - + aabb.m_max[0] = 1e30f; aabb.m_max[1] = 1e30f; aabb.m_max[2] = 1e30f; aabb.m_signedMaxIndices[3] = 0; m_data->m_localShapeAABBCPU->push_back(aabb); -// m_data->m_localShapeAABBGPU->push_back(aabb); + // m_data->m_localShapeAABBGPU->push_back(aabb); clFinish(m_queue); } - + return collidableIndex; } - -int b3GpuNarrowPhase::registerConvexHullShapeInternal(b3ConvexUtility* convexPtr,b3Collidable& col) +int b3GpuNarrowPhase::registerConvexHullShapeInternal(b3ConvexUtility* convexPtr, b3Collidable& col) { + m_data->m_convexData->resize(m_data->m_numAcceleratedShapes + 1); + m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes + 1); - m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1); - m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1); - - - b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1); + b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size() - 1); convex.mC = convexPtr->mC; convex.mE = convexPtr->mE; - convex.m_extents= convexPtr->m_extents; + convex.m_extents = convexPtr->m_extents; convex.m_localCenter = convexPtr->m_localCenter; convex.m_radius = convexPtr->m_radius; - + convex.m_numUniqueEdges = convexPtr->m_uniqueEdges.size(); int edgeOffset = m_data->m_uniqueEdges.size(); 
convex.m_uniqueEdgesOffset = edgeOffset; - - m_data->m_uniqueEdges.resize(edgeOffset+convex.m_numUniqueEdges); - + + m_data->m_uniqueEdges.resize(edgeOffset + convex.m_numUniqueEdges); + //convex data here int i; - for ( i=0;i<convexPtr->m_uniqueEdges.size();i++) + for (i = 0; i < convexPtr->m_uniqueEdges.size(); i++) { - m_data->m_uniqueEdges[edgeOffset+i] = convexPtr->m_uniqueEdges[i]; + m_data->m_uniqueEdges[edgeOffset + i] = convexPtr->m_uniqueEdges[i]; } - + int faceOffset = m_data->m_convexFaces.size(); convex.m_faceOffset = faceOffset; convex.m_numFaces = convexPtr->m_faces.size(); - m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces); - + m_data->m_convexFaces.resize(faceOffset + convex.m_numFaces); - for (i=0;i<convexPtr->m_faces.size();i++) + for (i = 0; i < convexPtr->m_faces.size(); i++) { - m_data->m_convexFaces[convex.m_faceOffset+i].m_plane = b3MakeVector3(convexPtr->m_faces[i].m_plane[0], - convexPtr->m_faces[i].m_plane[1], - convexPtr->m_faces[i].m_plane[2], - convexPtr->m_faces[i].m_plane[3]); + m_data->m_convexFaces[convex.m_faceOffset + i].m_plane = b3MakeVector3(convexPtr->m_faces[i].m_plane[0], + convexPtr->m_faces[i].m_plane[1], + convexPtr->m_faces[i].m_plane[2], + convexPtr->m_faces[i].m_plane[3]); - int indexOffset = m_data->m_convexIndices.size(); int numIndices = convexPtr->m_faces[i].m_indices.size(); - m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices; - m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset; - m_data->m_convexIndices.resize(indexOffset+numIndices); - for (int p=0;p<numIndices;p++) + m_data->m_convexFaces[convex.m_faceOffset + i].m_numIndices = numIndices; + m_data->m_convexFaces[convex.m_faceOffset + i].m_indexOffset = indexOffset; + m_data->m_convexIndices.resize(indexOffset + numIndices); + for (int p = 0; p < numIndices; p++) { - m_data->m_convexIndices[indexOffset+p] = convexPtr->m_faces[i].m_indices[p]; + m_data->m_convexIndices[indexOffset + p] = 
convexPtr->m_faces[i].m_indices[p]; } } - + convex.m_numVertices = convexPtr->m_vertices.size(); int vertexOffset = m_data->m_convexVertices.size(); - convex.m_vertexOffset =vertexOffset; - - m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices); - for (int i=0;i<convexPtr->m_vertices.size();i++) + convex.m_vertexOffset = vertexOffset; + + m_data->m_convexVertices.resize(vertexOffset + convex.m_numVertices); + for (int i = 0; i < convexPtr->m_vertices.size(); i++) { - m_data->m_convexVertices[vertexOffset+i] = convexPtr->m_vertices[i]; + m_data->m_convexVertices[vertexOffset + i] = convexPtr->m_vertices[i]; } (*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = convexPtr; - - - + return m_data->m_numAcceleratedShapes++; } - -int b3GpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling) +int b3GpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling) { b3AlignedObjectArray<b3Vector3> verts; - unsigned char* vts = (unsigned char*) vertices; - for (int i=0;i<numVertices;i++) + unsigned char* vts = (unsigned char*)vertices; + for (int i = 0; i < numVertices; i++) { - float* vertex = (float*) &vts[i*strideInBytes]; - verts.push_back(b3MakeVector3(vertex[0]*scaling[0],vertex[1]*scaling[1],vertex[2]*scaling[2])); + float* vertex = (float*)&vts[i * strideInBytes]; + verts.push_back(b3MakeVector3(vertex[0] * scaling[0], vertex[1] * scaling[1], vertex[2] * scaling[2])); } b3ConvexUtility* utilPtr = new b3ConvexUtility(); bool merge = true; if (numVertices) { - utilPtr->initializePolyhedralFeatures(&verts[0],verts.size(),merge); + utilPtr->initializePolyhedralFeatures(&verts[0], verts.size(), merge); } int collidableIndex = registerConvexHullShape(utilPtr); @@ -358,35 +318,34 @@ int b3GpuNarrowPhase::registerConvexHullShape(const float* vertices, int stride return collidableIndex; } -int 
b3GpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr) +int b3GpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr) { int collidableIndex = allocateCollidable(); - if (collidableIndex<0) + if (collidableIndex < 0) return collidableIndex; b3Collidable& col = getCollidableCpu(collidableIndex); col.m_shapeType = SHAPE_CONVEX_HULL; col.m_shapeIndex = -1; - - + { - b3Vector3 localCenter=b3MakeVector3(0,0,0); - for (int i=0;i<utilPtr->m_vertices.size();i++) - localCenter+=utilPtr->m_vertices[i]; - localCenter*= (1.f/utilPtr->m_vertices.size()); + b3Vector3 localCenter = b3MakeVector3(0, 0, 0); + for (int i = 0; i < utilPtr->m_vertices.size(); i++) + localCenter += utilPtr->m_vertices[i]; + localCenter *= (1.f / utilPtr->m_vertices.size()); utilPtr->m_localCenter = localCenter; - col.m_shapeIndex = registerConvexHullShapeInternal(utilPtr,col); + col.m_shapeIndex = registerConvexHullShapeInternal(utilPtr, col); } - if (col.m_shapeIndex>=0) + if (col.m_shapeIndex >= 0) { b3SapAabb aabb; - - b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); - b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f); - for (int i=0;i<utilPtr->m_vertices.size();i++) + b3Vector3 myAabbMin = b3MakeVector3(1e30f, 1e30f, 1e30f); + b3Vector3 myAabbMax = b3MakeVector3(-1e30f, -1e30f, -1e30f); + + for (int i = 0; i < utilPtr->m_vertices.size(); i++) { myAabbMin.setMin(utilPtr->m_vertices[i]); myAabbMax.setMax(utilPtr->m_vertices[i]); @@ -402,18 +361,16 @@ int b3GpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr) aabb.m_signedMaxIndices[3] = 0; m_data->m_localShapeAABBCPU->push_back(aabb); -// m_data->m_localShapeAABBGPU->push_back(aabb); + // m_data->m_localShapeAABBGPU->push_back(aabb); } - - return collidableIndex; + return collidableIndex; } -int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes) +int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes) { - int collidableIndex 
= allocateCollidable(); - if (collidableIndex<0) + if (collidableIndex < 0) return collidableIndex; b3Collidable& col = getCollidableCpu(collidableIndex); @@ -422,44 +379,41 @@ int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShap col.m_compoundBvhIndex = m_data->m_bvhInfoCPU.size(); { - b3Assert(col.m_shapeIndex+childShapes->size()<m_data->m_config.m_maxCompoundChildShapes); - for (int i=0;i<childShapes->size();i++) + b3Assert(col.m_shapeIndex + childShapes->size() < m_data->m_config.m_maxCompoundChildShapes); + for (int i = 0; i < childShapes->size(); i++) { m_data->m_cpuChildShapes.push_back(childShapes->at(i)); } } - - col.m_numChildShapes = childShapes->size(); - - + b3SapAabb aabbLocalSpace; - b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); - b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f); - + b3Vector3 myAabbMin = b3MakeVector3(1e30f, 1e30f, 1e30f); + b3Vector3 myAabbMax = b3MakeVector3(-1e30f, -1e30f, -1e30f); + b3AlignedObjectArray<b3Aabb> childLocalAabbs; childLocalAabbs.resize(childShapes->size()); //compute local AABB of the compound of all children - for (int i=0;i<childShapes->size();i++) + for (int i = 0; i < childShapes->size(); i++) { int childColIndex = childShapes->at(i).m_shapeIndex; //b3Collidable& childCol = getCollidableCpu(childColIndex); - b3SapAabb aabbLoc =m_data->m_localShapeAABBCPU->at(childColIndex); + b3SapAabb aabbLoc = m_data->m_localShapeAABBCPU->at(childColIndex); - b3Vector3 childLocalAabbMin=b3MakeVector3(aabbLoc.m_min[0],aabbLoc.m_min[1],aabbLoc.m_min[2]); - b3Vector3 childLocalAabbMax=b3MakeVector3(aabbLoc.m_max[0],aabbLoc.m_max[1],aabbLoc.m_max[2]); - b3Vector3 aMin,aMax; + b3Vector3 childLocalAabbMin = b3MakeVector3(aabbLoc.m_min[0], aabbLoc.m_min[1], aabbLoc.m_min[2]); + b3Vector3 childLocalAabbMax = b3MakeVector3(aabbLoc.m_max[0], aabbLoc.m_max[1], aabbLoc.m_max[2]); + b3Vector3 aMin, aMax; b3Scalar margin(0.f); b3Transform childTr; childTr.setIdentity(); 
childTr.setOrigin(childShapes->at(i).m_childPosition); childTr.setRotation(b3Quaternion(childShapes->at(i).m_childOrientation)); - b3TransformAabb(childLocalAabbMin,childLocalAabbMax,margin,childTr,aMin,aMax); + b3TransformAabb(childLocalAabbMin, childLocalAabbMax, margin, childTr, aMin, aMax); myAabbMin.setMin(aMin); - myAabbMax.setMax(aMax); + myAabbMax.setMax(aMax); childLocalAabbs[i].m_min[0] = aMin[0]; childLocalAabbs[i].m_min[1] = aMin[1]; childLocalAabbs[i].m_min[2] = aMin[2]; @@ -469,36 +423,35 @@ int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShap childLocalAabbs[i].m_max[2] = aMax[2]; childLocalAabbs[i].m_max[3] = 0; } - - aabbLocalSpace.m_min[0] = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x; - aabbLocalSpace.m_min[1]= myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y; - aabbLocalSpace.m_min[2]= myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z; + + aabbLocalSpace.m_min[0] = myAabbMin[0]; //s_convexHeightField->m_aabb.m_min.x; + aabbLocalSpace.m_min[1] = myAabbMin[1]; //s_convexHeightField->m_aabb.m_min.y; + aabbLocalSpace.m_min[2] = myAabbMin[2]; //s_convexHeightField->m_aabb.m_min.z; aabbLocalSpace.m_minIndices[3] = 0; - - aabbLocalSpace.m_max[0] = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x; - aabbLocalSpace.m_max[1]= myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y; - aabbLocalSpace.m_max[2]= myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z; + + aabbLocalSpace.m_max[0] = myAabbMax[0]; //s_convexHeightField->m_aabb.m_max.x; + aabbLocalSpace.m_max[1] = myAabbMax[1]; //s_convexHeightField->m_aabb.m_max.y; + aabbLocalSpace.m_max[2] = myAabbMax[2]; //s_convexHeightField->m_aabb.m_max.z; aabbLocalSpace.m_signedMaxIndices[3] = 0; - - m_data->m_localShapeAABBCPU->push_back(aabbLocalSpace); + m_data->m_localShapeAABBCPU->push_back(aabbLocalSpace); b3QuantizedBvh* bvh = new b3QuantizedBvh; - bvh->setQuantizationValues(myAabbMin,myAabbMax); - QuantizedNodeArray& nodes = bvh->getLeafNodeArray(); + 
bvh->setQuantizationValues(myAabbMin, myAabbMax); + QuantizedNodeArray& nodes = bvh->getLeafNodeArray(); int numNodes = childShapes->size(); - for (int i=0;i<numNodes;i++) + for (int i = 0; i < numNodes; i++) { b3QuantizedBvhNode node; - b3Vector3 aabbMin,aabbMax; - aabbMin = (b3Vector3&) childLocalAabbs[i].m_min; - aabbMax = (b3Vector3&) childLocalAabbs[i].m_max; + b3Vector3 aabbMin, aabbMax; + aabbMin = (b3Vector3&)childLocalAabbs[i].m_min; + aabbMax = (b3Vector3&)childLocalAabbs[i].m_max; - bvh->quantize(&node.m_quantizedAabbMin[0],aabbMin,0); - bvh->quantize(&node.m_quantizedAabbMax[0],aabbMax,1); + bvh->quantize(&node.m_quantizedAabbMin[0], aabbMin, 0); + bvh->quantize(&node.m_quantizedAabbMax[0], aabbMax, 1); int partId = 0; - node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | i; + node.m_escapeIndexOrTriangleIndex = (partId << (31 - MAX_NUM_PARTS_IN_BITS)) | i; nodes.push_back(node); } bvh->buildInternal(); @@ -511,7 +464,7 @@ int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShap //void buildInternal(); b3BvhInfo bvhInfo; - + bvhInfo.m_aabbMin = bvh->m_bvhAabbMin; bvhInfo.m_aabbMax = bvh->m_bvhAabbMax; bvhInfo.m_quantization = bvh->m_bvhQuantization; @@ -520,80 +473,72 @@ int b3GpuNarrowPhase::registerCompoundShape(b3AlignedObjectArray<b3GpuChildShap bvhInfo.m_nodeOffset = m_data->m_treeNodesCPU.size(); bvhInfo.m_subTreeOffset = m_data->m_subTreesCPU.size(); - int numNewNodes = bvh->getQuantizedNodeArray().size(); + int numNewNodes = bvh->getQuantizedNodeArray().size(); - for (int i=0;i<numNewNodes-1;i++) + for (int i = 0; i < numNewNodes - 1; i++) { - if (bvh->getQuantizedNodeArray()[i].isLeafNode()) { int orgIndex = bvh->getQuantizedNodeArray()[i].getTriangleIndex(); b3Vector3 nodeMinVec = bvh->unQuantize(bvh->getQuantizedNodeArray()[i].m_quantizedAabbMin); b3Vector3 nodeMaxVec = bvh->unQuantize(bvh->getQuantizedNodeArray()[i].m_quantizedAabbMax); - - for (int c=0;c<3;c++) + + for (int c = 0; c < 3; c++) { 
if (childLocalAabbs[orgIndex].m_min[c] < nodeMinVec[c]) { - printf("min org (%f) and new (%f) ? at i:%d,c:%d\n",childLocalAabbs[i].m_min[c],nodeMinVec[c],i,c); + printf("min org (%f) and new (%f) ? at i:%d,c:%d\n", childLocalAabbs[i].m_min[c], nodeMinVec[c], i, c); } if (childLocalAabbs[orgIndex].m_max[c] > nodeMaxVec[c]) { - printf("max org (%f) and new (%f) ? at i:%d,c:%d\n",childLocalAabbs[i].m_max[c],nodeMaxVec[c],i,c); + printf("max org (%f) and new (%f) ? at i:%d,c:%d\n", childLocalAabbs[i].m_max[c], nodeMaxVec[c], i, c); } - } } - } m_data->m_bvhInfoCPU.push_back(bvhInfo); int numNewSubtrees = bvh->getSubtreeInfoArray().size(); - m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size()+numNewSubtrees); - for (int i=0;i<numNewSubtrees;i++) + m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size() + numNewSubtrees); + for (int i = 0; i < numNewSubtrees; i++) { m_data->m_subTreesCPU.push_back(bvh->getSubtreeInfoArray()[i]); } int numNewTreeNodes = bvh->getQuantizedNodeArray().size(); - for (int i=0;i<numNewTreeNodes;i++) + for (int i = 0; i < numNewTreeNodes; i++) { m_data->m_treeNodesCPU.push_back(bvh->getQuantizedNodeArray()[i]); } -// m_data->m_localShapeAABBGPU->push_back(aabbWS); + // m_data->m_localShapeAABBGPU->push_back(aabbWS); clFinish(m_queue); return collidableIndex; - } - -int b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling1) +int b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling1) { - - - b3Vector3 scaling=b3MakeVector3(scaling1[0],scaling1[1],scaling1[2]); + b3Vector3 scaling = b3MakeVector3(scaling1[0], scaling1[1], scaling1[2]); int collidableIndex = allocateCollidable(); - if (collidableIndex<0) + if (collidableIndex < 0) return collidableIndex; b3Collidable& col = getCollidableCpu(collidableIndex); - + col.m_shapeType = SHAPE_CONCAVE_TRIMESH; - col.m_shapeIndex = 
registerConcaveMeshShape(vertices,indices,col,scaling); + col.m_shapeIndex = registerConcaveMeshShape(vertices, indices, col, scaling); col.m_bvhIndex = m_data->m_bvhInfoCPU.size(); - b3SapAabb aabb; - b3Vector3 myAabbMin=b3MakeVector3(1e30f,1e30f,1e30f); - b3Vector3 myAabbMax=b3MakeVector3(-1e30f,-1e30f,-1e30f); + b3Vector3 myAabbMin = b3MakeVector3(1e30f, 1e30f, 1e30f); + b3Vector3 myAabbMax = b3MakeVector3(-1e30f, -1e30f, -1e30f); - for (int i=0;i<vertices->size();i++) + for (int i = 0; i < vertices->size(); i++) { - b3Vector3 vtx(vertices->at(i)*scaling); + b3Vector3 vtx(vertices->at(i) * scaling); myAabbMin.setMin(vtx); myAabbMax.setMax(vtx); } @@ -603,27 +548,27 @@ int b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vert aabb.m_minIndices[3] = 0; aabb.m_max[0] = myAabbMax[0]; - aabb.m_max[1]= myAabbMax[1]; - aabb.m_max[2]= myAabbMax[2]; - aabb.m_signedMaxIndices[3]= 0; + aabb.m_max[1] = myAabbMax[1]; + aabb.m_max[2] = myAabbMax[2]; + aabb.m_signedMaxIndices[3] = 0; m_data->m_localShapeAABBCPU->push_back(aabb); -// m_data->m_localShapeAABBGPU->push_back(aabb); + // m_data->m_localShapeAABBGPU->push_back(aabb); b3OptimizedBvh* bvh = new b3OptimizedBvh(); //void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax) - + bool useQuantizedAabbCompression = true; - b3TriangleIndexVertexArray* meshInterface=new b3TriangleIndexVertexArray(); + b3TriangleIndexVertexArray* meshInterface = new b3TriangleIndexVertexArray(); m_data->m_meshInterfaces.push_back(meshInterface); b3IndexedMesh mesh; - mesh.m_numTriangles = indices->size()/3; + mesh.m_numTriangles = indices->size() / 3; mesh.m_numVertices = vertices->size(); - mesh.m_vertexBase = (const unsigned char *)&vertices->at(0).x; + mesh.m_vertexBase = (const unsigned char*)&vertices->at(0).x; mesh.m_vertexStride = sizeof(b3Vector3); - mesh.m_triangleIndexStride = 3 * sizeof(int);// or sizeof(int) - 
mesh.m_triangleIndexBase = (const unsigned char *)&indices->at(0); - + mesh.m_triangleIndexStride = 3 * sizeof(int); // or sizeof(int) + mesh.m_triangleIndexBase = (const unsigned char*)&indices->at(0); + meshInterface->addIndexedMesh(mesh); bvh->build(meshInterface, useQuantizedAabbCompression, (b3Vector3&)aabb.m_min, (b3Vector3&)aabb.m_max); m_data->m_bvhData.push_back(bvh); @@ -632,7 +577,7 @@ int b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vert int numSubTrees = bvh->getSubtreeInfoArray().size(); b3BvhInfo bvhInfo; - + bvhInfo.m_aabbMin = bvh->m_bvhAabbMin; bvhInfo.m_aabbMax = bvh->m_bvhAabbMax; bvhInfo.m_quantization = bvh->m_bvhQuantization; @@ -643,97 +588,87 @@ int b3GpuNarrowPhase::registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vert m_data->m_bvhInfoCPU.push_back(bvhInfo); - int numNewSubtrees = bvh->getSubtreeInfoArray().size(); - m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size()+numNewSubtrees); - for (int i=0;i<numNewSubtrees;i++) + m_data->m_subTreesCPU.reserve(m_data->m_subTreesCPU.size() + numNewSubtrees); + for (int i = 0; i < numNewSubtrees; i++) { m_data->m_subTreesCPU.push_back(bvh->getSubtreeInfoArray()[i]); } int numNewTreeNodes = bvh->getQuantizedNodeArray().size(); - for (int i=0;i<numNewTreeNodes;i++) + for (int i = 0; i < numNewTreeNodes; i++) { m_data->m_treeNodesCPU.push_back(bvh->getQuantizedNodeArray()[i]); } - - - return collidableIndex; } -int b3GpuNarrowPhase::registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,b3Collidable& col, const float* scaling1) +int b3GpuNarrowPhase::registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling1) { + b3Vector3 scaling = b3MakeVector3(scaling1[0], scaling1[1], scaling1[2]); + m_data->m_convexData->resize(m_data->m_numAcceleratedShapes + 1); + m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes + 1); - 
b3Vector3 scaling=b3MakeVector3(scaling1[0],scaling1[1],scaling1[2]); - - m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1); - m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1); - - - b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1); - convex.mC = b3MakeVector3(0,0,0); - convex.mE = b3MakeVector3(0,0,0); - convex.m_extents= b3MakeVector3(0,0,0); - convex.m_localCenter = b3MakeVector3(0,0,0); + b3ConvexPolyhedronData& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size() - 1); + convex.mC = b3MakeVector3(0, 0, 0); + convex.mE = b3MakeVector3(0, 0, 0); + convex.m_extents = b3MakeVector3(0, 0, 0); + convex.m_localCenter = b3MakeVector3(0, 0, 0); convex.m_radius = 0.f; - + convex.m_numUniqueEdges = 0; int edgeOffset = m_data->m_uniqueEdges.size(); convex.m_uniqueEdgesOffset = edgeOffset; - + int faceOffset = m_data->m_convexFaces.size(); convex.m_faceOffset = faceOffset; - - convex.m_numFaces = indices->size()/3; - m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces); - m_data->m_convexIndices.reserve(convex.m_numFaces*3); - for (int i=0;i<convex.m_numFaces;i++) + + convex.m_numFaces = indices->size() / 3; + m_data->m_convexFaces.resize(faceOffset + convex.m_numFaces); + m_data->m_convexIndices.reserve(convex.m_numFaces * 3); + for (int i = 0; i < convex.m_numFaces; i++) { - if (i%256==0) + if (i % 256 == 0) { //printf("i=%d out of %d", i,convex.m_numFaces); } - b3Vector3 vert0(vertices->at(indices->at(i*3))*scaling); - b3Vector3 vert1(vertices->at(indices->at(i*3+1))*scaling); - b3Vector3 vert2(vertices->at(indices->at(i*3+2))*scaling); + b3Vector3 vert0(vertices->at(indices->at(i * 3)) * scaling); + b3Vector3 vert1(vertices->at(indices->at(i * 3 + 1)) * scaling); + b3Vector3 vert2(vertices->at(indices->at(i * 3 + 2)) * scaling); - b3Vector3 normal = ((vert1-vert0).cross(vert2-vert0)).normalize(); + b3Vector3 normal = ((vert1 - vert0).cross(vert2 - 
vert0)).normalize(); b3Scalar c = -(normal.dot(vert0)); - m_data->m_convexFaces[convex.m_faceOffset+i].m_plane = b3MakeVector4(normal.x,normal.y,normal.z,c); + m_data->m_convexFaces[convex.m_faceOffset + i].m_plane = b3MakeVector4(normal.x, normal.y, normal.z, c); int indexOffset = m_data->m_convexIndices.size(); int numIndices = 3; - m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices; - m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset; - m_data->m_convexIndices.resize(indexOffset+numIndices); - for (int p=0;p<numIndices;p++) + m_data->m_convexFaces[convex.m_faceOffset + i].m_numIndices = numIndices; + m_data->m_convexFaces[convex.m_faceOffset + i].m_indexOffset = indexOffset; + m_data->m_convexIndices.resize(indexOffset + numIndices); + for (int p = 0; p < numIndices; p++) { - int vi = indices->at(i*3+p); - m_data->m_convexIndices[indexOffset+p] = vi;//convexPtr->m_faces[i].m_indices[p]; + int vi = indices->at(i * 3 + p); + m_data->m_convexIndices[indexOffset + p] = vi; //convexPtr->m_faces[i].m_indices[p]; } } - + convex.m_numVertices = vertices->size(); int vertexOffset = m_data->m_convexVertices.size(); - convex.m_vertexOffset =vertexOffset; - m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices); - for (int i=0;i<vertices->size();i++) + convex.m_vertexOffset = vertexOffset; + m_data->m_convexVertices.resize(vertexOffset + convex.m_numVertices); + for (int i = 0; i < vertices->size(); i++) { - m_data->m_convexVertices[vertexOffset+i] = vertices->at(i)*scaling; + m_data->m_convexVertices[vertexOffset + i] = vertices->at(i) * scaling; } (*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = 0; - - + return m_data->m_numAcceleratedShapes++; } - - -cl_mem b3GpuNarrowPhase::getBodiesGpu() +cl_mem b3GpuNarrowPhase::getBodiesGpu() { return (cl_mem)m_data->m_bodyBufferGPU->getBufferCL(); } @@ -743,25 +678,21 @@ const struct b3RigidBodyData* b3GpuNarrowPhase::getBodiesCpu() const return 
&m_data->m_bodyBufferCPU->at(0); }; - - - -int b3GpuNarrowPhase::getNumBodiesGpu() const +int b3GpuNarrowPhase::getNumBodiesGpu() const { return m_data->m_bodyBufferGPU->size(); } -cl_mem b3GpuNarrowPhase::getBodyInertiasGpu() +cl_mem b3GpuNarrowPhase::getBodyInertiasGpu() { return (cl_mem)m_data->m_inertiaBufferGPU->getBufferCL(); } -int b3GpuNarrowPhase::getNumBodyInertiasGpu() const +int b3GpuNarrowPhase::getNumBodyInertiasGpu() const { return m_data->m_inertiaBufferGPU->size(); } - b3Collidable& b3GpuNarrowPhase::getCollidableCpu(int collidableIndex) { return m_data->m_collidablesCPU[collidableIndex]; @@ -789,25 +720,20 @@ const struct b3SapAabb* b3GpuNarrowPhase::getLocalSpaceAabbsCpu() const if (m_data->m_localShapeAABBCPU->size()) { return &m_data->m_localShapeAABBCPU->at(0); - } + } return 0; } - -cl_mem b3GpuNarrowPhase::getAabbLocalSpaceBufferGpu() +cl_mem b3GpuNarrowPhase::getAabbLocalSpaceBufferGpu() { return m_data->m_localShapeAABBGPU->getBufferCL(); } -int b3GpuNarrowPhase::getNumCollidablesGpu() const +int b3GpuNarrowPhase::getNumCollidablesGpu() const { return m_data->m_collidablesGPU->size(); } - - - - -int b3GpuNarrowPhase::getNumContactsGpu() const +int b3GpuNarrowPhase::getNumContactsGpu() const { return m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->size(); } @@ -824,37 +750,33 @@ const b3Contact4* b3GpuNarrowPhase::getContactsCPU() const void b3GpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects) { - cl_mem aabbsLocalSpace = m_data->m_localShapeAABBGPU->getBufferCL(); int nContactOut = 0; //swap buffer - m_data->m_currentContactBuffer=1-m_data->m_currentContactBuffer; + m_data->m_currentContactBuffer = 1 - m_data->m_currentContactBuffer; //int curSize = m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer]->size(); int maxTriConvexPairCapacity = m_data->m_config.m_maxTriConvexPairCapacity; - int numTriConvexPairsOut=0; - - b3OpenCLArray<b3Int4> 
broadphasePairsGPU(m_context,m_queue); - broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs); - - + int numTriConvexPairsOut = 0; + b3OpenCLArray<b3Int4> broadphasePairsGPU(m_context, m_queue); + broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs, numBroadphasePairs); - b3OpenCLArray<b3Aabb> clAabbArrayWorldSpace(this->m_context,this->m_queue); - clAabbArrayWorldSpace.setFromOpenCLBuffer(aabbsWorldSpace,numObjects); + b3OpenCLArray<b3Aabb> clAabbArrayWorldSpace(this->m_context, this->m_queue); + clAabbArrayWorldSpace.setFromOpenCLBuffer(aabbsWorldSpace, numObjects); - b3OpenCLArray<b3Aabb> clAabbArrayLocalSpace(this->m_context,this->m_queue); - clAabbArrayLocalSpace.setFromOpenCLBuffer(aabbsLocalSpace,numObjects); + b3OpenCLArray<b3Aabb> clAabbArrayLocalSpace(this->m_context, this->m_queue); + clAabbArrayLocalSpace.setFromOpenCLBuffer(aabbsLocalSpace, numObjects); m_data->m_gpuSatCollision->computeConvexConvexContactsGPUSAT( &broadphasePairsGPU, numBroadphasePairs, m_data->m_bodyBufferGPU, m_data->m_pBufContactBuffersGPU[m_data->m_currentContactBuffer], nContactOut, - m_data->m_pBufContactBuffersGPU[1-m_data->m_currentContactBuffer], + m_data->m_pBufContactBuffersGPU[1 - m_data->m_currentContactBuffer], m_data->m_config.m_maxContactCapacity, m_data->m_config.m_compoundPairCapacity, *m_data->m_convexPolyhedraGPU, @@ -878,8 +800,7 @@ void b3GpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphase numObjects, maxTriConvexPairCapacity, *m_data->m_triangleConvexPairs, - numTriConvexPairsOut - ); + numTriConvexPairsOut); /*b3AlignedObjectArray<b3Int4> broadphasePairsCPU; broadphasePairsGPU.copyToHost(broadphasePairsCPU); @@ -892,105 +813,97 @@ const b3SapAabb& b3GpuNarrowPhase::getLocalSpaceAabb(int collidableIndex) const return m_data->m_localShapeAABBCPU->at(collidableIndex); } - - - - -int b3GpuNarrowPhase::registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation , const float* 
aabbMinPtr, const float* aabbMaxPtr,bool writeToGpu) +int b3GpuNarrowPhase::registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMinPtr, const float* aabbMaxPtr, bool writeToGpu) { - b3Vector3 aabbMin=b3MakeVector3(aabbMinPtr[0],aabbMinPtr[1],aabbMinPtr[2]); - b3Vector3 aabbMax=b3MakeVector3(aabbMaxPtr[0],aabbMaxPtr[1],aabbMaxPtr[2]); - + b3Vector3 aabbMin = b3MakeVector3(aabbMinPtr[0], aabbMinPtr[1], aabbMinPtr[2]); + b3Vector3 aabbMax = b3MakeVector3(aabbMaxPtr[0], aabbMaxPtr[1], aabbMaxPtr[2]); if (m_data->m_numAcceleratedRigidBodies >= (m_data->m_config.m_maxConvexBodies)) { - b3Error("registerRigidBody: exceeding the number of rigid bodies, %d > %d \n",m_data->m_numAcceleratedRigidBodies,m_data->m_config.m_maxConvexBodies); + b3Error("registerRigidBody: exceeding the number of rigid bodies, %d > %d \n", m_data->m_numAcceleratedRigidBodies, m_data->m_config.m_maxConvexBodies); return -1; } - - m_data->m_bodyBufferCPU->resize(m_data->m_numAcceleratedRigidBodies+1); - + + m_data->m_bodyBufferCPU->resize(m_data->m_numAcceleratedRigidBodies + 1); + b3RigidBodyData& body = m_data->m_bodyBufferCPU->at(m_data->m_numAcceleratedRigidBodies); - + float friction = 1.f; float restitution = 0.f; - + body.m_frictionCoeff = friction; body.m_restituitionCoeff = restitution; - body.m_angVel = b3MakeVector3(0,0,0); - body.m_linVel=b3MakeVector3(0,0,0);//.setZero(); - body.m_pos =b3MakeVector3(position[0],position[1],position[2]); - body.m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]); + body.m_angVel = b3MakeVector3(0, 0, 0); + body.m_linVel = b3MakeVector3(0, 0, 0); //.setZero(); + body.m_pos = b3MakeVector3(position[0], position[1], position[2]); + body.m_quat.setValue(orientation[0], orientation[1], orientation[2], orientation[3]); body.m_collidableIdx = collidableIndex; - if (collidableIndex>=0) + if (collidableIndex >= 0) { -// body.m_shapeType = 
m_data->m_collidablesCPU.at(collidableIndex).m_shapeType; - } else + // body.m_shapeType = m_data->m_collidablesCPU.at(collidableIndex).m_shapeType; + } + else { - // body.m_shapeType = CollisionShape::SHAPE_PLANE; + // body.m_shapeType = CollisionShape::SHAPE_PLANE; m_planeBodyIndex = m_data->m_numAcceleratedRigidBodies; } //body.m_shapeType = shapeType; - - - body.m_invMass = mass? 1.f/mass : 0.f; - + + body.m_invMass = mass ? 1.f / mass : 0.f; + if (writeToGpu) { - m_data->m_bodyBufferGPU->copyFromHostPointer(&body,1,m_data->m_numAcceleratedRigidBodies); + m_data->m_bodyBufferGPU->copyFromHostPointer(&body, 1, m_data->m_numAcceleratedRigidBodies); } - + b3InertiaData& shapeInfo = m_data->m_inertiaBufferCPU->at(m_data->m_numAcceleratedRigidBodies); - - if (mass==0.f) + + if (mass == 0.f) { - if (m_data->m_numAcceleratedRigidBodies==0) + if (m_data->m_numAcceleratedRigidBodies == 0) m_static0Index = 0; - - shapeInfo.m_initInvInertia.setValue(0,0,0,0,0,0,0,0,0); - shapeInfo.m_invInertiaWorld.setValue(0,0,0,0,0,0,0,0,0); - } else + + shapeInfo.m_initInvInertia.setValue(0, 0, 0, 0, 0, 0, 0, 0, 0); + shapeInfo.m_invInertiaWorld.setValue(0, 0, 0, 0, 0, 0, 0, 0, 0); + } + else { - - b3Assert(body.m_collidableIdx>=0); - + b3Assert(body.m_collidableIdx >= 0); + //approximate using the aabb of the shape - + //Aabb aabb = (*m_data->m_shapePointers)[shapeIndex]->m_aabb; - b3Vector3 halfExtents = (aabbMax-aabbMin);//*0.5f;//fake larger inertia makes demos more stable ;-) - + b3Vector3 halfExtents = (aabbMax - aabbMin); //*0.5f;//fake larger inertia makes demos more stable ;-) + b3Vector3 localInertia; - - float lx=2.f*halfExtents[0]; - float ly=2.f*halfExtents[1]; - float lz=2.f*halfExtents[2]; - - localInertia.setValue( (mass/12.0f) * (ly*ly + lz*lz), - (mass/12.0f) * (lx*lx + lz*lz), - (mass/12.0f) * (lx*lx + ly*ly)); - + + float lx = 2.f * halfExtents[0]; + float ly = 2.f * halfExtents[1]; + float lz = 2.f * halfExtents[2]; + + localInertia.setValue((mass / 12.0f) * (ly * 
ly + lz * lz), + (mass / 12.0f) * (lx * lx + lz * lz), + (mass / 12.0f) * (lx * lx + ly * ly)); + b3Vector3 invLocalInertia; - invLocalInertia[0] = 1.f/localInertia[0]; - invLocalInertia[1] = 1.f/localInertia[1]; - invLocalInertia[2] = 1.f/localInertia[2]; + invLocalInertia[0] = 1.f / localInertia[0]; + invLocalInertia[1] = 1.f / localInertia[1]; + invLocalInertia[2] = 1.f / localInertia[2]; invLocalInertia[3] = 0.f; - + shapeInfo.m_initInvInertia.setValue( - invLocalInertia[0], 0, 0, - 0, invLocalInertia[1], 0, - 0, 0, invLocalInertia[2]); + invLocalInertia[0], 0, 0, + 0, invLocalInertia[1], 0, + 0, 0, invLocalInertia[2]); - b3Matrix3x3 m (body.m_quat); + b3Matrix3x3 m(body.m_quat); shapeInfo.m_invInertiaWorld = m.scaled(invLocalInertia) * m.transpose(); - } - + if (writeToGpu) - m_data->m_inertiaBufferGPU->copyFromHostPointer(&shapeInfo,1,m_data->m_numAcceleratedRigidBodies); - - - + m_data->m_inertiaBufferGPU->copyFromHostPointer(&shapeInfo, 1, m_data->m_numAcceleratedRigidBodies); + return m_data->m_numAcceleratedRigidBodies++; } @@ -999,15 +912,13 @@ int b3GpuNarrowPhase::getNumRigidBodies() const return m_data->m_numAcceleratedRigidBodies; } -void b3GpuNarrowPhase::writeAllBodiesToGpu() +void b3GpuNarrowPhase::writeAllBodiesToGpu() { - if (m_data->m_localShapeAABBCPU->size()) { m_data->m_localShapeAABBGPU->copyFromHost(*m_data->m_localShapeAABBCPU); } - - + m_data->m_gpuChildShapes->copyFromHost(m_data->m_cpuChildShapes); m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces); m_data->m_convexPolyhedraGPU->copyFromHost(m_data->m_convexPolyhedra); @@ -1018,25 +929,21 @@ void b3GpuNarrowPhase::writeAllBodiesToGpu() m_data->m_treeNodesGPU->copyFromHost(m_data->m_treeNodesCPU); m_data->m_subTreesGPU->copyFromHost(m_data->m_subTreesCPU); - m_data->m_bodyBufferGPU->resize(m_data->m_numAcceleratedRigidBodies); m_data->m_inertiaBufferGPU->resize(m_data->m_numAcceleratedRigidBodies); - + if (m_data->m_numAcceleratedRigidBodies) { - 
m_data->m_bodyBufferGPU->copyFromHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies); - m_data->m_inertiaBufferGPU->copyFromHostPointer(&m_data->m_inertiaBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies); + m_data->m_bodyBufferGPU->copyFromHostPointer(&m_data->m_bodyBufferCPU->at(0), m_data->m_numAcceleratedRigidBodies); + m_data->m_inertiaBufferGPU->copyFromHostPointer(&m_data->m_inertiaBufferCPU->at(0), m_data->m_numAcceleratedRigidBodies); } - if (m_data->m_collidablesCPU.size()) + if (m_data->m_collidablesCPU.size()) { m_data->m_collidablesGPU->copyFromHost(m_data->m_collidablesCPU); } - - } - -void b3GpuNarrowPhase::reset() +void b3GpuNarrowPhase::reset() { m_data->m_numAcceleratedShapes = 0; m_data->m_numAcceleratedRigidBodies = 0; @@ -1053,21 +960,19 @@ void b3GpuNarrowPhase::reset() m_data->m_treeNodesCPU.resize(0); m_data->m_subTreesCPU.resize(0); m_data->m_bvhInfoCPU.resize(0); - } - -void b3GpuNarrowPhase::readbackAllBodiesToCpu() +void b3GpuNarrowPhase::readbackAllBodiesToCpu() { - m_data->m_bodyBufferGPU->copyToHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies); + m_data->m_bodyBufferGPU->copyToHostPointer(&m_data->m_bodyBufferCPU->at(0), m_data->m_numAcceleratedRigidBodies); } -void b3GpuNarrowPhase::setObjectTransformCpu(float* position, float* orientation , int bodyIndex) +void b3GpuNarrowPhase::setObjectTransformCpu(float* position, float* orientation, int bodyIndex) { - if (bodyIndex>=0 && bodyIndex<m_data->m_bodyBufferCPU->size()) + if (bodyIndex >= 0 && bodyIndex < m_data->m_bodyBufferCPU->size()) { - m_data->m_bodyBufferCPU->at(bodyIndex).m_pos=b3MakeVector3(position[0],position[1],position[2]); - m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]); + m_data->m_bodyBufferCPU->at(bodyIndex).m_pos = b3MakeVector3(position[0], position[1], position[2]); + 
m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.setValue(orientation[0], orientation[1], orientation[2], orientation[3]); } else { @@ -1076,24 +981,25 @@ void b3GpuNarrowPhase::setObjectTransformCpu(float* position, float* orientation } void b3GpuNarrowPhase::setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex) { - if (bodyIndex>=0 && bodyIndex<m_data->m_bodyBufferCPU->size()) + if (bodyIndex >= 0 && bodyIndex < m_data->m_bodyBufferCPU->size()) { - m_data->m_bodyBufferCPU->at(bodyIndex).m_linVel=b3MakeVector3(linVel[0],linVel[1],linVel[2]); - m_data->m_bodyBufferCPU->at(bodyIndex).m_angVel=b3MakeVector3(angVel[0],angVel[1],angVel[2]); - } else + m_data->m_bodyBufferCPU->at(bodyIndex).m_linVel = b3MakeVector3(linVel[0], linVel[1], linVel[2]); + m_data->m_bodyBufferCPU->at(bodyIndex).m_angVel = b3MakeVector3(angVel[0], angVel[1], angVel[2]); + } + else { b3Warning("setObjectVelocityCpu out of range.\n"); } } -bool b3GpuNarrowPhase::getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const +bool b3GpuNarrowPhase::getObjectTransformFromCpu(float* position, float* orientation, int bodyIndex) const { - if (bodyIndex>=0 && bodyIndex<m_data->m_bodyBufferCPU->size()) + if (bodyIndex >= 0 && bodyIndex < m_data->m_bodyBufferCPU->size()) { position[0] = m_data->m_bodyBufferCPU->at(bodyIndex).m_pos.x; position[1] = m_data->m_bodyBufferCPU->at(bodyIndex).m_pos.y; position[2] = m_data->m_bodyBufferCPU->at(bodyIndex).m_pos.z; - position[3] = 1.f;//or 1 + position[3] = 1.f; //or 1 orientation[0] = m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.x; orientation[1] = m_data->m_bodyBufferCPU->at(bodyIndex).m_quat.y; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h index 05ff3fd09e..21a68de343 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhase.h @@ -9,11 +9,10 @@ class 
b3GpuNarrowPhase { protected: - - struct b3GpuNarrowPhaseInternalData* m_data; + struct b3GpuNarrowPhaseInternalData* m_data; int m_acceleratedCompanionShapeIndex; int m_planeBodyIndex; - int m_static0Index; + int m_static0Index; cl_context m_context; cl_device_id m_device; @@ -23,64 +22,58 @@ protected: int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling); public: - - - - b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config); virtual ~b3GpuNarrowPhase(void); - int registerSphereShape(float radius); - int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant); + int registerSphereShape(float radius); + int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant); int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes); int registerFace(const b3Vector3& faceNormal, float faceConstant); - - int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling); - + + int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling); + //do they need to be merged? 
- - int registerConvexHullShape(b3ConvexUtility* utilPtr); - int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling); - int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu); - void setObjectTransform(const float* position, const float* orientation , int bodyIndex); + int registerConvexHullShape(b3ConvexUtility* utilPtr); + int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling); + + int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax, bool writeToGpu); + void setObjectTransform(const float* position, const float* orientation, int bodyIndex); - void writeAllBodiesToGpu(); - void reset(); - void readbackAllBodiesToCpu(); - bool getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const; + void writeAllBodiesToGpu(); + void reset(); + void readbackAllBodiesToCpu(); + bool getObjectTransformFromCpu(float* position, float* orientation, int bodyIndex) const; - void setObjectTransformCpu(float* position, float* orientation , int bodyIndex); + void setObjectTransformCpu(float* position, float* orientation, int bodyIndex); void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex); - virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects); - - cl_mem getBodiesGpu(); + cl_mem getBodiesGpu(); const struct b3RigidBodyData* getBodiesCpu() const; //struct b3RigidBodyData* getBodiesCpu(); - int getNumBodiesGpu() const; + int getNumBodiesGpu() const; - cl_mem getBodyInertiasGpu(); - int getNumBodyInertiasGpu() const; + cl_mem getBodyInertiasGpu(); + int getNumBodyInertiasGpu() const; - cl_mem getCollidablesGpu(); + cl_mem getCollidablesGpu(); const struct b3Collidable* 
getCollidablesCpu() const; - int getNumCollidablesGpu() const; + int getNumCollidablesGpu() const; const struct b3SapAabb* getLocalSpaceAabbsCpu() const; const struct b3Contact4* getContactsCPU() const; - cl_mem getContactsGpu(); - int getNumContactsGpu() const; + cl_mem getContactsGpu(); + int getNumContactsGpu() const; + + cl_mem getAabbLocalSpaceBufferGpu(); - cl_mem getAabbLocalSpaceBufferGpu(); - int getNumRigidBodies() const; int allocateCollidable(); @@ -92,18 +85,17 @@ public: b3Collidable& getCollidableCpu(int collidableIndex); const b3Collidable& getCollidableCpu(int collidableIndex) const; - const b3GpuNarrowPhaseInternalData* getInternalData() const + const b3GpuNarrowPhaseInternalData* getInternalData() const { - return m_data; + return m_data; } - b3GpuNarrowPhaseInternalData* getInternalData() + b3GpuNarrowPhaseInternalData* getInternalData() { - return m_data; + return m_data; } const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const; }; -#endif //B3_GPU_NARROWPHASE_H - +#endif //B3_GPU_NARROWPHASE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h index 8a7f1ea859..716a5ea0fc 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h @@ -20,57 +20,53 @@ #include "Bullet3Common/shared/b3Int4.h" #include "Bullet3Common/shared/b3Int2.h" - class b3ConvexUtility; struct b3GpuNarrowPhaseInternalData { b3AlignedObjectArray<b3ConvexUtility*>* m_convexData; - + b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra; b3AlignedObjectArray<b3Vector3> m_uniqueEdges; b3AlignedObjectArray<b3Vector3> m_convexVertices; b3AlignedObjectArray<int> m_convexIndices; - + b3OpenCLArray<b3ConvexPolyhedronData>* m_convexPolyhedraGPU; b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU; b3OpenCLArray<b3Vector3>* m_convexVerticesGPU; 
b3OpenCLArray<int>* m_convexIndicesGPU; - - b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU; - b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU; - b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU; - b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU; - b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU; - + + b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU; + b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU; + b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU; + b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU; + b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU; + b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes; - b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes; - + b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes; + b3AlignedObjectArray<b3GpuFace> m_convexFaces; b3OpenCLArray<b3GpuFace>* m_convexFacesGPU; - - struct GpuSatCollision* m_gpuSatCollision; - - - b3OpenCLArray<b3Int4>* m_triangleConvexPairs; - - + + struct GpuSatCollision* m_gpuSatCollision; + + b3OpenCLArray<b3Int4>* m_triangleConvexPairs; + b3OpenCLArray<b3Contact4>* m_pBufContactBuffersGPU[2]; - int m_currentContactBuffer; + int m_currentContactBuffer; b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU; - - + b3AlignedObjectArray<b3RigidBodyData>* m_bodyBufferCPU; b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU; - - b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU; - b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU; - + + b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU; + b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU; + int m_numAcceleratedShapes; int m_numAcceleratedRigidBodies; - - b3AlignedObjectArray<b3Collidable> m_collidablesCPU; - b3OpenCLArray<b3Collidable>* m_collidablesGPU; + + b3AlignedObjectArray<b3Collidable> m_collidablesCPU; + b3OpenCLArray<b3Collidable>* m_collidablesGPU; b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU; b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU; @@ -78,18 +74,16 @@ struct b3GpuNarrowPhaseInternalData b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData; 
b3AlignedObjectArray<class b3TriangleIndexVertexArray*> m_meshInterfaces; - b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU; - b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU; + b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU; + b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU; + + b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU; + b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU; - b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU; - b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU; - - b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU; - b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU; - + b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU; + b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU; - b3Config m_config; - + b3Config m_config; }; -#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H +#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp index 0d3d50c548..bd9d6bb04b 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.cpp @@ -14,11 +14,10 @@ subject to the following restrictions: */ //Originally written by Erwin Coumans - bool useGpuInitSolverBodies = true; bool useGpuInfo1 = true; -bool useGpuInfo2= true; -bool useGpuSolveJointConstraintRows=true; +bool useGpuInfo2 = true; +bool useGpuSolveJointConstraintRows = true; bool useGpuWriteBackVelocities = true; bool gpuBreakConstraints = true; @@ -29,27 +28,25 @@ bool gpuBreakConstraints = true; #include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" #include <new> #include "Bullet3Common/b3AlignedObjectArray.h" -#include <string.h> //for memset +#include <string.h> //for memset #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include 
"Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" -#include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h" //solveConstraintRowsCL +#include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h" //solveConstraintRowsCL #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #define B3_JOINT_SOLVER_PATH "src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl" - struct b3GpuPgsJacobiSolverInternalData { - cl_context m_context; cl_device_id m_device; cl_command_queue m_queue; - b3PrefixScanCL* m_prefixScan; + b3PrefixScanCL* m_prefixScan; cl_kernel m_solveJointConstraintRowsKernels; cl_kernel m_initSolverBodiesKernel; @@ -59,31 +56,27 @@ struct b3GpuPgsJacobiSolverInternalData cl_kernel m_writeBackVelocitiesKernel; cl_kernel m_breakViolatedConstraintsKernel; - b3OpenCLArray<unsigned int>* m_gpuConstraintRowOffsets; + b3OpenCLArray<unsigned int>* m_gpuConstraintRowOffsets; - b3OpenCLArray<b3GpuSolverBody>* m_gpuSolverBodies; - b3OpenCLArray<b3BatchConstraint>* m_gpuBatchConstraints; - b3OpenCLArray<b3GpuSolverConstraint>* m_gpuConstraintRows; - b3OpenCLArray<unsigned int>* m_gpuConstraintInfo1; + b3OpenCLArray<b3GpuSolverBody>* m_gpuSolverBodies; + b3OpenCLArray<b3BatchConstraint>* m_gpuBatchConstraints; + b3OpenCLArray<b3GpuSolverConstraint>* m_gpuConstraintRows; + b3OpenCLArray<unsigned int>* m_gpuConstraintInfo1; -// b3AlignedObjectArray<b3GpuSolverBody> m_cpuSolverBodies; - b3AlignedObjectArray<b3BatchConstraint> m_cpuBatchConstraints; - b3AlignedObjectArray<b3GpuSolverConstraint> m_cpuConstraintRows; - b3AlignedObjectArray<unsigned int> m_cpuConstraintInfo1; - b3AlignedObjectArray<unsigned int> m_cpuConstraintRowOffsets; + // b3AlignedObjectArray<b3GpuSolverBody> m_cpuSolverBodies; + b3AlignedObjectArray<b3BatchConstraint> m_cpuBatchConstraints; + b3AlignedObjectArray<b3GpuSolverConstraint> m_cpuConstraintRows; + b3AlignedObjectArray<unsigned int> m_cpuConstraintInfo1; + b3AlignedObjectArray<unsigned int> m_cpuConstraintRowOffsets; - b3AlignedObjectArray<b3RigidBodyData> 
m_cpuBodies; - b3AlignedObjectArray<b3InertiaData> m_cpuInertias; + b3AlignedObjectArray<b3RigidBodyData> m_cpuBodies; + b3AlignedObjectArray<b3InertiaData> m_cpuInertias; - b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints; - b3AlignedObjectArray<int> m_batchSizes; - - + b3AlignedObjectArray<int> m_batchSizes; }; - /* static b3Transform getWorldTransform(b3RigidBodyData* rb) { @@ -100,12 +93,12 @@ static const b3Matrix3x3& getInvInertiaTensorWorld(b3InertiaData* inertia) */ -static const b3Vector3& getLinearVelocity(b3RigidBodyData* rb) +static const b3Vector3& getLinearVelocity(b3RigidBodyData* rb) { return rb->m_linVel; } -static const b3Vector3& getAngularVelocity(b3RigidBodyData* rb) +static const b3Vector3& getAngularVelocity(b3RigidBodyData* rb) { return rb->m_angVel; } @@ -114,12 +107,9 @@ b3Vector3 getVelocityInLocalPoint(b3RigidBodyData* rb, const b3Vector3& rel_pos) { //we also calculate lin/ang velocity for kinematic objects return getLinearVelocity(rb) + getAngularVelocity(rb).cross(rel_pos); - } - - -b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs) +b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs) { m_usePgs = usePgs; m_gpuData = new b3GpuPgsJacobiSolverInternalData(); @@ -127,45 +117,40 @@ b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id m_gpuData->m_device = device; m_gpuData->m_queue = queue; - m_gpuData->m_prefixScan = new b3PrefixScanCL(ctx,device,queue); + m_gpuData->m_prefixScan = new b3PrefixScanCL(ctx, device, queue); - m_gpuData->m_gpuConstraintRowOffsets = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue); + m_gpuData->m_gpuConstraintRowOffsets = new b3OpenCLArray<unsigned int>(m_gpuData->m_context, m_gpuData->m_queue); - m_gpuData->m_gpuSolverBodies = new 
b3OpenCLArray<b3GpuSolverBody>(m_gpuData->m_context,m_gpuData->m_queue); - m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context,m_gpuData->m_queue); - m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3GpuSolverConstraint>(m_gpuData->m_context,m_gpuData->m_queue); - m_gpuData->m_gpuConstraintInfo1 = new b3OpenCLArray<unsigned int>(m_gpuData->m_context,m_gpuData->m_queue); - cl_int errNum=0; + m_gpuData->m_gpuSolverBodies = new b3OpenCLArray<b3GpuSolverBody>(m_gpuData->m_context, m_gpuData->m_queue); + m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context, m_gpuData->m_queue); + m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3GpuSolverConstraint>(m_gpuData->m_context, m_gpuData->m_queue); + m_gpuData->m_gpuConstraintInfo1 = new b3OpenCLArray<unsigned int>(m_gpuData->m_context, m_gpuData->m_queue); + cl_int errNum = 0; { - cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,&errNum,"",B3_JOINT_SOLVER_PATH); + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, &errNum, "", B3_JOINT_SOLVER_PATH); //cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,0,&errNum,"",B3_JOINT_SOLVER_PATH,true); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device,solveConstraintRowsCL, "solveJointConstraintRows",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_initSolverBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initSolverBodies",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_getInfo1Kernel = 
b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo1Kernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_initBatchConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"initBatchConstraintsKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_getInfo2Kernel= b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"getInfo2Kernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_writeBackVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"writeBackVelocitiesKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - m_gpuData->m_breakViolatedConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context,m_gpuData->m_device,solveConstraintRowsCL,"breakViolatedConstraintsKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); - - - + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "solveJointConstraintRows", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_initSolverBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "initSolverBodies", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_getInfo1Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "getInfo1Kernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_initBatchConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "initBatchConstraintsKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + 
m_gpuData->m_getInfo2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "getInfo2Kernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_writeBackVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "writeBackVelocitiesKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); + m_gpuData->m_breakViolatedConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "breakViolatedConstraintsKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); clReleaseProgram(prog); } - - } -b3GpuPgsConstraintSolver::~b3GpuPgsConstraintSolver () +b3GpuPgsConstraintSolver::~b3GpuPgsConstraintSolver() { clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels); clReleaseKernel(m_gpuData->m_initSolverBodiesKernel); @@ -195,16 +180,12 @@ struct b3BatchConstraint static b3AlignedObjectArray<b3BatchConstraint> batchConstraints; - -void b3GpuPgsConstraintSolver::recomputeBatches() +void b3GpuPgsConstraintSolver::recomputeBatches() { m_gpuData->m_batchSizes.clear(); } - - - -b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal) { B3_PROFILE("GPU solveGroupCacheFriendlySetup"); batchConstraints.resize(numConstraints); @@ -212,7 +193,6 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 m_staticIdx = -1; m_maxOverrideNumSolverIterations = 0; 
- /* m_gpuData->m_gpuBodies->resize(numBodies); m_gpuData->m_gpuBodies->copyFromHostPointer(bodies,numBodies); @@ -223,15 +203,13 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 m_gpuData->m_gpuSolverBodies->resize(numBodies); - m_tmpSolverBodyPool.resize(numBodies); { - if (useGpuInitSolverBodies) { B3_PROFILE("m_initSolverBodiesKernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initSolverBodiesKernel,"m_initSolverBodiesKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_initSolverBodiesKernel, "m_initSolverBodiesKernel"); launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setBuffer(gpuBodies->getBufferCL()); launcher.setConst(numBodies); @@ -239,48 +217,44 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 clFinish(m_gpuData->m_queue); // m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); - } else + } + else { gpuBodies->copyToHost(m_gpuData->m_cpuBodies); - for (int i=0;i<numBodies;i++) + for (int i = 0; i < numBodies; i++) { - b3RigidBodyData& body = m_gpuData->m_cpuBodies[i]; b3GpuSolverBody& solverBody = m_tmpSolverBodyPool[i]; - initSolverBody(i,&solverBody,&body); + initSolverBody(i, &solverBody, &body); solverBody.m_originalBodyIndex = i; } m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); } } -// int totalBodies = 0; + // int totalBodies = 0; int totalNumRows = 0; //b3RigidBody* rb0=0,*rb1=0; //if (1) { { - - // int i; m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints); // b3OpenCLArray<b3GpuGenericConstraint> gpuConstraints(m_gpuData->m_context,m_gpuData->m_queue); - if (useGpuInfo1) { B3_PROFILE("info1 and init batchConstraint"); - + m_gpuData->m_gpuConstraintInfo1->resize(numConstraints); - if (1) { B3_PROFILE("getInfo1Kernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo1Kernel,"m_getInfo1Kernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_getInfo1Kernel, 
"m_getInfo1Kernel"); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(gpuConstraints->getBufferCL()); launcher.setConst(numConstraints); @@ -288,19 +262,19 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 clFinish(m_gpuData->m_queue); } - if (m_gpuData->m_batchSizes.size()==0) + if (m_gpuData->m_batchSizes.size() == 0) { B3_PROFILE("initBatchConstraintsKernel"); m_gpuData->m_gpuConstraintRowOffsets->resize(numConstraints); - unsigned int total=0; - m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1,*m_gpuData->m_gpuConstraintRowOffsets,numConstraints,&total); - unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints-1); - totalNumRows = total+lastElem; + unsigned int total = 0; + m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1, *m_gpuData->m_gpuConstraintRowOffsets, numConstraints, &total); + unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints - 1); + totalNumRows = total + lastElem; { B3_PROFILE("init batch constraints"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_initBatchConstraintsKernel,"m_initBatchConstraintsKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_initBatchConstraintsKernel, "m_initBatchConstraintsKernel"); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); @@ -313,79 +287,74 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 //assume the batching happens on CPU, so copy the data m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); } - } + } else { - totalNumRows = 0; + totalNumRows = 0; gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints); //calculate the total number of contraint rows - for (int i=0;i<numConstraints;i++) + for (int i = 0; i < numConstraints; i++) { - 
unsigned int& info1= m_tmpConstraintSizesPool[i]; + unsigned int& info1 = m_tmpConstraintSizesPool[i]; // unsigned int info1; if (m_gpuData->m_cpuConstraints[i].isEnabled()) { - - m_gpuData->m_cpuConstraints[i].getInfo1(&info1,&m_gpuData->m_cpuBodies[0]); - } else + m_gpuData->m_cpuConstraints[i].getInfo1(&info1, &m_gpuData->m_cpuBodies[0]); + } + else { info1 = 0; } - + totalNumRows += info1; } m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool); - } m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows); m_gpuData->m_gpuConstraintRows->resize(totalNumRows); - + // b3GpuConstraintArray verify; if (useGpuInfo2) { { - B3_PROFILE("getInfo2Kernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_getInfo2Kernel,"m_getInfo2Kernel"); - launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); - launcher.setBuffer(gpuConstraints->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); - launcher.setBuffer(gpuBodies->getBufferCL()); - launcher.setBuffer(gpuInertias->getBufferCL()); - launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); - launcher.setConst(infoGlobal.m_timeStep); - launcher.setConst(infoGlobal.m_erp); - launcher.setConst(infoGlobal.m_globalCfm); - launcher.setConst(infoGlobal.m_damping); - launcher.setConst(infoGlobal.m_numIterations); - launcher.setConst(numConstraints); - launcher.launch1D(numConstraints); - clFinish(m_gpuData->m_queue); - - if (m_gpuData->m_batchSizes.size()==0) - m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); - //m_gpuData->m_gpuConstraintRows->copyToHost(verify); - //m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool); - - + B3_PROFILE("getInfo2Kernel"); + b3LauncherCL 
launcher(m_gpuData->m_queue, m_gpuData->m_getInfo2Kernel, "m_getInfo2Kernel"); + launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); + launcher.setBuffer(gpuConstraints->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); + launcher.setBuffer(gpuBodies->getBufferCL()); + launcher.setBuffer(gpuInertias->getBufferCL()); + launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); + launcher.setConst(infoGlobal.m_timeStep); + launcher.setConst(infoGlobal.m_erp); + launcher.setConst(infoGlobal.m_globalCfm); + launcher.setConst(infoGlobal.m_damping); + launcher.setConst(infoGlobal.m_numIterations); + launcher.setConst(numConstraints); + launcher.launch1D(numConstraints); + clFinish(m_gpuData->m_queue); - } - } + if (m_gpuData->m_batchSizes.size() == 0) + m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); + //m_gpuData->m_gpuConstraintRows->copyToHost(verify); + //m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool); + } + } else { - gpuInertias->copyToHost(m_gpuData->m_cpuInertias); - ///setup the b3SolverConstraints - - for (int i=0;i<numConstraints;i++) + ///setup the b3SolverConstraints + + for (int i = 0; i < numConstraints; i++) { const int& info1 = m_tmpConstraintSizesPool[i]; - + if (info1) { int constraintIndex = batchConstraints[i].m_originalConstraintIndex; @@ -394,15 +363,13 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 b3GpuSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[constraintRowOffset]; b3GpuGenericConstraint& constraint = m_gpuData->m_cpuConstraints[i]; - b3RigidBodyData& rbA = m_gpuData->m_cpuBodies[ constraint.getRigidBodyA()]; + b3RigidBodyData& rbA = m_gpuData->m_cpuBodies[constraint.getRigidBodyA()]; //b3RigidBody& rbA = 
constraint.getRigidBodyA(); - // b3RigidBody& rbB = constraint.getRigidBodyB(); - b3RigidBodyData& rbB = m_gpuData->m_cpuBodies[ constraint.getRigidBodyB()]; - - + // b3RigidBody& rbB = constraint.getRigidBodyB(); + b3RigidBodyData& rbB = m_gpuData->m_cpuBodies[constraint.getRigidBodyB()]; - int solverBodyIdA = constraint.getRigidBodyA();//getOrInitSolverBody(constraint.getRigidBodyA(),bodies,inertias); - int solverBodyIdB = constraint.getRigidBodyB();//getOrInitSolverBody(constraint.getRigidBodyB(),bodies,inertias); + int solverBodyIdA = constraint.getRigidBodyA(); //getOrInitSolverBody(constraint.getRigidBodyA(),bodies,inertias); + int solverBodyIdB = constraint.getRigidBodyB(); //getOrInitSolverBody(constraint.getRigidBodyB(),bodies,inertias); b3GpuSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA]; b3GpuSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB]; @@ -410,7 +377,8 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 if (rbA.m_invMass) { batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA; - } else + } + else { if (!solverBodyIdA) m_staticIdx = 0; @@ -420,29 +388,28 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 if (rbB.m_invMass) { batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB; - } else + } + else { if (!solverBodyIdB) m_staticIdx = 0; batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB; } - - int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; - if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) + int overrideNumSolverIterations = 0; //constraint->getOverrideNumSolverIterations() > 0 ? 
constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; + if (overrideNumSolverIterations > m_maxOverrideNumSolverIterations) m_maxOverrideNumSolverIterations = overrideNumSolverIterations; - int j; - for ( j=0;j<info1;j++) + for (j = 0; j < info1; j++) { - memset(&currentConstraintRow[j],0,sizeof(b3GpuSolverConstraint)); - currentConstraintRow[j].m_angularComponentA.setValue(0,0,0); - currentConstraintRow[j].m_angularComponentB.setValue(0,0,0); + memset(&currentConstraintRow[j], 0, sizeof(b3GpuSolverConstraint)); + currentConstraintRow[j].m_angularComponentA.setValue(0, 0, 0); + currentConstraintRow[j].m_angularComponentB.setValue(0, 0, 0); currentConstraintRow[j].m_appliedImpulse = 0.f; currentConstraintRow[j].m_appliedPushImpulse = 0.f; currentConstraintRow[j].m_cfm = 0.f; - currentConstraintRow[j].m_contactNormal.setValue(0,0,0); + currentConstraintRow[j].m_contactNormal.setValue(0, 0, 0); currentConstraintRow[j].m_friction = 0.f; currentConstraintRow[j].m_frictionIndex = 0; currentConstraintRow[j].m_jacDiagABInv = 0.f; @@ -451,13 +418,13 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 currentConstraintRow[j].m_originalContactPoint = 0; currentConstraintRow[j].m_overrideNumSolverIterations = 0; - currentConstraintRow[j].m_relpos1CrossNormal.setValue(0,0,0); - currentConstraintRow[j].m_relpos2CrossNormal.setValue(0,0,0); + currentConstraintRow[j].m_relpos1CrossNormal.setValue(0, 0, 0); + currentConstraintRow[j].m_relpos2CrossNormal.setValue(0, 0, 0); currentConstraintRow[j].m_rhs = 0.f; currentConstraintRow[j].m_rhsPenetration = 0.f; currentConstraintRow[j].m_solverBodyIdA = 0; currentConstraintRow[j].m_solverBodyIdB = 0; - + currentConstraintRow[j].m_lowerLimit = -B3_INFINITY; currentConstraintRow[j].m_upperLimit = B3_INFINITY; currentConstraintRow[j].m_appliedImpulse = 0.f; @@ -467,26 +434,25 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 
currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; } - bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); - bodyAPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetPushVelocity().setValue(0.f,0.f,0.f); - bodyBPtr->internalGetTurnVelocity().setValue(0.f,0.f,0.f); - + bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f); + bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f); + bodyAPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f); + bodyAPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f); + bodyBPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f); b3GpuConstraintInfo2 info2; - info2.fps = 1.f/infoGlobal.m_timeStep; + info2.fps = 1.f / infoGlobal.m_timeStep; info2.erp = infoGlobal.m_erp; info2.m_J1linearAxis = currentConstraintRow->m_contactNormal; info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal; info2.m_J2linearAxis = 0; info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal; - info2.rowskip = sizeof(b3GpuSolverConstraint)/sizeof(b3Scalar);//check this + info2.rowskip = sizeof(b3GpuSolverConstraint) / sizeof(b3Scalar); //check this ///the size of b3GpuSolverConstraint needs be a multiple of b3Scalar - b3Assert(info2.rowskip*sizeof(b3Scalar)== sizeof(b3GpuSolverConstraint)); + b3Assert(info2.rowskip * sizeof(b3Scalar) == sizeof(b3GpuSolverConstraint)); info2.m_constraintError = &currentConstraintRow->m_rhs; currentConstraintRow->m_cfm = infoGlobal.m_globalCfm; 
info2.m_damping = infoGlobal.m_damping; @@ -494,47 +460,45 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit; info2.m_upperLimit = &currentConstraintRow->m_upperLimit; info2.m_numIterations = infoGlobal.m_numIterations; - m_gpuData->m_cpuConstraints[i].getInfo2(&info2,&m_gpuData->m_cpuBodies[0]); + m_gpuData->m_cpuConstraints[i].getInfo2(&info2, &m_gpuData->m_cpuBodies[0]); ///finalize the constraint setup - for ( j=0;j<info1;j++) + for (j = 0; j < info1; j++) { b3GpuSolverConstraint& solverConstraint = currentConstraintRow[j]; - if (solverConstraint.m_upperLimit>=m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) + if (solverConstraint.m_upperLimit >= m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) { solverConstraint.m_upperLimit = m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold(); } - if (solverConstraint.m_lowerLimit<=-m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) + if (solverConstraint.m_lowerLimit <= -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold()) { solverConstraint.m_lowerLimit = -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold(); } - // solverConstraint.m_originalContactPoint = constraint; - - b3Matrix3x3& invInertiaWorldA= m_gpuData->m_cpuInertias[constraint.getRigidBodyA()].m_invInertiaWorld; - { + // solverConstraint.m_originalContactPoint = constraint; + b3Matrix3x3& invInertiaWorldA = m_gpuData->m_cpuInertias[constraint.getRigidBodyA()].m_invInertiaWorld; + { //b3Vector3 angularFactorA(1,1,1); const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal; - solverConstraint.m_angularComponentA = invInertiaWorldA*ftorqueAxis1;//*angularFactorA; + solverConstraint.m_angularComponentA = invInertiaWorldA * ftorqueAxis1; //*angularFactorA; } - - b3Matrix3x3& invInertiaWorldB= m_gpuData->m_cpuInertias[constraint.getRigidBodyB()].m_invInertiaWorld; - { + b3Matrix3x3& invInertiaWorldB = 
m_gpuData->m_cpuInertias[constraint.getRigidBodyB()].m_invInertiaWorld; + { const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal; - solverConstraint.m_angularComponentB = invInertiaWorldB*ftorqueAxis2;//*constraint.getRigidBodyB().getAngularFactor(); + solverConstraint.m_angularComponentB = invInertiaWorldB * ftorqueAxis2; //*constraint.getRigidBodyB().getAngularFactor(); } { //it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal //because it gets multiplied iMJlB - b3Vector3 iMJlA = solverConstraint.m_contactNormal*rbA.m_invMass; - b3Vector3 iMJaA = invInertiaWorldA*solverConstraint.m_relpos1CrossNormal; - b3Vector3 iMJlB = solverConstraint.m_contactNormal*rbB.m_invMass;//sign of normal? - b3Vector3 iMJaB = invInertiaWorldB*solverConstraint.m_relpos2CrossNormal; + b3Vector3 iMJlA = solverConstraint.m_contactNormal * rbA.m_invMass; + b3Vector3 iMJaA = invInertiaWorldA * solverConstraint.m_relpos1CrossNormal; + b3Vector3 iMJlB = solverConstraint.m_contactNormal * rbB.m_invMass; //sign of normal? + b3Vector3 iMJaB = invInertiaWorldB * solverConstraint.m_relpos2CrossNormal; b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal); sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal); @@ -542,10 +506,9 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal); b3Scalar fsum = b3Fabs(sum); b3Assert(fsum > B3_EPSILON); - solverConstraint.m_jacDiagABInv = fsum>B3_EPSILON?b3Scalar(1.)/sum : 0.f; + solverConstraint.m_jacDiagABInv = fsum > B3_EPSILON ? b3Scalar(1.) 
/ sum : 0.f; } - ///fix rhs ///todo: add force/torque accelerators { @@ -553,94 +516,80 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3 b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel); b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel); - rel_vel = vel1Dotn+vel2Dotn; + rel_vel = vel1Dotn + vel2Dotn; b3Scalar restitution = 0.f; - b3Scalar positionalError = solverConstraint.m_rhs;//already filled in by getConstraintInfo2 - b3Scalar velocityError = restitution - rel_vel * info2.m_damping; - b3Scalar penetrationImpulse = positionalError*solverConstraint.m_jacDiagABInv; - b3Scalar velocityImpulse = velocityError *solverConstraint.m_jacDiagABInv; - solverConstraint.m_rhs = penetrationImpulse+velocityImpulse; + b3Scalar positionalError = solverConstraint.m_rhs; //already filled in by getConstraintInfo2 + b3Scalar velocityError = restitution - rel_vel * info2.m_damping; + b3Scalar penetrationImpulse = positionalError * solverConstraint.m_jacDiagABInv; + b3Scalar velocityImpulse = velocityError * solverConstraint.m_jacDiagABInv; + solverConstraint.m_rhs = penetrationImpulse + velocityImpulse; solverConstraint.m_appliedImpulse = 0.f; - } } - } } - - m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool); m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool); - if (m_gpuData->m_batchSizes.size()==0) + if (m_gpuData->m_batchSizes.size() == 0) m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); else m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool); - - - }//end useGpuInfo2 - - + } //end useGpuInfo2 } #ifdef B3_SUPPORT_CONTACT_CONSTRAINTS { int i; - for (i=0;i<numManifolds;i++) + for (i = 0; i < numManifolds; i++) { b3Contact4& manifold = 
manifoldPtr[i]; - convertContact(bodies,inertias,&manifold,infoGlobal); + convertContact(bodies, inertias, &manifold, infoGlobal); } } -#endif //B3_SUPPORT_CONTACT_CONSTRAINTS +#endif //B3_SUPPORT_CONTACT_CONSTRAINTS } -// b3ContactSolverInfo info = infoGlobal; - - -// int numNonContactPool = m_tmpSolverNonContactConstraintPool.size(); -// int numConstraintPool = m_tmpSolverContactConstraintPool.size(); -// int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size(); + // b3ContactSolverInfo info = infoGlobal; + // int numNonContactPool = m_tmpSolverNonContactConstraintPool.size(); + // int numConstraintPool = m_tmpSolverContactConstraintPool.size(); + // int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size(); return 0.f; - } - - ///a straight copy from GPU/OpenCL kernel, for debugging -__inline void internalApplyImpulse( b3GpuSolverBody* body, const b3Vector3& linearComponent, const b3Vector3& angularComponent,float impulseMagnitude) +__inline void internalApplyImpulse(b3GpuSolverBody* body, const b3Vector3& linearComponent, const b3Vector3& angularComponent, float impulseMagnitude) { - body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; - body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); + body->m_deltaLinearVelocity += linearComponent * impulseMagnitude * body->m_linearFactor; + body->m_deltaAngularVelocity += angularComponent * (impulseMagnitude * body->m_angularFactor); } - -void resolveSingleConstraintRowGeneric2( b3GpuSolverBody* body1, b3GpuSolverBody* body2, b3GpuSolverConstraint* c) +void resolveSingleConstraintRowGeneric2(b3GpuSolverBody* body1, b3GpuSolverBody* body2, b3GpuSolverConstraint* c) { - float deltaImpulse = c->m_rhs-b3Scalar(c->m_appliedImpulse)*c->m_cfm; - float deltaVel1Dotn = b3Dot(c->m_contactNormal,body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); - float deltaVel2Dotn = 
-b3Dot(c->m_contactNormal,body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); + float deltaImpulse = c->m_rhs - b3Scalar(c->m_appliedImpulse) * c->m_cfm; + float deltaVel1Dotn = b3Dot(c->m_contactNormal, body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal, body1->m_deltaAngularVelocity); + float deltaVel2Dotn = -b3Dot(c->m_contactNormal, body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal, body2->m_deltaAngularVelocity); - deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv; - deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv; + deltaImpulse -= deltaVel1Dotn * c->m_jacDiagABInv; + deltaImpulse -= deltaVel2Dotn * c->m_jacDiagABInv; float sum = b3Scalar(c->m_appliedImpulse) + deltaImpulse; if (sum < c->m_lowerLimit) { - deltaImpulse = c->m_lowerLimit-b3Scalar(c->m_appliedImpulse); + deltaImpulse = c->m_lowerLimit - b3Scalar(c->m_appliedImpulse); c->m_appliedImpulse = c->m_lowerLimit; } - else if (sum > c->m_upperLimit) + else if (sum > c->m_upperLimit) { - deltaImpulse = c->m_upperLimit-b3Scalar(c->m_appliedImpulse); + deltaImpulse = c->m_upperLimit - b3Scalar(c->m_appliedImpulse); c->m_appliedImpulse = c->m_upperLimit; } else @@ -648,64 +597,56 @@ void resolveSingleConstraintRowGeneric2( b3GpuSolverBody* body1, b3GpuSolverBod c->m_appliedImpulse = sum; } - internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); - internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); - + internalApplyImpulse(body1, c->m_contactNormal * body1->m_invMass, c->m_angularComponentA, deltaImpulse); + internalApplyImpulse(body2, -c->m_contactNormal * body2->m_invMass, c->m_angularComponentB, deltaImpulse); } - - -void b3GpuPgsConstraintSolver::initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb) +void b3GpuPgsConstraintSolver::initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb) { - - 
solverBody->m_deltaLinearVelocity.setValue(0.f,0.f,0.f); - solverBody->m_deltaAngularVelocity.setValue(0.f,0.f,0.f); - solverBody->internalGetPushVelocity().setValue(0.f,0.f,0.f); - solverBody->internalGetTurnVelocity().setValue(0.f,0.f,0.f); + solverBody->m_deltaLinearVelocity.setValue(0.f, 0.f, 0.f); + solverBody->m_deltaAngularVelocity.setValue(0.f, 0.f, 0.f); + solverBody->internalGetPushVelocity().setValue(0.f, 0.f, 0.f); + solverBody->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f); b3Assert(rb); -// solverBody->m_worldTransform = getWorldTransform(rb); - solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass,rb->m_invMass,rb->m_invMass)); + // solverBody->m_worldTransform = getWorldTransform(rb); + solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass, rb->m_invMass, rb->m_invMass)); solverBody->m_originalBodyIndex = bodyIndex; - solverBody->m_angularFactor = b3MakeVector3(1,1,1); - solverBody->m_linearFactor = b3MakeVector3(1,1,1); + solverBody->m_angularFactor = b3MakeVector3(1, 1, 1); + solverBody->m_linearFactor = b3MakeVector3(1, 1, 1); solverBody->m_linearVelocity = getLinearVelocity(rb); solverBody->m_angularVelocity = getAngularVelocity(rb); } - -void b3GpuPgsConstraintSolver::averageVelocities() +void b3GpuPgsConstraintSolver::averageVelocities() { } - -b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal) { //only create the batches once. 
//@todo: incrementally update batches when constraints are added/activated and/or removed/deactivated B3_PROFILE("GpuSolveGroupCacheFriendlyIterations"); - bool createBatches = m_gpuData->m_batchSizes.size()==0; + bool createBatches = m_gpuData->m_batchSizes.size() == 0; { - if (createBatches) { - m_gpuData->m_batchSizes.resize(0); { m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints); B3_PROFILE("batch joints"); - b3Assert(batchConstraints.size()==numConstraints); - int simdWidth =numConstraints+1; + b3Assert(batchConstraints.size() == numConstraints); + int simdWidth = numConstraints + 1; int numBodies = m_tmpSolverBodyPool.size(); - sortConstraintByBatch3( &batchConstraints[0], numConstraints, simdWidth , m_staticIdx, numBodies); + sortConstraintByBatch3(&batchConstraints[0], numConstraints, simdWidth, m_staticIdx, numBodies); m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints); - } - } else + } + else { /*b3AlignedObjectArray<b3BatchConstraint> cpuCheckBatches; m_gpuData->m_gpuBatchConstraints->copyToHost(cpuCheckBatches); @@ -715,12 +656,11 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr //>copyFromHost(batchConstraints); } int maxIterations = infoGlobal.m_numIterations; - + bool useBatching = true; - if (useBatching ) + if (useBatching) { - if (!useGpuSolveJointConstraintRows) { B3_PROFILE("copy to host"); @@ -730,24 +670,21 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1); m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets); gpuConstraints1->copyToHost(m_gpuData->m_cpuConstraints); - } - for ( int iteration = 0 ; iteration< maxIterations ; iteration++) + for (int iteration = 0; iteration < maxIterations; iteration++) { - int batchOffset = 0; - int constraintOffset=0; + int constraintOffset = 0; int numBatches = m_gpuData->m_batchSizes.size(); - for (int 
bb=0;bb<numBatches;bb++) + for (int bb = 0; bb < numBatches; bb++) { int numConstraintsInBatch = m_gpuData->m_batchSizes[bb]; - if (useGpuSolveJointConstraintRows) { B3_PROFILE("solveJointConstraintRowsKernels"); - + /* __kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies, __global b3BatchConstraint* batchConstraints, @@ -758,53 +695,48 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr int batchOffset, int numConstraintsInBatch*/ - - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_solveJointConstraintRowsKernels,"m_solveJointConstraintRowsKernels"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_solveJointConstraintRowsKernels, "m_solveJointConstraintRowsKernels"); launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); - launcher.setBuffer(gpuConstraints1->getBufferCL());//to detect disabled constraints + launcher.setBuffer(gpuConstraints1->getBufferCL()); //to detect disabled constraints launcher.setConst(batchOffset); launcher.setConst(numConstraintsInBatch); launcher.launch1D(numConstraintsInBatch); - - - } else//useGpu + } + else //useGpu { - - - - for (int b=0;b<numConstraintsInBatch;b++) + for (int b = 0; b < numConstraintsInBatch; b++) { - const b3BatchConstraint& c = batchConstraints[batchOffset+b]; + const b3BatchConstraint& c = batchConstraints[batchOffset + b]; /*printf("-----------\n"); printf("bb=%d\n",bb); printf("c.batchId = %d\n", c.m_batchId); */ - b3Assert(c.m_batchId==bb); + b3Assert(c.m_batchId == bb); b3GpuGenericConstraint* constraint = &m_gpuData->m_cpuConstraints[c.m_originalConstraintIndex]; - if (constraint->m_flags&B3_CONSTRAINT_FLAG_ENABLED) + if (constraint->m_flags & 
B3_CONSTRAINT_FLAG_ENABLED) { int numConstraintRows = m_gpuData->m_cpuConstraintInfo1[c.m_originalConstraintIndex]; int constraintOffset = m_gpuData->m_cpuConstraintRowOffsets[c.m_originalConstraintIndex]; - - for (int jj=0;jj<numConstraintRows;jj++) + + for (int jj = 0; jj < numConstraintRows; jj++) { - // - b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[constraintOffset+jj]; + // + b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[constraintOffset + jj]; //resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint); - resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint); + resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA], &m_tmpSolverBodyPool[constraint.m_solverBodyIdB], &constraint); } } } - }//useGpu - batchOffset+=numConstraintsInBatch; - constraintOffset+=numConstraintsInBatch; + } //useGpu + batchOffset += numConstraintsInBatch; + constraintOffset += numConstraintsInBatch; } - }//for (int iteration... + } //for (int iteration... 
if (!useGpuSolveJointConstraintRows) { @@ -820,20 +752,16 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr } //int sz = sizeof(b3GpuSolverBody); //printf("cpu sizeof(b3GpuSolverBody)=%d\n",sz); - - - - - - } else + } + else { - for ( int iteration = 0 ; iteration< maxIterations ; iteration++) - { - int numJoints = m_tmpSolverNonContactConstraintPool.size(); - for (int j=0;j<numJoints;j++) + for (int iteration = 0; iteration < maxIterations; iteration++) + { + int numJoints = m_tmpSolverNonContactConstraintPool.size(); + for (int j = 0; j < numJoints; j++) { b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[j]; - resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA],&m_tmpSolverBodyPool[constraint.m_solverBodyIdB],&constraint); + resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA], &m_tmpSolverBodyPool[constraint.m_solverBodyIdB], &constraint); } if (!m_usePgs) @@ -842,212 +770,198 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArr } } } - } clFinish(m_gpuData->m_queue); return 0.f; } - - - static b3AlignedObjectArray<int> bodyUsed; static b3AlignedObjectArray<int> curUsed; - - -inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3( b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) +inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3(b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies) { //int sz = sizeof(b3BatchConstraint); B3_PROFILE("sortConstraintByBatch3"); - + static int maxSwaps = 0; int numSwaps = 0; - curUsed.resize(2*simdWidth); + curUsed.resize(2 * simdWidth); static int maxNumConstraints = 0; - if (maxNumConstraints<numConstraints) + if (maxNumConstraints < numConstraints) { maxNumConstraints = numConstraints; //printf("maxNumConstraints = %d\n",maxNumConstraints ); } - int numUsedArray = numBodies/32+1; + int 
numUsedArray = numBodies / 32 + 1; bodyUsed.resize(numUsedArray); - for (int q=0;q<numUsedArray;q++) - bodyUsed[q]=0; + for (int q = 0; q < numUsedArray; q++) + bodyUsed[q] = 0; - int curBodyUsed = 0; int numIter = 0; - - + #if defined(_DEBUG) - for(int i=0; i<numConstraints; i++) + for (int i = 0; i < numConstraints; i++) cs[i].m_batchId = -1; #endif - + int numValidConstraints = 0; -// int unprocessedConstraintIndex = 0; + // int unprocessedConstraintIndex = 0; int batchIdx = 0; - { B3_PROFILE("cpu batch innerloop"); - - while( numValidConstraints < numConstraints) + + while (numValidConstraints < numConstraints) { numIter++; int nCurrentBatch = 0; // clear flag - for(int i=0; i<curBodyUsed; i++) - bodyUsed[curUsed[i]/32] = 0; + for (int i = 0; i < curBodyUsed; i++) + bodyUsed[curUsed[i] / 32] = 0; - curBodyUsed = 0; + curBodyUsed = 0; - for(int i=numValidConstraints; i<numConstraints; i++) + for (int i = numValidConstraints; i < numConstraints; i++) { int idx = i; - b3Assert( idx < numConstraints ); + b3Assert(idx < numConstraints); // check if it can go int bodyAS = cs[idx].m_bodyAPtrAndSignBit; int bodyBS = cs[idx].m_bodyBPtrAndSignBit; int bodyA = abs(bodyAS); int bodyB = abs(bodyBS); - bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; - bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx; + bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx; int aUnavailable = 0; int bUnavailable = 0; if (!aIsStatic) { - aUnavailable = bodyUsed[ bodyA/32 ] & (1<<(bodyA&31)); + aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31)); } if (!aUnavailable) - if (!bIsStatic) - { - bUnavailable = bodyUsed[ bodyB/32 ] & (1<<(bodyB&31)); - } - - if( aUnavailable==0 && bUnavailable==0 ) // ok + if (!bIsStatic) + { + bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31)); + } + + if (aUnavailable == 0 && bUnavailable == 0) // ok { if (!aIsStatic) { - bodyUsed[ bodyA/32 ] |= (1<<(bodyA&31)); - curUsed[curBodyUsed++]=bodyA; + 
bodyUsed[bodyA / 32] |= (1 << (bodyA & 31)); + curUsed[curBodyUsed++] = bodyA; } if (!bIsStatic) { - bodyUsed[ bodyB/32 ] |= (1<<(bodyB&31)); - curUsed[curBodyUsed++]=bodyB; + bodyUsed[bodyB / 32] |= (1 << (bodyB & 31)); + curUsed[curBodyUsed++] = bodyB; } cs[idx].m_batchId = batchIdx; - if (i!=numValidConstraints) + if (i != numValidConstraints) { - b3Swap(cs[i],cs[numValidConstraints]); + b3Swap(cs[i], cs[numValidConstraints]); numSwaps++; } numValidConstraints++; { nCurrentBatch++; - if( nCurrentBatch == simdWidth ) + if (nCurrentBatch == simdWidth) { nCurrentBatch = 0; - for(int i=0; i<curBodyUsed; i++) - bodyUsed[curUsed[i]/32] = 0; + for (int i = 0; i < curBodyUsed; i++) + bodyUsed[curUsed[i] / 32] = 0; curBodyUsed = 0; } } } } m_gpuData->m_batchSizes.push_back(nCurrentBatch); - batchIdx ++; + batchIdx++; } } - + #if defined(_DEBUG) - // debugPrintf( "nBatches: %d\n", batchIdx ); - for(int i=0; i<numConstraints; i++) - { - b3Assert( cs[i].m_batchId != -1 ); - } + // debugPrintf( "nBatches: %d\n", batchIdx ); + for (int i = 0; i < numConstraints; i++) + { + b3Assert(cs[i].m_batchId != -1); + } #endif - if (maxSwaps<numSwaps) + if (maxSwaps < numSwaps) { maxSwaps = numSwaps; //printf("maxSwaps = %d\n", maxSwaps); } - + return batchIdx; } - /// b3PgsJacobiSolver Sequentially applies impulses -b3Scalar b3GpuPgsConstraintSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, - int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints, const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal) { - B3_PROFILE("solveJoints"); //you need to provide at least some bodies - - solveGroupCacheFriendlySetup( gpuBodies, gpuInertias,numBodies,gpuConstraints, 
numConstraints,infoGlobal); - solveGroupCacheFriendlyIterations(gpuConstraints, numConstraints,infoGlobal); + solveGroupCacheFriendlySetup(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal); + + solveGroupCacheFriendlyIterations(gpuConstraints, numConstraints, infoGlobal); + + solveGroupCacheFriendlyFinish(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal); - solveGroupCacheFriendlyFinish(gpuBodies, gpuInertias,numBodies, gpuConstraints, numConstraints, infoGlobal); - return 0.f; } -void b3GpuPgsConstraintSolver::solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, - int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints) +void b3GpuPgsConstraintSolver::solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints) { b3ContactSolverInfo infoGlobal; infoGlobal.m_splitImpulse = false; - infoGlobal.m_timeStep = 1.f/60.f; - infoGlobal.m_numIterations = 4;//4; -// infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS|B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION; + infoGlobal.m_timeStep = 1.f / 60.f; + infoGlobal.m_numIterations = 4; //4; + // infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS|B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION; //infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS; - infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS; + infoGlobal.m_solverMode |= B3_SOLVER_USE_2_FRICTION_DIRECTIONS; //if (infoGlobal.m_solverMode & B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS) //if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) && (infoGlobal.m_solverMode & 
B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION)) - - - solveGroup(gpuBodies,gpuInertias,numBodies,gpuConstraints,numConstraints,infoGlobal); + solveGroup(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal); } //b3AlignedObjectArray<b3RigidBodyData> testBodies; - -b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal) +b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal) { B3_PROFILE("solveGroupCacheFriendlyFinish"); -// int numPoolConstraints = m_tmpSolverContactConstraintPool.size(); -// int i,j; - + // int numPoolConstraints = m_tmpSolverContactConstraintPool.size(); + // int i,j; { if (gpuBreakConstraints) { B3_PROFILE("breakViolatedConstraintsKernel"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_breakViolatedConstraintsKernel,"m_breakViolatedConstraintsKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_breakViolatedConstraintsKernel, "m_breakViolatedConstraintsKernel"); launcher.setBuffer(gpuConstraints->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL()); launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL()); launcher.setConst(numConstraints); launcher.launch1D(numConstraints); - } else + } + else { gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints); m_gpuData->m_gpuBatchConstraints->copyToHost(m_gpuData->m_cpuBatchConstraints); @@ -1056,31 +970,28 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b 
m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1); m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets); - for (int cid=0;cid<numConstraints;cid++) + for (int cid = 0; cid < numConstraints; cid++) { int originalConstraintIndex = batchConstraints[cid].m_originalConstraintIndex; int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[originalConstraintIndex]; int numRows = m_gpuData->m_cpuConstraintInfo1[originalConstraintIndex]; if (numRows) { - - // printf("cid=%d, breakingThreshold =%f\n",cid,breakingThreshold); - for (int i=0;i<numRows;i++) + // printf("cid=%d, breakingThreshold =%f\n",cid,breakingThreshold); + for (int i = 0; i < numRows; i++) { - int rowIndex =constraintRowOffset+i; + int rowIndex = constraintRowOffset + i; int orgConstraintIndex = m_gpuData->m_cpuConstraintRows[rowIndex].m_originalConstraintIndex; float breakingThreshold = m_gpuData->m_cpuConstraints[orgConstraintIndex].m_breakingImpulseThreshold; - // printf("rows[%d].m_appliedImpulse=%f\n",rowIndex,rows[rowIndex].m_appliedImpulse); + // printf("rows[%d].m_appliedImpulse=%f\n",rowIndex,rows[rowIndex].m_appliedImpulse); if (b3Fabs(m_gpuData->m_cpuConstraintRows[rowIndex].m_appliedImpulse) >= breakingThreshold) { - - m_gpuData->m_cpuConstraints[orgConstraintIndex].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED; + m_gpuData->m_cpuConstraints[orgConstraintIndex].m_flags = 0; //&= ~B3_CONSTRAINT_FLAG_ENABLED; } } } } - gpuConstraints->copyFromHost(m_gpuData->m_cpuConstraints); } } @@ -1090,28 +1001,27 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b { B3_PROFILE("GPU write back velocities and transforms"); - b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_writeBackVelocitiesKernel,"m_writeBackVelocitiesKernel"); + b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_writeBackVelocitiesKernel, "m_writeBackVelocitiesKernel"); launcher.setBuffer(gpuBodies->getBufferCL()); 
launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL()); launcher.setConst(numBodies); launcher.launch1D(numBodies); clFinish(m_gpuData->m_queue); -// m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); -// m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies); + // m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); + // m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies); //m_gpuData->m_gpuBodies->copyToHost(testBodies); - - } + } else { B3_PROFILE("CPU write back velocities and transforms"); m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool); gpuBodies->copyToHost(m_gpuData->m_cpuBodies); - for ( int i=0;i<m_tmpSolverBodyPool.size();i++) + for (int i = 0; i < m_tmpSolverBodyPool.size(); i++) { int bodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex; //printf("bodyIndex=%d\n",bodyIndex); - b3Assert(i==bodyIndex); + b3Assert(i == bodyIndex); b3RigidBodyData* body = &m_gpuData->m_cpuBodies[bodyIndex]; if (body->m_invMass) @@ -1125,11 +1035,12 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b { body->m_linVel = m_tmpSolverBodyPool[i].m_linearVelocity; body->m_angVel = m_tmpSolverBodyPool[i].m_angularVelocity; - } else + } + else { b3Assert(0); } - /* + /* if (infoGlobal.m_splitImpulse) { body->m_pos = m_tmpSolverBodyPool[i].m_worldTransform.getOrigin(); @@ -1139,10 +1050,9 @@ b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b } */ } - }//for + } //for gpuBodies->copyFromHost(m_gpuData->m_cpuBodies); - } } diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h index ec0e3f73d6..00bc544f02 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsConstraintSolver.h @@ -19,7 +19,6 @@ subject to the following restrictions: struct b3Contact4; struct b3ContactPoint; - class 
b3Dispatcher; #include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" @@ -38,41 +37,40 @@ class b3GpuPgsConstraintSolver protected: int m_staticIdx; struct b3GpuPgsJacobiSolverInternalData* m_gpuData; - protected: - b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool; - b3GpuConstraintArray m_tmpSolverContactConstraintPool; - b3GpuConstraintArray m_tmpSolverNonContactConstraintPool; - b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool; - b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool; + +protected: + b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool; + b3GpuConstraintArray m_tmpSolverContactConstraintPool; + b3GpuConstraintArray m_tmpSolverNonContactConstraintPool; + b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool; + b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool; b3AlignedObjectArray<unsigned int> m_tmpConstraintSizesPool; - - bool m_usePgs; - void averageVelocities(); + bool m_usePgs; + void averageVelocities(); - int m_maxOverrideNumSolverIterations; + int m_maxOverrideNumSolverIterations; - int m_numSplitImpulseRecoveries; + int m_numSplitImpulseRecoveries; -// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias); - void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb); + // int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias); + void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb); public: - b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs); - virtual~b3GpuPgsConstraintSolver (); - - virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal); - virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int 
numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); - b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); + b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs); + virtual ~b3GpuPgsConstraintSolver(); + virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal); + virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal); + b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal); - b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal); - void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, - int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints); + b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal); + void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, + 
int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints); - int sortConstraintByBatch3( struct b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies); - void recomputeBatches(); + int sortConstraintByBatch3(struct b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies); + void recomputeBatches(); }; -#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H +#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp index f0b0abd5e0..e3d235a4fd 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp @@ -2,7 +2,7 @@ bool gUseLargeBatches = false; bool gCpuBatchContacts = false; bool gCpuSolveConstraint = false; -bool gCpuRadixSort=false; +bool gCpuRadixSort = false; bool gCpuSetSortData = false; bool gCpuSortContactsDeterminism = false; bool gUseCpuCopyConstraints = false; @@ -11,7 +11,6 @@ bool gReorderContactsOnCpu = false; bool optionalSortContactsDeterminism = true; - #include "b3GpuPgsContactSolver.h" #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" @@ -23,7 +22,6 @@ bool optionalSortContactsDeterminism = true; #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" #include "b3Solver.h" - #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl" #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl" #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl" @@ -38,11 +36,7 @@ bool optionalSortContactsDeterminism = true; #include "kernels/batchingKernels.h" #include "kernels/batchingKernelsNew.h" - - - - -struct b3GpuBatchingPgsSolverInternalData +struct b3GpuBatchingPgsSolverInternalData { cl_context m_context; cl_device_id m_device; @@ -53,9 +47,9 @@ 
struct b3GpuBatchingPgsSolverInternalData b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU; b3OpenCLArray<unsigned int>* m_numConstraints; b3OpenCLArray<unsigned int>* m_offsets; - - b3Solver* m_solverGPU; - + + b3Solver* m_solverGPU; + cl_kernel m_batchingKernel; cl_kernel m_batchingKernelNew; cl_kernel m_solveContactKernel; @@ -67,17 +61,14 @@ struct b3GpuBatchingPgsSolverInternalData cl_kernel m_reorderContactKernel; cl_kernel m_copyConstraintKernel; - cl_kernel m_setDeterminismSortDataBodyAKernel; - cl_kernel m_setDeterminismSortDataBodyBKernel; - cl_kernel m_setDeterminismSortDataChildShapeAKernel; - cl_kernel m_setDeterminismSortDataChildShapeBKernel; - + cl_kernel m_setDeterminismSortDataBodyAKernel; + cl_kernel m_setDeterminismSortDataBodyBKernel; + cl_kernel m_setDeterminismSortDataChildShapeAKernel; + cl_kernel m_setDeterminismSortDataChildShapeBKernel; - - - class b3RadixSort32CL* m_sort32; - class b3BoundSearchCL* m_search; - class b3PrefixScanCL* m_scan; + class b3RadixSort32CL* m_sort32; + class b3BoundSearchCL* m_search; + class b3PrefixScanCL* m_scan; b3OpenCLArray<b3SortData>* m_sortDataBuffer; b3OpenCLArray<b3Contact4>* m_contactBuffer; @@ -85,63 +76,56 @@ struct b3GpuBatchingPgsSolverInternalData b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU; b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU; b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU; - - b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy; - b3OpenCLArray<b3SortData>* m_contactKeyValues; + b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy; + b3OpenCLArray<b3SortData>* m_contactKeyValues; b3AlignedObjectArray<unsigned int> m_idxBuffer; b3AlignedObjectArray<b3SortData> m_sortData; b3AlignedObjectArray<b3Contact4> m_old; - b3AlignedObjectArray<int> m_batchSizes; - b3OpenCLArray<int>* m_batchSizesGpu; - + b3AlignedObjectArray<int> m_batchSizes; + b3OpenCLArray<int>* m_batchSizesGpu; }; - - -b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int 
pairCapacity) +b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity) { - m_debugOutput=0; + m_debugOutput = 0; m_data = new b3GpuBatchingPgsSolverInternalData; m_data->m_context = ctx; m_data->m_device = device; m_data->m_queue = q; m_data->m_pairCapacity = pairCapacity; m_data->m_nIterations = 4; - m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx,q); - m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx,q); - m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx,q); - m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx,q); - - m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx,q); - m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx,q); + m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx, q); + m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, q); + m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, q); + m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx, q); + m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx, q); + m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx, q); - m_data->m_solverGPU = new b3Solver(ctx,device,q,512*1024); + m_data->m_solverGPU = new b3Solver(ctx, device, q, 512 * 1024); - m_data->m_sort32 = new b3RadixSort32CL(ctx,device,m_data->m_queue); - m_data->m_scan = new b3PrefixScanCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS); - m_data->m_search = new b3BoundSearchCL(ctx,device,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_sort32 = new b3RadixSort32CL(ctx, device, m_data->m_queue); + m_data->m_scan = new b3PrefixScanCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS); + m_data->m_search = new b3BoundSearchCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS); - const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 ); + const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512); - m_data->m_sortDataBuffer = new 
b3OpenCLArray<b3SortData>(ctx,m_data->m_queue,sortSize); - m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx,m_data->m_queue); + m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, m_data->m_queue, sortSize); + m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx, m_data->m_queue); - m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS); m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS); - m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx,q,pairCapacity); + m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx, q, pairCapacity); - m_data->m_offsets = new b3OpenCLArray<unsigned int>( ctx,m_data->m_queue,B3_SOLVER_N_CELLS); + m_data->m_offsets = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS); m_data->m_offsets->resize(B3_SOLVER_N_CELLS); const char* additionalMacros = ""; //const char* srcFileNameForCaching=""; - - cl_int pErrNum; const char* batchKernelSource = batchingKernelsCL; const char* batchKernelNewSource = batchingKernelsNewCL; @@ -149,88 +133,73 @@ b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, const char* solverSetup2Source = solverSetup2CL; const char* solveContactSource = solveContactCL; const char* solveFrictionSource = solveFrictionCL; - - + { - - cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); + cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); b3Assert(solveContactProg); - - cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); + + cl_program 
solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); b3Assert(solveFrictionProg); - cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH); - - + cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH); + b3Assert(solverSetup2Prog); - - cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH); + cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH); b3Assert(solverSetupProg); - - - m_data->m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros ); + + m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros); b3Assert(m_data->m_solveFrictionKernel); - m_data->m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); + m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros); b3Assert(m_data->m_solveContactKernel); - m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg,additionalMacros ); + m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, 
device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg, additionalMacros); b3Assert(m_data->m_solveSingleContactKernel); - m_data->m_solveSingleFrictionKernel =b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg,additionalMacros ); + m_data->m_solveSingleFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg, additionalMacros); b3Assert(m_data->m_solveSingleFrictionKernel); - - m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros ); + + m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros); b3Assert(m_data->m_contactToConstraintKernel); - - m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + + m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_data->m_setSortDataKernel); - m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog,additionalMacros ); + m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_data->m_setDeterminismSortDataBodyAKernel); - m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, 
"SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog,additionalMacros ); + m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_data->m_setDeterminismSortDataBodyBKernel); - m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog,additionalMacros ); + m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel); - m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog,additionalMacros ); + m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel); - - m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_data->m_reorderContactKernel); - - m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, 
solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_data->m_copyConstraintKernel); - } { - cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH); + cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH); b3Assert(batchingProg); - - m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros ); + + m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros); b3Assert(m_data->m_batchingKernel); } - + { - cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH); + cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH); b3Assert(batchingNewProg); - - m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros ); + + m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros); b3Assert(m_data->m_batchingKernelNew); } - - - - - - - } b3GpuPgsContactSolver::~b3GpuPgsContactSolver() @@ -242,8 +211,6 @@ b3GpuPgsContactSolver::~b3GpuPgsContactSolver() delete m_data->m_pBufContactOutGPUCopy; delete m_data->m_contactKeyValues; - - delete m_data->m_contactCGPU; delete m_data->m_numConstraints; delete m_data->m_offsets; @@ -259,29 +226,25 @@ b3GpuPgsContactSolver::~b3GpuPgsContactSolver() clReleaseKernel(m_data->m_batchingKernelNew); 
clReleaseKernel(m_data->m_solveSingleContactKernel); clReleaseKernel(m_data->m_solveSingleFrictionKernel); - clReleaseKernel( m_data->m_solveContactKernel); - clReleaseKernel( m_data->m_solveFrictionKernel); + clReleaseKernel(m_data->m_solveContactKernel); + clReleaseKernel(m_data->m_solveFrictionKernel); - clReleaseKernel( m_data->m_contactToConstraintKernel); - clReleaseKernel( m_data->m_setSortDataKernel); - clReleaseKernel( m_data->m_reorderContactKernel); - clReleaseKernel( m_data->m_copyConstraintKernel); + clReleaseKernel(m_data->m_contactToConstraintKernel); + clReleaseKernel(m_data->m_setSortDataKernel); + clReleaseKernel(m_data->m_reorderContactKernel); + clReleaseKernel(m_data->m_copyConstraintKernel); clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel); clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel); clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel); clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel); - - delete m_data; } - - struct b3ConstraintCfg { - b3ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(0) {} + b3ConstraintCfg(float dt = 0.f) : m_positionDrift(0.005f), m_positionConstraintCoeff(0.2f), m_dt(dt), m_staticIdx(0) {} float m_positionDrift; float m_positionConstraintCoeff; @@ -291,354 +254,306 @@ struct b3ConstraintCfg int m_staticIdx; }; - - -void b3GpuPgsContactSolver::solveContactConstraintBatchSizes( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations, const b3AlignedObjectArray<int>* batchSizes)//const b3OpenCLArray<int>* gpuBatchSizes) +void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int 
maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //const b3OpenCLArray<int>* gpuBatchSizes) { B3_PROFILE("solveContactConstraintBatchSizes"); - int numBatches = batchSizes->size()/B3_MAX_NUM_BATCHES; - for(int iter=0; iter<numIterations; iter++) + int numBatches = batchSizes->size() / B3_MAX_NUM_BATCHES; + for (int iter = 0; iter < numIterations; iter++) { - - for (int cellId=0;cellId<numBatches;cellId++) + for (int cellId = 0; cellId < numBatches; cellId++) { int offset = 0; - for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++) + for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++) { - int numInBatch = batchSizes->at(cellId*B3_MAX_NUM_BATCHES+ii); + int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii); if (!numInBatch) break; { - b3LauncherCL launcher( m_data->m_queue, m_data->m_solveSingleContactKernel,"m_solveSingleContactKernel" ); - launcher.setBuffer(bodyBuf->getBufferCL() ); - launcher.setBuffer(shapeBuf->getBufferCL() ); - launcher.setBuffer( constraint->getBufferCL() ); + b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleContactKernel, "m_solveSingleContactKernel"); + launcher.setBuffer(bodyBuf->getBufferCL()); + launcher.setBuffer(shapeBuf->getBufferCL()); + launcher.setBuffer(constraint->getBufferCL()); launcher.setConst(cellId); launcher.setConst(offset); launcher.setConst(numInBatch); launcher.launch1D(numInBatch); - offset+=numInBatch; + offset += numInBatch; } } } } - - for(int iter=0; iter<numIterations; iter++) + for (int iter = 0; iter < numIterations; iter++) { - for (int cellId=0;cellId<numBatches;cellId++) + for (int cellId = 0; cellId < numBatches; cellId++) { int offset = 0; - for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++) + for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++) { - int numInBatch = batchSizes->at(cellId*B3_MAX_NUM_BATCHES+ii); + int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii); if (!numInBatch) break; { - b3LauncherCL launcher( m_data->m_queue, 
m_data->m_solveSingleFrictionKernel,"m_solveSingleFrictionKernel" ); - launcher.setBuffer(bodyBuf->getBufferCL() ); - launcher.setBuffer(shapeBuf->getBufferCL() ); - launcher.setBuffer( constraint->getBufferCL() ); + b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleFrictionKernel, "m_solveSingleFrictionKernel"); + launcher.setBuffer(bodyBuf->getBufferCL()); + launcher.setBuffer(shapeBuf->getBufferCL()); + launcher.setBuffer(constraint->getBufferCL()); launcher.setConst(cellId); launcher.setConst(offset); launcher.setConst(numInBatch); launcher.launch1D(numInBatch); - offset+=numInBatch; + offset += numInBatch; } } } } } -void b3GpuPgsContactSolver::solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,int numIterations, const b3AlignedObjectArray<int>* batchSizes)//,const b3OpenCLArray<int>* gpuBatchSizes) +void b3GpuPgsContactSolver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //,const b3OpenCLArray<int>* gpuBatchSizes) { - //sort the contacts - - b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 ); + b3Int4 cdata = b3MakeInt4(n, 0, 0, 0); { - const int nn = B3_SOLVER_N_CELLS; cdata.x = 0; - cdata.y = maxNumBatches;//250; - + cdata.y = maxNumBatches; //250; - int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; + int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES; #ifdef DEBUG_ME - SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; - adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems); + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems); #endif - - { - 
B3_PROFILE("m_batchSolveKernel iterations"); - for(int iter=0; iter<numIterations; iter++) + for (int iter = 0; iter < numIterations; iter++) { - for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++) { #ifdef DEBUG_ME - memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems); - gpuDebugInfo.write(debugInfo,numWorkItems); + memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems); + gpuDebugInfo.write(debugInfo, numWorkItems); #endif - cdata.z = ib; - - b3LauncherCL launcher( m_data->m_queue, m_data->m_solveContactKernel,"m_solveContactKernel" ); + b3LauncherCL launcher(m_data->m_queue, m_data->m_solveContactKernel, "m_solveContactKernel"); #if 1 - - b3BufferInfoCL bInfo[] = { - - b3BufferInfoCL( bodyBuf->getBufferCL() ), - b3BufferInfoCL( shapeBuf->getBufferCL() ), - b3BufferInfoCL( constraint->getBufferCL() ), - b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), - b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() ) + + b3BufferInfoCL bInfo[] = { + + b3BufferInfoCL(bodyBuf->getBufferCL()), + b3BufferInfoCL(shapeBuf->getBufferCL()), + b3BufferInfoCL(constraint->getBufferCL()), + b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()), + b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL()) #ifdef DEBUG_ME - , b3BufferInfoCL(&gpuDebugInfo) + , + b3BufferInfoCL(&gpuDebugInfo) #endif - }; - - + }; - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL()); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL()); //launcher.setConst( cdata.x ); - launcher.setConst( cdata.y ); - launcher.setConst( cdata.z ); + launcher.setConst(cdata.y); + launcher.setConst(cdata.z); b3Int4 nSplit; nSplit.x = B3_SOLVER_N_SPLIT_X; nSplit.y = B3_SOLVER_N_SPLIT_Y; nSplit.z = B3_SOLVER_N_SPLIT_Z; - launcher.setConst( nSplit ); - 
launcher.launch1D( numWorkItems, 64 ); + launcher.setConst(nSplit); + launcher.launch1D(numWorkItems, 64); - #else - const char* fileName = "m_batchSolveKernel.bin"; - FILE* f = fopen(fileName,"rb"); - if (f) - { - int sizeInBytes=0; - if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) - { - printf("error, cannot get file size\n"); - exit(0); - } - - unsigned char* buf = (unsigned char*) malloc(sizeInBytes); - fread(buf,sizeInBytes,1,f); - int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); - int num = *(int*)&buf[serializedBytes]; - - launcher.launch1D( num); - - //this clFinish is for testing on errors - clFinish(m_queue); - } + const char* fileName = "m_batchSolveKernel.bin"; + FILE* f = fopen(fileName, "rb"); + if (f) + { + int sizeInBytes = 0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*)malloc(sizeInBytes); + fread(buf, sizeInBytes, 1, f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context); + int num = *(int*)&buf[serializedBytes]; + + launcher.launch1D(num); + + //this clFinish is for testing on errors + clFinish(m_queue); + } #endif - #ifdef DEBUG_ME clFinish(m_queue); - gpuDebugInfo.read(debugInfo,numWorkItems); + gpuDebugInfo.read(debugInfo, numWorkItems); clFinish(m_queue); - for (int i=0;i<numWorkItems;i++) + for (int i = 0; i < numWorkItems; i++) { - if (debugInfo[i].m_valInt2>0) + if (debugInfo[i].m_valInt2 > 0) { - printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2); + printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2); } - if (debugInfo[i].m_valInt3>0) + if (debugInfo[i].m_valInt3 > 0) { - printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3); + printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3); } } -#endif //DEBUG_ME - - +#endif //DEBUG_ME } } - - 
clFinish(m_data->m_queue); - + clFinish(m_data->m_queue); } cdata.x = 1; - bool applyFriction=true; + bool applyFriction = true; if (applyFriction) - { + { B3_PROFILE("m_batchSolveKernel iterations2"); - for(int iter=0; iter<numIterations; iter++) + for (int iter = 0; iter < numIterations; iter++) { - for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++) { cdata.z = ib; - - - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( bodyBuf->getBufferCL() ), - b3BufferInfoCL( shapeBuf->getBufferCL() ), - b3BufferInfoCL( constraint->getBufferCL() ), - b3BufferInfoCL( m_data->m_solverGPU->m_numConstraints->getBufferCL() ), - b3BufferInfoCL( m_data->m_solverGPU->m_offsets->getBufferCL() ) + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(bodyBuf->getBufferCL()), + b3BufferInfoCL(shapeBuf->getBufferCL()), + b3BufferInfoCL(constraint->getBufferCL()), + b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()), + b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL()) #ifdef DEBUG_ME - ,b3BufferInfoCL(&gpuDebugInfo) -#endif //DEBUG_ME + , + b3BufferInfoCL(&gpuDebugInfo) +#endif //DEBUG_ME }; - b3LauncherCL launcher( m_data->m_queue, m_data->m_solveFrictionKernel,"m_solveFrictionKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setBuffer( m_data->m_solverGPU->m_batchSizes.getBufferCL()); + b3LauncherCL launcher(m_data->m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL()); //launcher.setConst( cdata.x ); - launcher.setConst( cdata.y ); - launcher.setConst( cdata.z ); + launcher.setConst(cdata.y); + launcher.setConst(cdata.z); - b3Int4 nSplit; + b3Int4 nSplit; nSplit.x = B3_SOLVER_N_SPLIT_X; nSplit.y = B3_SOLVER_N_SPLIT_Y; nSplit.z = B3_SOLVER_N_SPLIT_Z; - launcher.setConst( nSplit ); - - launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 ); 
+ launcher.setConst(nSplit); + + launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64); } } clFinish(m_data->m_queue); - } #ifdef DEBUG_ME delete[] debugInfo; -#endif //DEBUG_ME +#endif //DEBUG_ME } - - } - - - - - - - - - - -static bool sortfnc(const b3SortData& a,const b3SortData& b) +static bool sortfnc(const b3SortData& a, const b3SortData& b) { - return (a.m_key<b.m_key); + return (a.m_key < b.m_key); } static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q) { - return ((p.m_bodyAPtrAndSignBit<q.m_bodyAPtrAndSignBit) || - ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit<q.m_bodyBPtrAndSignBit)) || - ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit==q.m_bodyBPtrAndSignBit) && p.m_childIndexA<q.m_childIndexA ) || - ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit==q.m_bodyBPtrAndSignBit) && p.m_childIndexA<q.m_childIndexA ) || - ((p.m_bodyAPtrAndSignBit==q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit==q.m_bodyBPtrAndSignBit) && p.m_childIndexA==q.m_childIndexA && p.m_childIndexB<q.m_childIndexB) - ); + return ((p.m_bodyAPtrAndSignBit < q.m_bodyAPtrAndSignBit) || + ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit < q.m_bodyBPtrAndSignBit)) || + ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) || + ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) || + ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA == q.m_childIndexA && p.m_childIndexB < q.m_childIndexB)); } - - - - - - - - - - #define USE_SPATIAL_BATCHING 1 #define USE_4x4_GRID 1 #ifndef USE_SPATIAL_BATCHING -static const int gridTable4x4[] = -{ - 0,1,17,16, - 1,2,18,19, - 17,18,32,3, - 16,19,3,34 -}; -static const int 
gridTable8x8[] = -{ - 0, 2, 3, 16, 17, 18, 19, 1, - 66, 64, 80, 67, 82, 81, 65, 83, - 131,144,128,130,147,129,145,146, - 208,195,194,192,193,211,210,209, - 21, 22, 23, 5, 4, 6, 7, 20, - 86, 85, 69, 87, 70, 68, 84, 71, - 151,133,149,150,135,148,132,134, - 197,27,214,213,212,199,198,196 - -}; +static const int gridTable4x4[] = + { + 0, 1, 17, 16, + 1, 2, 18, 19, + 17, 18, 32, 3, + 16, 19, 3, 34}; +static const int gridTable8x8[] = + { + 0, 2, 3, 16, 17, 18, 19, 1, + 66, 64, 80, 67, 82, 81, 65, 83, + 131, 144, 128, 130, 147, 129, 145, 146, + 208, 195, 194, 192, 193, 211, 210, 209, + 21, 22, 23, 5, 4, 6, 7, 20, + 86, 85, 69, 87, 70, 68, 84, 71, + 151, 133, 149, 150, 135, 148, 132, 134, + 197, 27, 214, 213, 212, 199, 198, 196 +}; #endif - -void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts,float scale,const b3Int4& nSplit,int staticIdx) +void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts, float scale, const b3Int4& nSplit, int staticIdx) { - for (int gIdx=0;gIdx<nContacts;gIdx++) + for (int gIdx = 0; gIdx < nContacts; gIdx++) { - if( gIdx < nContacts ) + if (gIdx < nContacts) { - int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit; - int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit; + int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit; + int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit; - int aIdx = abs(aPtrAndSignBit ); + int aIdx = abs(aPtrAndSignBit); int bIdx = abs(bPtrAndSignBit); - bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx); + bool aStatic = (aPtrAndSignBit < 0) || (aPtrAndSignBit == staticIdx); - #if USE_SPATIAL_BATCHING - int idx = (aStatic)? bIdx: aIdx; +#if USE_SPATIAL_BATCHING + int idx = (aStatic) ? 
bIdx : aIdx; b3Vector3 p = gBodies[idx].m_pos; - int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1); - int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1); - int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1); - - int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y); - - #else//USE_SPATIAL_BATCHING - bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx); - - #if USE_4x4_GRID - int aa = aIdx&3; - int bb = bIdx&3; + int xIdx = (int)((p.x - ((p.x < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.x - 1); + int yIdx = (int)((p.y - ((p.y < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.y - 1); + int zIdx = (int)((p.z - ((p.z < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.z - 1); + + int newIndex = (xIdx + yIdx * nSplit.x + zIdx * nSplit.x * nSplit.y); + +#else //USE_SPATIAL_BATCHING + bool bStatic = (bPtrAndSignBit < 0) || (bPtrAndSignBit == staticIdx); + +#if USE_4x4_GRID + int aa = aIdx & 3; + int bb = bIdx & 3; if (aStatic) aa = bb; if (bStatic) bb = aa; - int gridIndex = aa + bb*4; + int gridIndex = aa + bb * 4; int newIndex = gridTable4x4[gridIndex]; - #else//USE_4x4_GRID - int aa = aIdx&7; - int bb = bIdx&7; +#else //USE_4x4_GRID + int aa = aIdx & 7; + int bb = bIdx & 7; if (aStatic) aa = bb; if (bStatic) bb = aa; - int gridIndex = aa + bb*8; + int gridIndex = aa + bb * 8; int newIndex = gridTable8x8[gridIndex]; - #endif//USE_4x4_GRID - #endif//USE_SPATIAL_BATCHING - +#endif //USE_4x4_GRID +#endif //USE_SPATIAL_BATCHING gSortDataOut[gIdx].x = newIndex; gSortDataOut[gIdx].y = gIdx; @@ -650,17 +565,12 @@ void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* } } - - - - - void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index) { B3_PROFILE("solveContacts"); - m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf,numBodies); - m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf,numBodies); - 
m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf,numContacts); + m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf, numBodies); + m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf, numBodies); + m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf, numContacts); if (optionalSortContactsDeterminism) { @@ -671,61 +581,61 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem m_data->m_pBufContactOutGPUCopy->resize(numContacts); m_data->m_contactKeyValues->resize(numContacts); - m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(),numContacts,0,0); + m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(), numContacts, 0, 0); { - b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel,"m_setDeterminismSortDataChildShapeBKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel, "m_setDeterminismSortDataChildShapeBKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); - launcher.launch1D( numContacts, 64 ); + launcher.launch1D(numContacts, 64); } m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); { - b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel,"m_setDeterminismSortDataChildShapeAKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel, "m_setDeterminismSortDataChildShapeAKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); - launcher.launch1D( numContacts, 64 ); + launcher.launch1D(numContacts, 64); } m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); { - b3LauncherCL launcher(m_data->m_queue, 
m_data->m_setDeterminismSortDataBodyBKernel,"m_setDeterminismSortDataBodyBKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel, "m_setDeterminismSortDataBodyBKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); - launcher.launch1D( numContacts, 64 ); + launcher.launch1D(numContacts, 64); } - + m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); - + { - b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel,"m_setDeterminismSortDataBodyAKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel, "m_setDeterminismSortDataBodyAKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); launcher.setConst(numContacts); - launcher.launch1D( numContacts, 64 ); + launcher.launch1D(numContacts, 64); } m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues); { B3_PROFILE("gpu reorderContactKernel (determinism)"); - + b3Int4 cdata; cdata.x = numContacts; - + //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) // , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; - b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel"); launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL()); launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL()); launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL()); - launcher.setConst( cdata ); - launcher.launch1D( numContacts, 64 ); - } - - } else + launcher.setConst(cdata); + launcher.launch1D(numContacts, 
64); + } + } + else { B3_PROFILE("CPU Sort contact constraints (determinism)"); b3AlignedObjectArray<b3Contact4> cpuConstraints; @@ -735,96 +645,80 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem { cpuConstraints.quickSort(b3ContactCmp); - for (int i=0;i<cpuConstraints.size();i++) + for (int i = 0; i < cpuConstraints.size(); i++) { cpuConstraints[i].m_batchIdx = i; } } m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints); - if (m_debugOutput==100) + if (m_debugOutput == 100) { - for (int i=0;i<cpuConstraints.size();i++) + for (int i = 0; i < cpuConstraints.size(); i++) { - printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n",i,cpuConstraints[i].m_bodyAPtrAndSignBit,cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx); + printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n", i, cpuConstraints[i].m_bodyAPtrAndSignBit, cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx); } } m_debugOutput++; } } - - - int nContactOut = m_data->m_pBufContactOutGPU->size(); bool useSolver = true; - - - if (useSolver) - { - float dt=1./60.; - b3ConstraintCfg csCfg( dt ); - csCfg.m_enableParallelSolve = true; - csCfg.m_batchCellSize = 6; - csCfg.m_staticIdx = static0Index; - - - b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU; - - void* additionalData = 0;//m_data->m_frictionCGPU; - const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU; - b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU; - int nContacts = nContactOut; - - + + if (useSolver) + { + float dt = 1. 
/ 60.; + b3ConstraintCfg csCfg(dt); + csCfg.m_enableParallelSolve = true; + csCfg.m_batchCellSize = 6; + csCfg.m_staticIdx = static0Index; + + b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU; + + void* additionalData = 0; //m_data->m_frictionCGPU; + const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU; + b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU; + int nContacts = nContactOut; + int maxNumBatches = 0; - + if (!gUseLargeBatches) - { - - if( m_data->m_solverGPU->m_contactBuffer2) - { - m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); - } - - if( m_data->m_solverGPU->m_contactBuffer2 == 0 ) - { - m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context,m_data->m_queue, nContacts ); - m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); - } - - //clFinish(m_data->m_queue); - - - + { + if (m_data->m_solverGPU->m_contactBuffer2) { - B3_PROFILE("batching"); - //@todo: just reserve it, without copy of original contact (unless we use warmstarting) + m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); + } + if (m_data->m_solverGPU->m_contactBuffer2 == 0) + { + m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context, m_data->m_queue, nContacts); + m_data->m_solverGPU->m_contactBuffer2->resize(nContacts); + } + //clFinish(m_data->m_queue); - //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf; + { + B3_PROFILE("batching"); + //@todo: just reserve it, without copy of original contact (unless we use warmstarting) + //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf; { - //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf ); //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn ); - const int sortAlignment = 512; // todo. 
get this out of sort - if( csCfg.m_enableParallelSolve ) + const int sortAlignment = 512; // todo. get this out of sort + if (csCfg.m_enableParallelSolve) { - - - int sortSize = B3NEXTMULTIPLEOF( nContacts, sortAlignment ); + int sortSize = B3NEXTMULTIPLEOF(nContacts, sortAlignment); b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; - if (!gCpuSetSortData) - { // 2. set cell idx + { // 2. set cell idx B3_PROFILE("GPU set cell idx"); struct CB { @@ -834,29 +728,28 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem b3Int4 m_nSplit; }; - b3Assert( sortSize%64 == 0 ); + b3Assert(sortSize % 64 == 0); CB cdata; cdata.m_nContacts = nContacts; cdata.m_staticIdx = csCfg.m_staticIdx; - cdata.m_scale = 1.f/csCfg.m_batchCellSize; + cdata.m_scale = 1.f / csCfg.m_batchCellSize; cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X; cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y; cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z; m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); - - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL()), b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; - b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel,"m_setSortDataKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata.m_nContacts ); - launcher.setConst( cdata.m_scale ); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())}; + b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel, "m_setSortDataKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata.m_nContacts); + 
launcher.setConst(cdata.m_scale); launcher.setConst(cdata.m_nSplit); launcher.setConst(cdata.m_staticIdx); - - launcher.launch1D( sortSize, 64 ); - } else + launcher.launch1D(sortSize, 64); + } + else { m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts); b3AlignedObjectArray<b3SortData> sortDataCPU; @@ -866,22 +759,19 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem m_data->m_pBufContactOutGPU->copyToHost(contactCPU); b3AlignedObjectArray<b3RigidBodyData> bodiesCPU; bodyBuf->copyToHost(bodiesCPU); - float scale = 1.f/csCfg.m_batchCellSize; + float scale = 1.f / csCfg.m_batchCellSize; b3Int4 nSplit; nSplit.x = B3_SOLVER_N_SPLIT_X; nSplit.y = B3_SOLVER_N_SPLIT_Y; nSplit.z = B3_SOLVER_N_SPLIT_Z; - SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts,scale,nSplit,csCfg.m_staticIdx); - + SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts, scale, nSplit, csCfg.m_staticIdx); m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU); } - - if (!gCpuRadixSort) - { // 3. sort by cell idx + { // 3. sort by cell idx B3_PROFILE("gpuRadixSort"); //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT; //int sortBit = 32; @@ -891,10 +781,8 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize ); b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut); - - - - } else + } + else { b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer); b3AlignedObjectArray<b3SortData> hostValues; @@ -903,7 +791,6 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem keyValuesInOut.copyFromHost(hostValues); } - if (gUseScanHost) { // 4. 
find entries @@ -914,13 +801,11 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem b3AlignedObjectArray<b3SortData> sortDataHost; m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost); - //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); - m_data->m_solverGPU->m_search->executeHost(sortDataHost,nContacts,countsHost,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); + m_data->m_solverGPU->m_search->executeHost(sortDataHost, nContacts, countsHost, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT); countsNative->copyFromHost(countsHost); - //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT ); @@ -929,24 +814,21 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem b3AlignedObjectArray<unsigned int> offsetsHost; offsetsHost.resize(offsetsNative->size()); - - m_data->m_solverGPU->m_scan->executeHost(countsHost,offsetsHost, B3_SOLVER_N_CELLS);//,&sum ); + m_data->m_solverGPU->m_scan->executeHost(countsHost, offsetsHost, B3_SOLVER_N_CELLS); //,&sum ); offsetsNative->copyFromHost(offsetsHost); //printf("sum = %d\n",sum); - } else + } + else { // 4. find entries B3_PROFILE("gpuBoundSearch"); - m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT); - m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum ); - } - - - + m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT); + m_data->m_solverGPU->m_scan->execute(*countsNative, *offsetsNative, B3_SOLVER_N_CELLS); //,&sum ); + } if (nContacts) - { // 5. sort constraints by cellIdx + { // 5. 
sort constraints by cellIdx if (gReorderContactsOnCpu) { B3_PROFILE("cpu m_reorderContactKernel"); @@ -956,7 +838,7 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem b3AlignedObjectArray<b3Contact4> outContacts; m_data->m_pBufContactOutGPU->copyToHost(inContacts); outContacts.resize(inContacts.size()); - for (int i=0;i<nContacts;i++) + for (int i = 0; i < nContacts; i++) { int srcIdx = sortDataHost[i].y; outContacts[i] = inContacts[srcIdx]; @@ -974,30 +856,25 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem " }\n" "}\n" */ - } else + } + else { B3_PROFILE("gpu m_reorderContactKernel"); b3Int4 cdata; cdata.x = nContacts; - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), - b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL()) - , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) }; + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), + b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())}; - b3LauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel,"m_reorderContactKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( nContacts, 64 ); + b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(nContacts, 64); } } - - - - } - } //clFinish(m_data->m_queue); @@ -1008,48 +885,46 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem // printf(",,,\n"); // } - if (nContacts) { - if (gUseCpuCopyConstraints) { - for (int i=0;i<nContacts;i++) + for (int i = 0; i < nContacts; i++) { 
m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2); - // m_data->m_solverGPU->m_contactBuffer2->getBufferCL(); - // m_data->m_pBufContactOutGPU->getBufferCL() + // m_data->m_solverGPU->m_contactBuffer2->getBufferCL(); + // m_data->m_pBufContactOutGPU->getBufferCL() } - - } else + } + else { B3_PROFILE("gpu m_copyConstraintKernel"); - b3Int4 cdata; cdata.x = nContacts; - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL() ), - b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ) - }; - - b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel,"m_copyConstraintKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( nContacts, 64 ); + b3Int4 cdata; + cdata.x = nContacts; + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), + b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL())}; + + b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel, "m_copyConstraintKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(nContacts, 64); //we use the clFinish for proper benchmark/profile clFinish(m_data->m_queue); } } - -// bool compareGPU = false; + // bool compareGPU = false; if (nContacts) { if (!gCpuBatchContacts) { B3_PROFILE("gpu batchContacts"); - maxNumBatches = 250;//250; - m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx ); + maxNumBatches = 250; //250; + m_data->m_solverGPU->batchContacts(m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx); clFinish(m_data->m_queue); - } else + } + else { B3_PROFILE("cpu batchContacts"); static 
b3AlignedObjectArray<b3Contact4> cpuContacts; @@ -1070,45 +945,43 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem offsetsNative->copyToHost(offsetsNativeHost); } - - int numNonzeroGrid=0; + int numNonzeroGrid = 0; if (gUseLargeBatches) { m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES); int totalNumConstraints = cpuContacts.size(); //int simdWidth =numBodies+1;//-1;//64;//-1;//32; - int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[0]); // on GPU - maxNumBatches = b3Max(numBatches,maxNumBatches); + int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU + maxNumBatches = b3Max(numBatches, maxNumBatches); static int globalMaxBatch = 0; - if (maxNumBatches>globalMaxBatch ) + if (maxNumBatches > globalMaxBatch) { - globalMaxBatch = maxNumBatches; - b3Printf("maxNumBatches = %d\n",maxNumBatches); + globalMaxBatch = maxNumBatches; + b3Printf("maxNumBatches = %d\n", maxNumBatches); } - - } else + } + else { - m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS*B3_MAX_NUM_BATCHES); + m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS * B3_MAX_NUM_BATCHES); B3_PROFILE("cpu batch grid"); - for(int i=0; i<B3_SOLVER_N_CELLS; i++) + for (int i = 0; i < B3_SOLVER_N_CELLS; i++) { int n = (nNativeHost)[i]; int offset = (offsetsNativeHost)[i]; - if( n ) + if (n) { numNonzeroGrid++; - int simdWidth =numBodies+1;//-1;//64;//-1;//32; - int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies,&m_data->m_batchSizes[i*B3_MAX_NUM_BATCHES]); // on GPU - maxNumBatches = b3Max(numBatches,maxNumBatches); + int simdWidth = numBodies + 1; //-1;//64;//-1;//32; + int numBatches = sortConstraintByBatch3(&cpuContacts[0] + offset, n, simdWidth, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[i * 
B3_MAX_NUM_BATCHES]); // on GPU + maxNumBatches = b3Max(numBatches, maxNumBatches); static int globalMaxBatch = 0; - if (maxNumBatches>globalMaxBatch ) + if (maxNumBatches > globalMaxBatch) { - globalMaxBatch = maxNumBatches; - b3Printf("maxNumBatches = %d\n",maxNumBatches); + globalMaxBatch = maxNumBatches; + b3Printf("maxNumBatches = %d\n", maxNumBatches); } //we use the clFinish for proper benchmark/profile - } } //clFinish(m_data->m_queue); @@ -1117,22 +990,12 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem B3_PROFILE("m_contactBuffer->copyFromHost"); m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts); } - - } - + } } + } + } - - - - - } - - - } - - - //printf("maxNumBatches = %d\n", maxNumBatches); + //printf("maxNumBatches = %d\n", maxNumBatches); if (gUseLargeBatches) { @@ -1140,58 +1003,52 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem { B3_PROFILE("cpu batchContacts"); static b3AlignedObjectArray<b3Contact4> cpuContacts; -// b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2; + // b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2; { B3_PROFILE("copyToHost"); m_data->m_pBufContactOutGPU->copyToHost(cpuContacts); } -// b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; -// b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; - + // b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints; + // b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets; - -// int numNonzeroGrid=0; + // int numNonzeroGrid=0; { m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES); int totalNumConstraints = cpuContacts.size(); - // int simdWidth =numBodies+1;//-1;//64;//-1;//32; - int numBatches = sortConstraintByBatch3( &cpuContacts[0], totalNumConstraints, totalNumConstraints+1,csCfg.m_staticIdx 
,numBodies,&m_data->m_batchSizes[0]); // on GPU - maxNumBatches = b3Max(numBatches,maxNumBatches); + // int simdWidth =numBodies+1;//-1;//64;//-1;//32; + int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU + maxNumBatches = b3Max(numBatches, maxNumBatches); static int globalMaxBatch = 0; - if (maxNumBatches>globalMaxBatch ) + if (maxNumBatches > globalMaxBatch) { - globalMaxBatch = maxNumBatches; - b3Printf("maxNumBatches = %d\n",maxNumBatches); + globalMaxBatch = maxNumBatches; + b3Printf("maxNumBatches = %d\n", maxNumBatches); } - } { B3_PROFILE("m_contactBuffer->copyFromHost"); m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts); } - - } - + } } if (nContacts) { B3_PROFILE("gpu convertToConstraints"); - m_data->m_solverGPU->convertToConstraints( bodyBuf, - shapeBuf, m_data->m_solverGPU->m_contactBuffer2, - contactConstraintOut, - additionalData, nContacts, - (b3SolverBase::ConstraintCfg&) csCfg ); + m_data->m_solverGPU->convertToConstraints(bodyBuf, + shapeBuf, m_data->m_solverGPU->m_contactBuffer2, + contactConstraintOut, + additionalData, nContacts, + (b3SolverBase::ConstraintCfg&)csCfg); clFinish(m_data->m_queue); } - if (1) { int numIter = 4; - m_data->m_solverGPU->m_nIterations = numIter;//10 + m_data->m_solverGPU->m_nIterations = numIter; //10 if (!gCpuSolveConstraint) { B3_PROFILE("GPU solveContactConstraint"); @@ -1208,32 +1065,30 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem if (gUseLargeBatches) { - solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU, - m_data->m_inertiaBufferGPU, - m_data->m_contactCGPU,0, - nContactOut , - maxNumBatches,numIter,&m_data->m_batchSizes); - } else + solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU, + m_data->m_inertiaBufferGPU, + m_data->m_contactCGPU, 0, + nContactOut, + maxNumBatches, numIter, 
&m_data->m_batchSizes); + } + else { solveContactConstraint( - m_data->m_bodyBufferGPU, + m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, - m_data->m_contactCGPU,0, - nContactOut , - maxNumBatches,numIter,&m_data->m_batchSizes);//m_data->m_batchSizesGpu); + m_data->m_contactCGPU, 0, + nContactOut, + maxNumBatches, numIter, &m_data->m_batchSizes); //m_data->m_batchSizesGpu); } } else { B3_PROFILE("Host solveContactConstraint"); - m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU,0, nContactOut ,maxNumBatches,&m_data->m_batchSizes); + m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU, 0, nContactOut, maxNumBatches, &m_data->m_batchSizes); } - - - } - - + } + #if 0 if (0) { @@ -1244,114 +1099,96 @@ void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL ); } #endif - - } - + } } - -void b3GpuPgsContactSolver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ) +void b3GpuPgsContactSolver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx) { } - - - - - - - - - - b3AlignedObjectArray<unsigned int> idxBuffer; b3AlignedObjectArray<b3SortData> sortData; b3AlignedObjectArray<b3Contact4> old; - -inline int b3GpuPgsContactSolver::sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies) +inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies) { - B3_PROFILE("sortConstraintByBatch"); int numIter = 0; - + sortData.resize(n); idxBuffer.resize(n); old.resize(n); - + unsigned int* idxSrc = &idxBuffer[0]; unsigned int* idxDst = &idxBuffer[0]; int nIdxSrc, 
nIdxDst; - + const int N_FLG = 256; - const int FLG_MASK = N_FLG-1; - unsigned int flg[N_FLG/32]; + const int FLG_MASK = N_FLG - 1; + unsigned int flg[N_FLG / 32]; #if defined(_DEBUG) - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) cs[i].getBatchIdx() = -1; #endif - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) idxSrc[i] = i; nIdxSrc = n; - + int batchIdx = 0; - + { B3_PROFILE("cpu batch innerloop"); - while( nIdxSrc ) + while (nIdxSrc) { numIter++; nIdxDst = 0; int nCurrentBatch = 0; - + // clear flag - for(int i=0; i<N_FLG/32; i++) flg[i] = 0; - - for(int i=0; i<nIdxSrc; i++) + for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0; + + for (int i = 0; i < nIdxSrc; i++) { int idx = idxSrc[i]; - - b3Assert( idx < n ); + b3Assert(idx < n); // check if it can go int bodyAS = cs[idx].m_bodyAPtrAndSignBit; int bodyBS = cs[idx].m_bodyBPtrAndSignBit; - - - + int bodyA = abs(bodyAS); int bodyB = abs(bodyBS); - + int aIdx = bodyA & FLG_MASK; int bIdx = bodyB & FLG_MASK; - - unsigned int aUnavailable = flg[ aIdx/32 ] & (1<<(aIdx&31)); - unsigned int bUnavailable = flg[ bIdx/32 ] & (1<<(bIdx&31)); - - bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; - bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; - - //use inv_mass! - aUnavailable = !aIsStatic? aUnavailable:0;// - bUnavailable = !bIsStatic? bUnavailable:0; - - if( aUnavailable==0 && bUnavailable==0 ) // ok + + unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31)); + unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31)); + + bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx; + bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx; + + //use inv_mass! + aUnavailable = !aIsStatic ? aUnavailable : 0; // + bUnavailable = !bIsStatic ? 
bUnavailable : 0; + + if (aUnavailable == 0 && bUnavailable == 0) // ok { if (!aIsStatic) - flg[ aIdx/32 ] |= (1<<(aIdx&31)); + flg[aIdx / 32] |= (1 << (aIdx & 31)); if (!bIsStatic) - flg[ bIdx/32 ] |= (1<<(bIdx&31)); + flg[bIdx / 32] |= (1 << (bIdx & 31)); cs[idx].getBatchIdx() = batchIdx; sortData[idx].m_key = batchIdx; sortData[idx].m_value = idx; - + { nCurrentBatch++; - if( nCurrentBatch == simdWidth ) + if (nCurrentBatch == simdWidth) { nCurrentBatch = 0; - for(int i=0; i<N_FLG/32; i++) flg[i] = 0; + for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0; } } } @@ -1360,128 +1197,121 @@ inline int b3GpuPgsContactSolver::sortConstraintByBatch( b3Contact4* cs, int n, idxDst[nIdxDst++] = idx; } } - b3Swap( idxSrc, idxDst ); - b3Swap( nIdxSrc, nIdxDst ); - batchIdx ++; + b3Swap(idxSrc, idxDst); + b3Swap(nIdxSrc, nIdxDst); + batchIdx++; } } { B3_PROFILE("quickSort"); sortData.quickSort(sortfnc); } - - + { - B3_PROFILE("reorder"); + B3_PROFILE("reorder"); // reorder - - memcpy( &old[0], cs, sizeof(b3Contact4)*n); - for(int i=0; i<n; i++) + + memcpy(&old[0], cs, sizeof(b3Contact4) * n); + for (int i = 0; i < n; i++) { int idx = sortData[i].m_value; cs[i] = old[idx]; } } - - + #if defined(_DEBUG) - // debugPrintf( "nBatches: %d\n", batchIdx ); - for(int i=0; i<n; i++) - { - b3Assert( cs[i].getBatchIdx() != -1 ); - } + // debugPrintf( "nBatches: %d\n", batchIdx ); + for (int i = 0; i < n; i++) + { + b3Assert(cs[i].getBatchIdx() != -1); + } #endif return batchIdx; } - b3AlignedObjectArray<int> bodyUsed2; -inline int b3GpuPgsContactSolver::sortConstraintByBatch2( b3Contact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies) +inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies) { - B3_PROFILE("sortConstraintByBatch2"); - - - bodyUsed2.resize(2*simdWidth); + bodyUsed2.resize(2 * simdWidth); - for (int q=0;q<2*simdWidth;q++) - bodyUsed2[q]=0; + for (int q = 0; q < 2 * 
simdWidth; q++) + bodyUsed2[q] = 0; int curBodyUsed = 0; int numIter = 0; - + m_data->m_sortData.resize(numConstraints); m_data->m_idxBuffer.resize(numConstraints); m_data->m_old.resize(numConstraints); - + unsigned int* idxSrc = &m_data->m_idxBuffer[0]; - + #if defined(_DEBUG) - for(int i=0; i<numConstraints; i++) + for (int i = 0; i < numConstraints; i++) cs[i].getBatchIdx() = -1; #endif - for(int i=0; i<numConstraints; i++) + for (int i = 0; i < numConstraints; i++) idxSrc[i] = i; - + int numValidConstraints = 0; -// int unprocessedConstraintIndex = 0; + // int unprocessedConstraintIndex = 0; int batchIdx = 0; - { B3_PROFILE("cpu batch innerloop"); - - while( numValidConstraints < numConstraints) + + while (numValidConstraints < numConstraints) { numIter++; int nCurrentBatch = 0; // clear flag - for(int i=0; i<curBodyUsed; i++) + for (int i = 0; i < curBodyUsed; i++) bodyUsed2[i] = 0; - curBodyUsed = 0; + curBodyUsed = 0; - for(int i=numValidConstraints; i<numConstraints; i++) + for (int i = numValidConstraints; i < numConstraints; i++) { int idx = idxSrc[i]; - b3Assert( idx < numConstraints ); + b3Assert(idx < numConstraints); // check if it can go int bodyAS = cs[idx].m_bodyAPtrAndSignBit; int bodyBS = cs[idx].m_bodyBPtrAndSignBit; int bodyA = abs(bodyAS); int bodyB = abs(bodyBS); - bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; - bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx; + bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx; int aUnavailable = 0; int bUnavailable = 0; if (!aIsStatic) { - for (int j=0;j<curBodyUsed;j++) + for (int j = 0; j < curBodyUsed; j++) { if (bodyA == bodyUsed2[j]) { - aUnavailable=1; + aUnavailable = 1; break; } } } if (!aUnavailable) - if (!bIsStatic) - { - for (int j=0;j<curBodyUsed;j++) + if (!bIsStatic) { - if (bodyB == bodyUsed2[j]) + for (int j = 0; j < curBodyUsed; j++) { - bUnavailable=1; - break; + if (bodyB == bodyUsed2[j]) + { + bUnavailable = 1; + break; + } 
} } - } - - if( aUnavailable==0 && bUnavailable==0 ) // ok + + if (aUnavailable == 0 && bUnavailable == 0) // ok { if (!aIsStatic) { @@ -1496,7 +1326,7 @@ inline int b3GpuPgsContactSolver::sortConstraintByBatch2( b3Contact4* cs, int nu m_data->m_sortData[idx].m_key = batchIdx; m_data->m_sortData[idx].m_value = idx; - if (i!=numValidConstraints) + if (i != numValidConstraints) { b3Swap(idxSrc[i], idxSrc[numValidConstraints]); } @@ -1504,20 +1334,19 @@ inline int b3GpuPgsContactSolver::sortConstraintByBatch2( b3Contact4* cs, int nu numValidConstraints++; { nCurrentBatch++; - if( nCurrentBatch == simdWidth ) + if (nCurrentBatch == simdWidth) { nCurrentBatch = 0; - for(int i=0; i<curBodyUsed; i++) + for (int i = 0; i < curBodyUsed; i++) bodyUsed2[i] = 0; - curBodyUsed = 0; } } } } - - batchIdx ++; + + batchIdx++; } } { @@ -1526,155 +1355,148 @@ inline int b3GpuPgsContactSolver::sortConstraintByBatch2( b3Contact4* cs, int nu } { - B3_PROFILE("reorder"); + B3_PROFILE("reorder"); // reorder - - memcpy( &m_data->m_old[0], cs, sizeof(b3Contact4)*numConstraints); - for(int i=0; i<numConstraints; i++) + memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints); + + for (int i = 0; i < numConstraints; i++) { b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]); int idx = m_data->m_sortData[idxSrc[i]].m_value; cs[i] = m_data->m_old[idx]; } } - + #if defined(_DEBUG) - // debugPrintf( "nBatches: %d\n", batchIdx ); - for(int i=0; i<numConstraints; i++) - { - b3Assert( cs[i].getBatchIdx() != -1 ); - } + // debugPrintf( "nBatches: %d\n", batchIdx ); + for (int i = 0; i < numConstraints; i++) + { + b3Assert(cs[i].getBatchIdx() != -1); + } #endif - return batchIdx; } - b3AlignedObjectArray<int> bodyUsed; b3AlignedObjectArray<int> curUsed; - -inline int b3GpuPgsContactSolver::sortConstraintByBatch3( b3Contact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies, int* batchSizes) +inline int b3GpuPgsContactSolver::sortConstraintByBatch3(b3Contact4* cs, 
int numConstraints, int simdWidth, int staticIdx, int numBodies, int* batchSizes) { - B3_PROFILE("sortConstraintByBatch3"); - + static int maxSwaps = 0; int numSwaps = 0; - curUsed.resize(2*simdWidth); + curUsed.resize(2 * simdWidth); static int maxNumConstraints = 0; - if (maxNumConstraints<numConstraints) + if (maxNumConstraints < numConstraints) { maxNumConstraints = numConstraints; //printf("maxNumConstraints = %d\n",maxNumConstraints ); } - int numUsedArray = numBodies/32+1; + int numUsedArray = numBodies / 32 + 1; bodyUsed.resize(numUsedArray); - for (int q=0;q<numUsedArray;q++) - bodyUsed[q]=0; + for (int q = 0; q < numUsedArray; q++) + bodyUsed[q] = 0; - int curBodyUsed = 0; int numIter = 0; - + m_data->m_sortData.resize(0); m_data->m_idxBuffer.resize(0); m_data->m_old.resize(0); - - + #if defined(_DEBUG) - for(int i=0; i<numConstraints; i++) + for (int i = 0; i < numConstraints; i++) cs[i].getBatchIdx() = -1; #endif - + int numValidConstraints = 0; -// int unprocessedConstraintIndex = 0; + // int unprocessedConstraintIndex = 0; int batchIdx = 0; - { B3_PROFILE("cpu batch innerloop"); - - while( numValidConstraints < numConstraints) + + while (numValidConstraints < numConstraints) { numIter++; int nCurrentBatch = 0; batchSizes[batchIdx] = 0; // clear flag - for(int i=0; i<curBodyUsed; i++) - bodyUsed[curUsed[i]/32] = 0; + for (int i = 0; i < curBodyUsed; i++) + bodyUsed[curUsed[i] / 32] = 0; - curBodyUsed = 0; + curBodyUsed = 0; - for(int i=numValidConstraints; i<numConstraints; i++) + for (int i = numValidConstraints; i < numConstraints; i++) { int idx = i; - b3Assert( idx < numConstraints ); + b3Assert(idx < numConstraints); // check if it can go int bodyAS = cs[idx].m_bodyAPtrAndSignBit; int bodyBS = cs[idx].m_bodyBPtrAndSignBit; int bodyA = abs(bodyAS); int bodyB = abs(bodyBS); - bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; - bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx; + bool bIsStatic = 
(bodyBS < 0) || bodyBS == staticIdx; int aUnavailable = 0; int bUnavailable = 0; if (!aIsStatic) { - aUnavailable = bodyUsed[ bodyA/32 ] & (1<<(bodyA&31)); + aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31)); } if (!aUnavailable) - if (!bIsStatic) - { - bUnavailable = bodyUsed[ bodyB/32 ] & (1<<(bodyB&31)); - } - - if( aUnavailable==0 && bUnavailable==0 ) // ok + if (!bIsStatic) + { + bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31)); + } + + if (aUnavailable == 0 && bUnavailable == 0) // ok { if (!aIsStatic) { - bodyUsed[ bodyA/32 ] |= (1<<(bodyA&31)); - curUsed[curBodyUsed++]=bodyA; + bodyUsed[bodyA / 32] |= (1 << (bodyA & 31)); + curUsed[curBodyUsed++] = bodyA; } if (!bIsStatic) { - bodyUsed[ bodyB/32 ] |= (1<<(bodyB&31)); - curUsed[curBodyUsed++]=bodyB; + bodyUsed[bodyB / 32] |= (1 << (bodyB & 31)); + curUsed[curBodyUsed++] = bodyB; } cs[idx].getBatchIdx() = batchIdx; - if (i!=numValidConstraints) + if (i != numValidConstraints) { - b3Swap(cs[i],cs[numValidConstraints]); + b3Swap(cs[i], cs[numValidConstraints]); numSwaps++; } numValidConstraints++; { nCurrentBatch++; - if( nCurrentBatch == simdWidth ) + if (nCurrentBatch == simdWidth) { batchSizes[batchIdx] += simdWidth; nCurrentBatch = 0; - for(int i=0; i<curBodyUsed; i++) - bodyUsed[curUsed[i]/32] = 0; + for (int i = 0; i < curBodyUsed; i++) + bodyUsed[curUsed[i] / 32] = 0; curBodyUsed = 0; } } } } - if (batchIdx>=B3_MAX_NUM_BATCHES) + if (batchIdx >= B3_MAX_NUM_BATCHES) { b3Error("batchIdx>=B3_MAX_NUM_BATCHES"); b3Assert(0); @@ -1683,26 +1505,25 @@ inline int b3GpuPgsContactSolver::sortConstraintByBatch3( b3Contact4* cs, int nu batchSizes[batchIdx] += nCurrentBatch; - batchIdx ++; - + batchIdx++; } } - + #if defined(_DEBUG) - // debugPrintf( "nBatches: %d\n", batchIdx ); - for(int i=0; i<numConstraints; i++) - { - b3Assert( cs[i].getBatchIdx() != -1 ); - } + // debugPrintf( "nBatches: %d\n", batchIdx ); + for (int i = 0; i < numConstraints; i++) + { + b3Assert(cs[i].getBatchIdx() != -1); + } 
#endif - batchSizes[batchIdx] =0; - - if (maxSwaps<numSwaps) + batchSizes[batchIdx] = 0; + + if (maxSwaps < numSwaps) { maxSwaps = numSwaps; //printf("maxSwaps = %d\n", maxSwaps); } - + return batchIdx; } diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h index 98e2a5b8c4..6ab7502af3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.h @@ -11,33 +11,27 @@ class b3GpuPgsContactSolver { protected: - int m_debugOutput; - struct b3GpuBatchingPgsSolverInternalData* m_data; + struct b3GpuBatchingPgsSolverInternalData* m_data; + + void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx); - void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ); - - inline int sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies); - inline int sortConstraintByBatch2( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies); - inline int sortConstraintByBatch3( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies, int* batchSizes); - + inline int sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies); + inline int sortConstraintByBatch2(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies); + inline int sortConstraintByBatch3(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies, int* batchSizes); - - void solveContactConstraintBatchSizes( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* 
batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes); + void solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes); - void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes); + void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes); public: - - b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity); + b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity); virtual ~b3GpuPgsContactSolver(); void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index); - }; -#endif //B3_GPU_BATCHING_PGS_SOLVER_H - +#endif //B3_GPU_BATCHING_PGS_SOLVER_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp index 783e443060..fef33ad1cd 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp @@ -47,7 +47,7 @@ bool gClearPairsOnGpu = true; #define TEST_OTHER_GPU_SOLVER 1 #ifdef TEST_OTHER_GPU_SOLVER #include 
"b3GpuJacobiContactSolver.h" -#endif //TEST_OTHER_GPU_SOLVER +#endif //TEST_OTHER_GPU_SOLVER #include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h" @@ -59,73 +59,68 @@ bool gClearPairsOnGpu = true; #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" #include "Bullet3OpenCL/Raycast/b3GpuRaycast.h" - #include "Bullet3Dynamics/shared/b3IntegrateTransforms.h" #include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h" -b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q,class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap , struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config) +b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config) { m_data = new b3GpuRigidBodyPipelineInternalData; - m_data->m_constraintUid=0; + m_data->m_constraintUid = 0; m_data->m_config = config; m_data->m_context = ctx; m_data->m_device = device; m_data->m_queue = q; - m_data->m_solver = new b3PgsJacobiSolver(true);//new b3PgsJacobiSolver(true); - m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx,device,q,true);//new b3PgsJacobiSolver(true); - - m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx,q,config.m_maxConvexBodies); - m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx,q,config.m_maxBroadphasePairs); + m_data->m_solver = new b3PgsJacobiSolver(true); //new b3PgsJacobiSolver(true); + m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx, device, q, true); //new b3PgsJacobiSolver(true); - m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx,q); + m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx, q, config.m_maxConvexBodies); + 
m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx, q, config.m_maxBroadphasePairs); + + m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx, q); #ifdef TEST_OTHER_GPU_SOLVER - m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx,device,q,config.m_maxBroadphasePairs); -#endif // TEST_OTHER_GPU_SOLVER - - m_data->m_solver2 = new b3GpuPgsContactSolver(ctx,device,q,config.m_maxBroadphasePairs); + m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx, device, q, config.m_maxBroadphasePairs); +#endif // TEST_OTHER_GPU_SOLVER + + m_data->m_solver2 = new b3GpuPgsContactSolver(ctx, device, q, config.m_maxBroadphasePairs); - m_data->m_raycaster = new b3GpuRaycast(ctx,device,q); + m_data->m_raycaster = new b3GpuRaycast(ctx, device, q); - m_data->m_broadphaseDbvt = broadphaseDbvt; m_data->m_broadphaseSap = broadphaseSap; m_data->m_narrowphase = narrowphase; - m_data->m_gravity.setValue(0.f,-9.8f,0.f); + m_data->m_gravity.setValue(0.f, -9.8f, 0.f); - cl_int errNum=0; + cl_int errNum = 0; { - cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,integrateKernelCL,&errNum,"",B3_RIGIDBODY_INTEGRATE_PATH); - b3Assert(errNum==CL_SUCCESS); - m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,integrateKernelCL, "integrateTransformsKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, integrateKernelCL, &errNum, "", B3_RIGIDBODY_INTEGRATE_PATH); + b3Assert(errNum == CL_SUCCESS); + m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, integrateKernelCL, "integrateTransformsKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); clReleaseProgram(prog); } { - cl_program prog = 
b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,updateAabbsKernelCL,&errNum,"",B3_RIGIDBODY_UPDATEAABB_PATH); - b3Assert(errNum==CL_SUCCESS); - m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "initializeGpuAabbsFull",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); + cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, &errNum, "", B3_RIGIDBODY_UPDATEAABB_PATH); + b3Assert(errNum == CL_SUCCESS); + m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "initializeGpuAabbsFull", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); - - m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "clearOverlappingPairsKernel",&errNum,prog); - b3Assert(errNum==CL_SUCCESS); + m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "clearOverlappingPairsKernel", &errNum, prog); + b3Assert(errNum == CL_SUCCESS); clReleaseProgram(prog); } - - } b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline() { if (m_data->m_integrateTransformsKernel) clReleaseKernel(m_data->m_integrateTransformsKernel); - + if (m_data->m_updateAabbsKernel) clReleaseKernel(m_data->m_updateAabbsKernel); - + if (m_data->m_clearOverlappingPairsKernel) clReleaseKernel(m_data->m_clearOverlappingPairsKernel); delete m_data->m_raycaster; @@ -136,15 +131,14 @@ b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline() #ifdef TEST_OTHER_GPU_SOLVER delete m_data->m_solver3; -#endif //TEST_OTHER_GPU_SOLVER - +#endif //TEST_OTHER_GPU_SOLVER + delete m_data->m_solver2; - - + delete m_data; } -void b3GpuRigidBodyPipeline::reset() +void b3GpuRigidBodyPipeline::reset() { m_data->m_gpuConstraints->resize(0); 
m_data->m_cpuConstraints.resize(0); @@ -152,30 +146,28 @@ void b3GpuRigidBodyPipeline::reset() m_data->m_allAabbsCPU.resize(0); } -void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint) +void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint) { m_data->m_joints.push_back(constraint); } -void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint) +void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint) { m_data->m_joints.remove(constraint); } - - -void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid) +void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid) { m_data->m_gpuSolver->recomputeBatches(); //slow linear search m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints); //remove - for (int i=0;i<m_data->m_cpuConstraints.size();i++) + for (int i = 0; i < m_data->m_cpuConstraints.size(); i++) { if (m_data->m_cpuConstraints[i].m_uid == uid) { //m_data->m_cpuConstraints.remove(m_data->m_cpuConstraints[i]); - m_data->m_cpuConstraints.swap(i,m_data->m_cpuConstraints.size()-1); + m_data->m_cpuConstraints.swap(i, m_data->m_cpuConstraints.size() - 1); m_data->m_cpuConstraints.pop_back(); break; @@ -185,13 +177,13 @@ void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid) if (m_data->m_cpuConstraints.size()) { m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints); - } else + } + else { m_data->m_gpuConstraints->resize(0); } - } -int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold) +int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold) { m_data->m_gpuSolver->recomputeBatches(); b3GpuGenericConstraint c; @@ -200,14 +192,14 @@ int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, co c.m_flags = B3_CONSTRAINT_FLAG_ENABLED; c.m_rbA = bodyA; c.m_rbB = bodyB; - 
c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]); - c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]); + c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]); + c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]); c.m_breakingImpulseThreshold = breakingThreshold; c.m_constraintType = B3_GPU_POINT2POINT_CONSTRAINT_TYPE; m_data->m_cpuConstraints.push_back(c); return c.m_uid; } -int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB,float breakingThreshold) +int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold) { m_data->m_gpuSolver->recomputeBatches(); b3GpuGenericConstraint c; @@ -216,9 +208,9 @@ int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const fl c.m_flags = B3_CONSTRAINT_FLAG_ENABLED; c.m_rbA = bodyA; c.m_rbB = bodyB; - c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]); - c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]); - c.m_relTargetAB.setValue(relTargetAB[0],relTargetAB[1],relTargetAB[2],relTargetAB[3]); + c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]); + c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]); + c.m_relTargetAB.setValue(relTargetAB[0], relTargetAB[1], relTargetAB[2], relTargetAB[3]); c.m_breakingImpulseThreshold = breakingThreshold; c.m_constraintType = B3_GPU_FIXED_CONSTRAINT_TYPE; @@ -226,31 +218,28 @@ int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const fl return c.m_uid; } - -void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) +void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) { - //update worldspace AABBs from local AABB/worldtransform { B3_PROFILE("setupGpuAabbs"); setupGpuAabbsFull(); } - int numPairs =0; + int numPairs = 0; //compute overlapping pairs { - if (gUseDbvt) { { 
B3_PROFILE("setAabb"); m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU); - for (int i=0;i<m_data->m_allAabbsCPU.size();i++) + for (int i = 0; i < m_data->m_allAabbsCPU.size(); i++) { - b3Vector3 aabbMin=b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0],m_data->m_allAabbsCPU[i].m_min[1],m_data->m_allAabbsCPU[i].m_min[2]); - b3Vector3 aabbMax=b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0],m_data->m_allAabbsCPU[i].m_max[1],m_data->m_allAabbsCPU[i].m_max[2]); - m_data->m_broadphaseDbvt->setAabb(i,aabbMin,aabbMax,0); + b3Vector3 aabbMin = b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0], m_data->m_allAabbsCPU[i].m_min[1], m_data->m_allAabbsCPU[i].m_min[2]); + b3Vector3 aabbMax = b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0], m_data->m_allAabbsCPU[i].m_max[1], m_data->m_allAabbsCPU[i].m_max[2]); + m_data->m_broadphaseDbvt->setAabb(i, aabbMin, aabbMax, 0); } } @@ -259,13 +248,14 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) m_data->m_broadphaseDbvt->calculateOverlappingPairs(); } numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs(); - - } else + } + else { if (gUseCalculateOverlappingPairsHost) { m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs); - } else + } + else { m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs); } @@ -274,24 +264,24 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) } //compute contact points -// printf("numPairs=%d\n",numPairs); - - int numContacts = 0; + // printf("numPairs=%d\n",numPairs); + int numContacts = 0; int numBodies = m_data->m_narrowphase->getNumRigidBodies(); if (numPairs) { - cl_mem pairs =0; - cl_mem aabbsWS =0; + cl_mem pairs = 0; + cl_mem aabbsWS = 0; if (gUseDbvt) { B3_PROFILE("m_overlappingPairsGPU->copyFromHost"); m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray()); pairs = 
m_data->m_overlappingPairsGPU->getBufferCL(); aabbsWS = m_data->m_allAabbsGPU->getBufferCL(); - } else + } + else { pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer(); aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS(); @@ -302,31 +292,27 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) //mark the contacts for each pair as 'unused' if (numPairs) { - b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context,m_data->m_queue); - gpuPairs.setFromOpenCLBuffer(pairs,numPairs); + b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context, m_data->m_queue); + gpuPairs.setFromOpenCLBuffer(pairs, numPairs); if (gClearPairsOnGpu) { - - //b3AlignedObjectArray<b3BroadphasePair> hostPairs;//just for debugging //gpuPairs.copyToHost(hostPairs); - b3LauncherCL launcher(m_data->m_queue,m_data->m_clearOverlappingPairsKernel,"clearOverlappingPairsKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_clearOverlappingPairsKernel, "clearOverlappingPairsKernel"); launcher.setBuffer(pairs); launcher.setConst(numPairs); launcher.launch1D(numPairs); - //gpuPairs.copyToHost(hostPairs); - - - } else + } + else { b3AlignedObjectArray<b3BroadphasePair> hostPairs; gpuPairs.copyToHost(hostPairs); - for (int i=0;i<hostPairs.size();i++) + for (int i = 0; i < hostPairs.size(); i++) { hostPairs[i].z = 0xffffffff; } @@ -335,7 +321,7 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) } } - m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies); + m_data->m_narrowphase->computeContacts(pairs, numPairs, aabbsWS, numBodies); numContacts = m_data->m_narrowphase->getNumContactsGpu(); if (gUseDbvt) @@ -347,56 +333,54 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) if (gDumpContactStats && numContacts) { m_data->m_narrowphase->getContactsGpu(); - + printf("numContacts = %d\n", numContacts); - int totalPoints = 0; + int totalPoints = 0; const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU(); - for (int 
i=0;i<numContacts;i++) + for (int i = 0; i < numContacts; i++) { totalPoints += contacts->getNPoints(); } - printf("totalPoints=%d\n",totalPoints); - + printf("totalPoints=%d\n", totalPoints); } } - //convert contact points to contact constraints - + //solve constraints - b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context,m_data->m_queue,0,true); - gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(),m_data->m_narrowphase->getNumRigidBodies()); - b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context,m_data->m_queue,0,true); - gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(),m_data->m_narrowphase->getNumRigidBodies()); - b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context,m_data->m_queue,0,true); - gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(),m_data->m_narrowphase->getNumContactsGpu()); + b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context, m_data->m_queue, 0, true); + gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(), m_data->m_narrowphase->getNumRigidBodies()); + b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context, m_data->m_queue, 0, true); + gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(), m_data->m_narrowphase->getNumRigidBodies()); + b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context, m_data->m_queue, 0, true); + gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(), m_data->m_narrowphase->getNumContactsGpu()); - int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size(); + int numJoints = m_data->m_joints.size() ? 
m_data->m_joints.size() : m_data->m_cpuConstraints.size(); if (useBullet2CpuSolver && numJoints) { - - // b3AlignedObjectArray<b3Contact4> hostContacts; + // b3AlignedObjectArray<b3Contact4> hostContacts; //gpuContacts.copyToHost(hostContacts); { - bool useGpu = m_data->m_joints.size()==0; + bool useGpu = m_data->m_joints.size() == 0; -// b3Contact4* contacts = numContacts? &hostContacts[0]: 0; + // b3Contact4* contacts = numContacts? &hostContacts[0]: 0; //m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,contacts,numJoints, joints); if (useGpu) { - m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(),&gpuBodies,&gpuInertias,numJoints, m_data->m_gpuConstraints); - } else + m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(), &gpuBodies, &gpuInertias, numJoints, m_data->m_gpuConstraints); + } + else { b3AlignedObjectArray<b3RigidBodyData> hostBodies; gpuBodies.copyToHost(hostBodies); b3AlignedObjectArray<b3InertiaData> hostInertias; gpuInertias.copyToHost(hostInertias); - b3TypedConstraint** joints = numJoints? &m_data->m_joints[0] : 0; - m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints); + b3TypedConstraint** joints = numJoints ? 
&m_data->m_joints[0] : 0; + m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(), &hostBodies[0], &hostInertias[0], 0, 0, numJoints, joints); gpuBodies.copyFromHost(hostBodies); } } @@ -404,22 +388,20 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) if (numContacts) { - #ifdef TEST_OTHER_GPU_SOLVER - + if (gUseJacobi) { bool useGpu = true; if (useGpu) { - bool forceHost = false; if (forceHost) { b3AlignedObjectArray<b3RigidBodyData> hostBodies; b3AlignedObjectArray<b3InertiaData> hostInertias; b3AlignedObjectArray<b3Contact4> hostContacts; - + { B3_PROFILE("copyToHost"); gpuBodies.copyToHost(hostBodies); @@ -429,25 +411,24 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) { b3JacobiSolverInfo solverInfo; - m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(),&hostContacts[0],hostContacts.size(),solverInfo); - - + m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(), &hostContacts[0], hostContacts.size(), solverInfo); } { B3_PROFILE("copyFromHost"); gpuBodies.copyFromHost(hostBodies); } - } else - + } + else { int static0Index = m_data->m_narrowphase->getStatic0Index(); b3JacobiSolverInfo solverInfo; //m_data->m_solver3->solveContacts( >solveGroup(&gpuBodies, &gpuInertias, &gpuContacts,solverInfo); //m_data->m_solver3->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]); - m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index); + m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index); } - } else + } + else { b3AlignedObjectArray<b3RigidBodyData> hostBodies; gpuBodies.copyToHost(hostBodies); @@ -460,17 +441,15 @@ void b3GpuRigidBodyPipeline::stepSimulation(float 
deltaTime) } gpuBodies.copyFromHost(hostBodies); } - - } else -#endif //TEST_OTHER_GPU_SOLVER + } + else +#endif //TEST_OTHER_GPU_SOLVER { - int static0Index = m_data->m_narrowphase->getStatic0Index(); - m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index); - + m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index); + //m_data->m_solver4->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(), gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL()); - - + /*m_data->m_solver3->solveContactConstraintHost( (b3OpenCLArray<RigidBodyBase::Body>*)&gpuBodies, (b3OpenCLArray<RigidBodyBase::Inertia>*)&gpuInertias, @@ -481,11 +460,9 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime) } integrate(deltaTime); - } - -void b3GpuRigidBodyPipeline::integrate(float timeStep) +void b3GpuRigidBodyPipeline::integrate(float timeStep) { //integrate int numBodies = m_data->m_narrowphase->getNumRigidBodies(); @@ -493,24 +470,25 @@ void b3GpuRigidBodyPipeline::integrate(float timeStep) if (gIntegrateOnCpu) { - if(numBodies) + if (numBodies) { - b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData(); + b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData(); npData->m_bodyBufferGPU->copyToHost(*npData->m_bodyBufferCPU); b3RigidBodyData_t* bodies = &npData->m_bodyBufferCPU->at(0); - for (int nodeID=0;nodeID<numBodies;nodeID++) + for (int nodeID = 0; nodeID < numBodies; nodeID++) { - integrateSingleTransform( bodies,nodeID, timeStep, angularDamp, m_data->m_gravity); + integrateSingleTransform(bodies, nodeID, timeStep, angularDamp, m_data->m_gravity); } npData->m_bodyBufferGPU->copyFromHost(*npData->m_bodyBufferCPU); } - } else + } + else { - b3LauncherCL 
launcher(m_data->m_queue,m_data->m_integrateTransformsKernel,"m_integrateTransformsKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_integrateTransformsKernel, "m_integrateTransformsKernel"); launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu()); - + launcher.setConst(numBodies); launcher.setConst(timeStep); launcher.setConst(angularDamp); @@ -519,12 +497,9 @@ void b3GpuRigidBodyPipeline::integrate(float timeStep) } } - - - -void b3GpuRigidBodyPipeline::setupGpuAabbsFull() +void b3GpuRigidBodyPipeline::setupGpuAabbsFull() { - cl_int ciErrNum=0; + cl_int ciErrNum = 0; int numBodies = m_data->m_narrowphase->getNumRigidBodies(); if (!numBodies) @@ -532,34 +507,35 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull() if (gCalcWorldSpaceAabbOnCpu) { - if (numBodies) { if (gUseDbvt) { m_data->m_allAabbsCPU.resize(numBodies); m_data->m_narrowphase->readbackAllBodiesToCpu(); - for (int i=0;i<numBodies;i++) + for (int i = 0; i < numBodies; i++) { - b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_allAabbsCPU[0]); + b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_allAabbsCPU[0]); } m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU); - } else + } + else { m_data->m_broadphaseSap->getAllAabbsCPU().resize(numBodies); m_data->m_narrowphase->readbackAllBodiesToCpu(); - for (int i=0;i<numBodies;i++) + for (int i = 0; i < numBodies; i++) { - b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_broadphaseSap->getAllAabbsCPU()[0]); + b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), 
&m_data->m_broadphaseSap->getAllAabbsCPU()[0]); } m_data->m_broadphaseSap->getAllAabbsGPU().copyFromHost(m_data->m_broadphaseSap->getAllAabbsCPU()); //m_data->m_broadphaseSap->writeAabbsToGpu(); } } - } else + } + else { //__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB) - b3LauncherCL launcher(m_data->m_queue,m_data->m_updateAabbsKernel,"m_updateAabbsKernel"); + b3LauncherCL launcher(m_data->m_queue, m_data->m_updateAabbsKernel, "m_updateAabbsKernel"); launcher.setConst(numBodies); cl_mem bodies = m_data->m_narrowphase->getBodiesGpu(); launcher.setBuffer(bodies); @@ -568,17 +544,18 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull() cl_mem localAabbs = m_data->m_narrowphase->getAabbLocalSpaceBufferGpu(); launcher.setBuffer(localAabbs); - cl_mem worldAabbs =0; + cl_mem worldAabbs = 0; if (gUseDbvt) { worldAabbs = m_data->m_allAabbsGPU->getBufferCL(); - } else + } + else { worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS(); } launcher.setBuffer(worldAabbs); launcher.launch1D(numBodies); - + oclCHECKERROR(ciErrNum, CL_SUCCESS); } @@ -595,78 +572,68 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull() }; */ - - - - - } - - -cl_mem b3GpuRigidBodyPipeline::getBodyBuffer() +cl_mem b3GpuRigidBodyPipeline::getBodyBuffer() { return m_data->m_narrowphase->getBodiesGpu(); } -int b3GpuRigidBodyPipeline::getNumBodies() const +int b3GpuRigidBodyPipeline::getNumBodies() const { return m_data->m_narrowphase->getNumRigidBodies(); } -void b3GpuRigidBodyPipeline::setGravity(const float* grav) +void b3GpuRigidBodyPipeline::setGravity(const float* grav) { - m_data->m_gravity.setValue(grav[0],grav[1],grav[2]); + m_data->m_gravity.setValue(grav[0], grav[1], grav[2]); } -void b3GpuRigidBodyPipeline::copyConstraintsToHost() +void b3GpuRigidBodyPipeline::copyConstraintsToHost() { m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints); } -void 
b3GpuRigidBodyPipeline::writeAllInstancesToGpu() +void b3GpuRigidBodyPipeline::writeAllInstancesToGpu() { m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU); m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints); } - -int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu) +int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu) { - - b3Vector3 aabbMin=b3MakeVector3(0,0,0),aabbMax=b3MakeVector3(0,0,0); + b3Vector3 aabbMin = b3MakeVector3(0, 0, 0), aabbMax = b3MakeVector3(0, 0, 0); - - if (collidableIndex>=0) + if (collidableIndex >= 0) { b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex); - b3Vector3 localAabbMin=b3MakeVector3(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]); - b3Vector3 localAabbMax=b3MakeVector3(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]); - + b3Vector3 localAabbMin = b3MakeVector3(localAabb.m_min[0], localAabb.m_min[1], localAabb.m_min[2]); + b3Vector3 localAabbMax = b3MakeVector3(localAabb.m_max[0], localAabb.m_max[1], localAabb.m_max[2]); + b3Scalar margin = 0.01f; b3Transform t; t.setIdentity(); - t.setOrigin(b3MakeVector3(position[0],position[1],position[2])); - t.setRotation(b3Quaternion(orientation[0],orientation[1],orientation[2],orientation[3])); - b3TransformAabb(localAabbMin,localAabbMax, margin,t,aabbMin,aabbMax); - } else + t.setOrigin(b3MakeVector3(position[0], position[1], position[2])); + t.setRotation(b3Quaternion(orientation[0], orientation[1], orientation[2], orientation[3])); + b3TransformAabb(localAabbMin, localAabbMax, margin, t, aabbMin, aabbMax); + } + else { b3Error("registerPhysicsInstance using invalid collidableIndex\n"); return -1; } - - + bool writeToGpu = false; int bodyIndex = 
m_data->m_narrowphase->getNumRigidBodies(); - bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex,mass,position,orientation,&aabbMin.getX(),&aabbMax.getX(),writeToGpu); + bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex, mass, position, orientation, &aabbMin.getX(), &aabbMax.getX(), writeToGpu); - if (bodyIndex>=0) + if (bodyIndex >= 0) { if (gUseDbvt) { - m_data->m_broadphaseDbvt->createProxy(aabbMin,aabbMax,bodyIndex,0,1,1); + m_data->m_broadphaseDbvt->createProxy(aabbMin, aabbMax, bodyIndex, 0, 1, 1); b3SapAabb aabb; - for (int i=0;i<3;i++) + for (int i = 0; i < 3; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; @@ -677,14 +644,16 @@ int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* po { m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU); } - } else + } + else { if (mass) { - m_data->m_broadphaseSap->createProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher); - } else + m_data->m_broadphaseSap->createProxy(aabbMin, aabbMax, bodyIndex, 1, 1); //m_dispatcher); + } + else { - m_data->m_broadphaseSap->createLargeProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher); + m_data->m_broadphaseSap->createLargeProxy(aabbMin, aabbMax, bodyIndex, 1, 1); //m_dispatcher); } } } @@ -699,10 +668,10 @@ int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* po return bodyIndex; } -void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults) +void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults) { - this->m_data->m_raycaster->castRays(rays,hitResults, - getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(), - m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(), - m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap); + this->m_data->m_raycaster->castRays(rays, hitResults, + 
getNumBodies(), this->m_data->m_narrowphase->getBodiesCpu(), + m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(), + m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap); } diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h index b4eac6841a..0e5c6fec12 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.h @@ -25,50 +25,46 @@ subject to the following restrictions: class b3GpuRigidBodyPipeline { protected: - struct b3GpuRigidBodyPipelineInternalData* m_data; + struct b3GpuRigidBodyPipelineInternalData* m_data; int allocateCollidable(); public: - - - b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q , class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config); + b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config); virtual ~b3GpuRigidBodyPipeline(); - void stepSimulation(float deltaTime); - void integrate(float timeStep); - void setupGpuAabbsFull(); + void stepSimulation(float deltaTime); + void integrate(float timeStep); + void setupGpuAabbsFull(); - int registerConvexPolyhedron(class b3ConvexUtility* convex); + int registerConvexPolyhedron(class b3ConvexUtility* convex); //int registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling); //int registerSphereShape(float radius); //int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant); - + //int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* 
scaling); //int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes); - - int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu); + int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu); //if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered - void writeAllInstancesToGpu(); - void copyConstraintsToHost(); - void setGravity(const float* grav); + void writeAllInstancesToGpu(); + void copyConstraintsToHost(); + void setGravity(const float* grav); void reset(); - - int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold); + + int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold); int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold); void removeConstraintByUid(int uid); - void addConstraint(class b3TypedConstraint* constraint); - void removeConstraint(b3TypedConstraint* constraint); - - void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults); + void addConstraint(class b3TypedConstraint* constraint); + void removeConstraint(b3TypedConstraint* constraint); - cl_mem getBodyBuffer(); + void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults); - int getNumBodies() const; + cl_mem getBodyBuffer(); + int getNumBodies() const; }; -#endif //B3_GPU_RIGIDBODY_PIPELINE_H
\ No newline at end of file +#endif //B3_GPU_RIGIDBODY_PIPELINE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h index 5ac92f97d6..e0a26fda17 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h @@ -22,52 +22,47 @@ subject to the following restrictions: #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" #include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h" - #include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" #include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h" #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h" - - #include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h" #include "Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h" struct b3GpuRigidBodyPipelineInternalData { + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; + cl_kernel m_integrateTransformsKernel; + cl_kernel m_updateAabbsKernel; + cl_kernel m_clearOverlappingPairsKernel; - cl_kernel m_integrateTransformsKernel; - cl_kernel m_updateAabbsKernel; - cl_kernel m_clearOverlappingPairsKernel; - class b3PgsJacobiSolver* m_solver; - + class b3GpuPgsConstraintSolver* m_gpuSolver; class b3GpuPgsContactSolver* m_solver2; class b3GpuJacobiContactSolver* m_solver3; class b3GpuRaycast* m_raycaster; - + class b3GpuBroadphaseInterface* m_broadphaseSap; - + struct b3DynamicBvhBroadphase* m_broadphaseDbvt; - b3OpenCLArray<b3SapAabb>* m_allAabbsGPU; - b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; - b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU; + b3OpenCLArray<b3SapAabb>* m_allAabbsGPU; + b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; + b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU; b3OpenCLArray<b3GpuGenericConstraint>* 
m_gpuConstraints; b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints; b3AlignedObjectArray<b3TypedConstraint*> m_joints; - int m_constraintUid; - class b3GpuNarrowPhase* m_narrowphase; - b3Vector3 m_gravity; + int m_constraintUid; + class b3GpuNarrowPhase* m_narrowphase; + b3Vector3 m_gravity; - b3Config m_config; + b3Config m_config; }; -#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H - +#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h index f2a61801ac..db815d9b31 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverBody.h @@ -13,11 +13,9 @@ subject to the following restrictions: */ //Originally written by Erwin Coumans - #ifndef B3_GPU_SOLVER_BODY_H #define B3_GPU_SOLVER_BODY_H - #include "Bullet3Common/b3Vector3.h" #include "Bullet3Common/b3Matrix3x3.h" @@ -27,29 +25,27 @@ subject to the following restrictions: ///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision #ifdef B3_USE_SSE #define USE_SIMD 1 -#endif // - - +#endif // ///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance. 
-B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody +B3_ATTRIBUTE_ALIGNED16(struct) +b3GpuSolverBody { B3_DECLARE_ALIGNED_ALLOCATOR(); -// b3Transform m_worldTransformUnused; - b3Vector3 m_deltaLinearVelocity; - b3Vector3 m_deltaAngularVelocity; - b3Vector3 m_angularFactor; - b3Vector3 m_linearFactor; - b3Vector3 m_invMass; - b3Vector3 m_pushVelocity; - b3Vector3 m_turnVelocity; - b3Vector3 m_linearVelocity; - b3Vector3 m_angularVelocity; - - union - { - void* m_originalBody; - int m_originalBodyIndex; + // b3Transform m_worldTransformUnused; + b3Vector3 m_deltaLinearVelocity; + b3Vector3 m_deltaAngularVelocity; + b3Vector3 m_angularFactor; + b3Vector3 m_linearFactor; + b3Vector3 m_invMass; + b3Vector3 m_pushVelocity; + b3Vector3 m_turnVelocity; + b3Vector3 m_linearVelocity; + b3Vector3 m_angularVelocity; + + union { + void* m_originalBody; + int m_originalBodyIndex; }; int padding[3]; @@ -65,44 +61,41 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody return m_worldTransform; } */ - B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const + B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const { if (m_originalBody) - velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos); + velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos); else - velocity.setValue(0,0,0); + velocity.setValue(0, 0, 0); } - B3_FORCE_INLINE void getAngularVelocity(b3Vector3& angVel) const + B3_FORCE_INLINE void getAngularVelocity(b3Vector3 & angVel) const { if (m_originalBody) - angVel =m_angularVelocity+m_deltaAngularVelocity; + angVel = m_angularVelocity + m_deltaAngularVelocity; else - angVel.setValue(0,0,0); + angVel.setValue(0, 0, 0); } - //Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position - B3_FORCE_INLINE void 
applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude) + B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude) { if (m_originalBody) { - m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor; - m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor); + m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor; + m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor); } } - B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,b3Scalar impulseMagnitude) + B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, b3Scalar impulseMagnitude) { if (m_originalBody) { - m_pushVelocity += linearComponent*impulseMagnitude*m_linearFactor; - m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor); + m_pushVelocity += linearComponent * impulseMagnitude * m_linearFactor; + m_turnVelocity += angularComponent * (impulseMagnitude * m_angularFactor); } } - - const b3Vector3& getDeltaLinearVelocity() const { return m_deltaLinearVelocity; @@ -113,20 +106,19 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody return m_deltaAngularVelocity; } - const b3Vector3& getPushVelocity() const + const b3Vector3& getPushVelocity() const { return m_pushVelocity; } - const b3Vector3& getTurnVelocity() const + const b3Vector3& getTurnVelocity() const { return m_turnVelocity; } - //////////////////////////////////////////////// ///some internal methods, don't use them - + b3Vector3& internalGetDeltaLinearVelocity() { return m_deltaLinearVelocity; @@ -151,7 +143,7 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody { m_invMass = invMass; } - + b3Vector3& internalGetPushVelocity() { return m_pushVelocity; @@ -162,67 +154,57 @@ B3_ATTRIBUTE_ALIGNED16 
(struct) b3GpuSolverBody return m_turnVelocity; } - B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const + B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const { - velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos); + velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos); } - B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3& angVel) const + B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3 & angVel) const { - angVel = m_angularVelocity+m_deltaAngularVelocity; + angVel = m_angularVelocity + m_deltaAngularVelocity; } - //Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position - B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude) + B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude) { //if (m_originalBody) { - m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor; - m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor); + m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor; + m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor); } } - - - - void writebackVelocity() + void writebackVelocity() { //if (m_originalBody>=0) { - m_linearVelocity +=m_deltaLinearVelocity; + m_linearVelocity += m_deltaLinearVelocity; m_angularVelocity += m_deltaAngularVelocity; - + //m_originalBody->setCompanionId(-1); } } - - void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp) + void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp) { - (void) 
timeStep; + (void)timeStep; if (m_originalBody) { m_linearVelocity += m_deltaLinearVelocity; m_angularVelocity += m_deltaAngularVelocity; - + //correct the position/orientation based on push/turn recovery b3Transform newTransform; - if (m_pushVelocity[0]!=0.f || m_pushVelocity[1]!=0 || m_pushVelocity[2]!=0 || m_turnVelocity[0]!=0.f || m_turnVelocity[1]!=0 || m_turnVelocity[2]!=0) + if (m_pushVelocity[0] != 0.f || m_pushVelocity[1] != 0 || m_pushVelocity[2] != 0 || m_turnVelocity[0] != 0.f || m_turnVelocity[1] != 0 || m_turnVelocity[2] != 0) { - // b3Quaternion orn = m_worldTransform.getRotation(); -// b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform); -// m_worldTransform = newTransform; + // b3Quaternion orn = m_worldTransform.getRotation(); + // b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform); + // m_worldTransform = newTransform; } //m_worldTransform.setRotation(orn); //m_originalBody->setCompanionId(-1); } } - - - }; -#endif //B3_SOLVER_BODY_H - - +#endif //B3_SOLVER_BODY_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h index 60d235baab..7d9eea243a 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3GpuSolverConstraint.h @@ -13,11 +13,9 @@ subject to the following restrictions: 3. This notice may not be removed or altered from any source distribution. */ - #ifndef B3_GPU_SOLVER_CONSTRAINT_H #define B3_GPU_SOLVER_CONSTRAINT_H - #include "Bullet3Common/b3Vector3.h" #include "Bullet3Common/b3Matrix3x3.h" //#include "b3JacobianEntry.h" @@ -25,58 +23,51 @@ subject to the following restrictions: //#define NO_FRICTION_TANGENTIALS 1 - - ///1D constraint along a normal axis between bodyA and bodyB. 
It can be combined to solve contact and friction constraints. -B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverConstraint +B3_ATTRIBUTE_ALIGNED16(struct) +b3GpuSolverConstraint { B3_DECLARE_ALIGNED_ALLOCATOR(); - b3Vector3 m_relpos1CrossNormal; - b3Vector3 m_contactNormal; + b3Vector3 m_relpos1CrossNormal; + b3Vector3 m_contactNormal; - b3Vector3 m_relpos2CrossNormal; + b3Vector3 m_relpos2CrossNormal; //b3Vector3 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal - b3Vector3 m_angularComponentA; - b3Vector3 m_angularComponentB; - - mutable b3Scalar m_appliedPushImpulse; - mutable b3Scalar m_appliedImpulse; + b3Vector3 m_angularComponentA; + b3Vector3 m_angularComponentB; + + mutable b3Scalar m_appliedPushImpulse; + mutable b3Scalar m_appliedImpulse; int m_padding1; int m_padding2; - b3Scalar m_friction; - b3Scalar m_jacDiagABInv; - b3Scalar m_rhs; - b3Scalar m_cfm; - - b3Scalar m_lowerLimit; - b3Scalar m_upperLimit; - b3Scalar m_rhsPenetration; - union - { - void* m_originalContactPoint; - int m_originalConstraintIndex; - b3Scalar m_unusedPadding4; + b3Scalar m_friction; + b3Scalar m_jacDiagABInv; + b3Scalar m_rhs; + b3Scalar m_cfm; + + b3Scalar m_lowerLimit; + b3Scalar m_upperLimit; + b3Scalar m_rhsPenetration; + union { + void* m_originalContactPoint; + int m_originalConstraintIndex; + b3Scalar m_unusedPadding4; }; - int m_overrideNumSolverIterations; - int m_frictionIndex; + int m_overrideNumSolverIterations; + int m_frictionIndex; int m_solverBodyIdA; int m_solverBodyIdB; - - enum b3SolverConstraintType + enum b3SolverConstraintType { B3_SOLVER_CONTACT_1D = 0, B3_SOLVER_FRICTION_1D }; }; -typedef b3AlignedObjectArray<b3GpuSolverConstraint> b3GpuConstraintArray; - - -#endif //B3_GPU_SOLVER_CONSTRAINT_H - - +typedef b3AlignedObjectArray<b3GpuSolverConstraint> b3GpuConstraintArray; +#endif //B3_GPU_SOLVER_CONSTRAINT_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp index 
20bf6d47c5..ccf67da1a8 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp @@ -13,7 +13,6 @@ subject to the following restrictions: */ //Originally written by Takahiro Harada - #include "b3Solver.h" ///useNewBatchingKernel is a rewritten kernel using just a single thread of the warp, for experiments @@ -38,7 +37,6 @@ bool gConvertConstraintOnCpu = false; #include "kernels/batchingKernels.h" #include "kernels/batchingKernelsNew.h" - #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3Common/b3Vector3.h" @@ -48,7 +46,7 @@ struct SolverDebugInfo int m_valInt1; int m_valInt2; int m_valInt3; - + int m_valInt4; int m_valInt5; int m_valInt6; @@ -59,11 +57,10 @@ struct SolverDebugInfo int m_valInt10; int m_valInt11; - int m_valInt12; - int m_valInt13; - int m_valInt14; - int m_valInt15; - + int m_valInt12; + int m_valInt13; + int m_valInt14; + int m_valInt15; float m_val0; float m_val1; @@ -71,9 +68,6 @@ struct SolverDebugInfo float m_val3; }; - - - class SolverDeviceInl { public: @@ -84,101 +78,89 @@ public: }; }; - - b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity) - : - m_context(ctx), - m_device(device), - m_queue(queue), - m_batchSizes(ctx,queue), - m_nIterations(4) + : m_context(ctx), + m_device(device), + m_queue(queue), + m_batchSizes(ctx, queue), + m_nIterations(4) { - m_sort32 = new b3RadixSort32CL(ctx,device,queue); - m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS); - m_search = new b3BoundSearchCL(ctx,device,queue,B3_SOLVER_N_CELLS); + m_sort32 = new b3RadixSort32CL(ctx, device, queue); + m_scan = new b3PrefixScanCL(ctx, device, queue, B3_SOLVER_N_CELLS); + m_search = new b3BoundSearchCL(ctx, device, queue, B3_SOLVER_N_CELLS); - const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 ); + const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512); - m_sortDataBuffer = new 
b3OpenCLArray<b3SortData>(ctx,queue,sortSize); - m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue); + m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, queue, sortSize); + m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx, queue); - m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,B3_SOLVER_N_CELLS ); + m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS); m_numConstraints->resize(B3_SOLVER_N_CELLS); - m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue,B3_SOLVER_N_CELLS); + m_offsets = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS); m_offsets->resize(B3_SOLVER_N_CELLS); const char* additionalMacros = ""; -// const char* srcFileNameForCaching=""; - - + // const char* srcFileNameForCaching=""; cl_int pErrNum; const char* batchKernelSource = batchingKernelsCL; const char* batchKernelNewSource = batchingKernelsNewCL; - + const char* solverSetupSource = solverSetupCL; const char* solverSetup2Source = solverSetup2CL; const char* solveContactSource = solveContactCL; const char* solveFrictionSource = solveFrictionCL; - - - + { - - cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); + cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH); b3Assert(solveContactProg); - - cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); + + cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH); b3Assert(solveFrictionProg); - cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, 
B3_SOLVER_SETUP2_KERNEL_PATH); + cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH); b3Assert(solverSetup2Prog); - - cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH); + cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH); b3Assert(solverSetupProg); - - - m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros ); + + m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros); b3Assert(m_solveFrictionKernel); - m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros ); + m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros); b3Assert(m_solveContactKernel); - - m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros ); + + m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros); b3Assert(m_contactToConstraintKernel); - - m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + + m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, 
device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_setSortDataKernel); - - m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + + m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_reorderContactKernel); - - m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros ); + m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros); b3Assert(m_copyConstraintKernel); - } { - cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH); + cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH); //cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true); b3Assert(batchingProg); - - m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros ); + + m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros); b3Assert(m_batchingKernel); } { - cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH); + cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, 
B3_BATCHING_NEW_PATH); b3Assert(batchingNewProg); - m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros ); + m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros); //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros ); b3Assert(m_batchingKernelNew); } } - + b3Solver::~b3Solver() { delete m_offsets; @@ -190,71 +172,68 @@ b3Solver::~b3Solver() delete m_scan; delete m_search; - clReleaseKernel(m_batchingKernel); clReleaseKernel(m_batchingKernelNew); - - clReleaseKernel( m_solveContactKernel); - clReleaseKernel( m_solveFrictionKernel); - - clReleaseKernel( m_contactToConstraintKernel); - clReleaseKernel( m_setSortDataKernel); - clReleaseKernel( m_reorderContactKernel); - clReleaseKernel( m_copyConstraintKernel); - -} + clReleaseKernel(m_solveContactKernel); + clReleaseKernel(m_solveFrictionKernel); - + clReleaseKernel(m_contactToConstraintKernel); + clReleaseKernel(m_setSortDataKernel); + clReleaseKernel(m_reorderContactKernel); + clReleaseKernel(m_copyConstraintKernel); +} -template<bool JACOBI> -static -__inline -void solveContact(b3GpuConstraint4& cs, - const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, - const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, - float maxRambdaDt[4], float minRambdaDt[4]) +template <bool JACOBI> +static __inline void solveContact(b3GpuConstraint4& cs, + const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + float 
maxRambdaDt[4], float minRambdaDt[4]) { - - b3Vector3 dLinVelA; dLinVelA.setZero(); - b3Vector3 dAngVelA; dAngVelA.setZero(); - b3Vector3 dLinVelB; dLinVelB.setZero(); - b3Vector3 dAngVelB; dAngVelB.setZero(); - - for(int ic=0; ic<4; ic++) + b3Vector3 dLinVelA; + dLinVelA.setZero(); + b3Vector3 dAngVelA; + dAngVelA.setZero(); + b3Vector3 dLinVelB; + dLinVelB.setZero(); + b3Vector3 dAngVelB; + dAngVelB.setZero(); + + for (int ic = 0; ic < 4; ic++) { // dont necessary because this makes change to 0 - if( cs.m_jacCoeffInv[ic] == 0.f ) continue; + if (cs.m_jacCoeffInv[ic] == 0.f) continue; { b3Vector3 angular0, angular1, linear; b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA; b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB; - setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 ); + setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, &linear, &angular0, &angular1); - float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1, - linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic]; + float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB) + + cs.m_b[ic]; rambdaDt *= cs.m_jacCoeffInv[ic]; { float prevSum = cs.m_appliedRambdaDt[ic]; float updated = prevSum; updated += rambdaDt; - updated = b3Max( updated, minRambdaDt[ic] ); - updated = b3Min( updated, maxRambdaDt[ic] ); + updated = b3Max(updated, minRambdaDt[ic]); + updated = b3Min(updated, maxRambdaDt[ic]); rambdaDt = updated - prevSum; cs.m_appliedRambdaDt[ic] = updated; } - b3Vector3 linImp0 = invMassA*linear*rambdaDt; - b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; - b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; - b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; + b3Vector3 linImp0 = invMassA * linear * rambdaDt; + b3Vector3 linImp1 = 
invMassB * (-linear) * rambdaDt; + b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt; + b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt; #ifdef _WIN32 - b3Assert(_finite(linImp0.getX())); + b3Assert(_finite(linImp0.getX())); b3Assert(_finite(linImp1.getX())); #endif - if( JACOBI ) + if (JACOBI) { dLinVelA += linImp0; dAngVelA += angImp0; @@ -271,92 +250,83 @@ void solveContact(b3GpuConstraint4& cs, } } - if( JACOBI ) + if (JACOBI) { linVelA += dLinVelA; angVelA += dAngVelA; linVelB += dLinVelB; angVelB += dAngVelB; } - } +static __inline void solveFriction(b3GpuConstraint4& cs, + const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, + const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, + float maxRambdaDt[4], float minRambdaDt[4]) +{ + if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return; + const b3Vector3& center = (const b3Vector3&)cs.m_center; + b3Vector3 n = -(const b3Vector3&)cs.m_linear; - - - static - __inline - void solveFriction(b3GpuConstraint4& cs, - const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA, - const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, - float maxRambdaDt[4], float minRambdaDt[4]) - { - - if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return; - const b3Vector3& center = (const b3Vector3&)cs.m_center; - - b3Vector3 n = -(const b3Vector3&)cs.m_linear; - - b3Vector3 tangent[2]; -#if 1 - b3PlaneSpace1 (n, tangent[0],tangent[1]); + b3Vector3 tangent[2]; +#if 1 + b3PlaneSpace1(n, tangent[0], tangent[1]); #else - b3Vector3 r = cs.m_worldPos[0]-center; - tangent[0] = cross3( n, r ); - tangent[1] = cross3( tangent[0], n ); - tangent[0] = normalize3( tangent[0] ); - tangent[1] = normalize3( tangent[1] ); + b3Vector3 r = cs.m_worldPos[0] - center; + tangent[0] = cross3(n, r); 
+ tangent[1] = cross3(tangent[0], n); + tangent[0] = normalize3(tangent[0]); + tangent[1] = normalize3(tangent[1]); #endif - b3Vector3 angular0, angular1, linear; - b3Vector3 r0 = center - posA; - b3Vector3 r1 = center - posB; - for(int i=0; i<2; i++) - { - setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 ); - float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, - linVelA, angVelA, linVelB, angVelB ); - rambdaDt *= cs.m_fJacCoeffInv[i]; + b3Vector3 angular0, angular1, linear; + b3Vector3 r0 = center - posA; + b3Vector3 r1 = center - posB; + for (int i = 0; i < 2; i++) + { + setLinearAndAngular(tangent[i], r0, r1, &linear, &angular0, &angular1); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB); + rambdaDt *= cs.m_fJacCoeffInv[i]; - { - float prevSum = cs.m_fAppliedRambdaDt[i]; - float updated = prevSum; - updated += rambdaDt; - updated = b3Max( updated, minRambdaDt[i] ); - updated = b3Min( updated, maxRambdaDt[i] ); - rambdaDt = updated - prevSum; - cs.m_fAppliedRambdaDt[i] = updated; - } + { + float prevSum = cs.m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = b3Max(updated, minRambdaDt[i]); + updated = b3Min(updated, maxRambdaDt[i]); + rambdaDt = updated - prevSum; + cs.m_fAppliedRambdaDt[i] = updated; + } - b3Vector3 linImp0 = invMassA*linear*rambdaDt; - b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt; - b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt; - b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt; + b3Vector3 linImp0 = invMassA * linear * rambdaDt; + b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt; + b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt; + b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt; #ifdef _WIN32 - b3Assert(_finite(linImp0.getX())); - b3Assert(_finite(linImp1.getX())); + b3Assert(_finite(linImp0.getX())); + b3Assert(_finite(linImp1.getX())); #endif - linVelA += linImp0; - angVelA += 
angImp0; - linVelB += linImp1; - angVelB += angImp1; - } + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } - { // angular damping for point constraint - b3Vector3 ab = ( posB - posA ).normalized(); - b3Vector3 ac = ( center - posA ).normalized(); - if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) - { - float angNA = b3Dot( n, angVelA ); - float angNB = b3Dot( n, angVelB ); + { // angular damping for point constraint + b3Vector3 ab = (posB - posA).normalized(); + b3Vector3 ac = (center - posA).normalized(); + if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = b3Dot(n, angVelA); + float angNB = b3Dot(n, angVelB); - angVelA -= (angNA*0.1f)*n; - angVelB -= (angNB*0.1f)*n; - } + angVelA -= (angNA * 0.1f) * n; + angVelB -= (angNB * 0.1f) * n; } - } +} /* b3AlignedObjectArray<b3RigidBodyData>& m_bodies; b3AlignedObjectArray<b3InertiaData>& m_shapes; @@ -370,79 +340,69 @@ void solveContact(b3GpuConstraint4& cs, int m_maxNumBatches; */ -struct SolveTask// : public ThreadPool::Task +struct SolveTask // : public ThreadPool::Task { - SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints, - int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex) - : m_bodies( bodies ), m_shapes( shapes ), - m_constraints( constraints ), - m_batchSizes(batchSizes), - m_cellIndex(cellIndex), - m_curWgidx(curWgidx), - m_start( start ), - m_nConstraints( nConstraints ), - m_solveFriction( true ), - m_maxNumBatches(maxNumBatches) - {} - - unsigned short int getType(){ return 0; } + SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints, + int start, int nConstraints, int maxNumBatches, 
b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex) + : m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_batchSizes(batchSizes), m_cellIndex(cellIndex), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches) + { + } + + unsigned short int getType() { return 0; } void run(int tIdx) { int offset = 0; - for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++) + for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++) { - int numInBatch = m_batchSizes->at(m_cellIndex*B3_MAX_NUM_BATCHES+ii); + int numInBatch = m_batchSizes->at(m_cellIndex * B3_MAX_NUM_BATCHES + ii); if (!numInBatch) break; - for (int jj=0;jj<numInBatch;jj++) + for (int jj = 0; jj < numInBatch; jj++) { - int i = m_start + offset+jj; + int i = m_start + offset + jj; int batchId = m_constraints[i].m_batchIdx; - b3Assert(batchId==ii); + b3Assert(batchId == ii); float frictionCoeff = m_constraints[i].getFrictionCoeff(); int aIdx = (int)m_constraints[i].m_bodyA; int bIdx = (int)m_constraints[i].m_bodyB; -// int localBatch = m_constraints[i].m_batchIdx; + // int localBatch = m_constraints[i].m_batchIdx; b3RigidBodyData& bodyA = m_bodies[aIdx]; b3RigidBodyData& bodyB = m_bodies[bIdx]; - if( !m_solveFriction ) + if (!m_solveFriction) { - float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; + float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f}; - solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, - (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld, - maxRambdaDt, minRambdaDt ); + solveContact<false>(m_constraints[i], (b3Vector3&)bodyA.m_pos, 
(b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt); } else { - float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; + float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f}; float sum = 0; - for(int j=0; j<4; j++) + for (int j = 0; j < 4; j++) { - sum +=m_constraints[i].m_appliedRambdaDt[j]; + sum += m_constraints[i].m_appliedRambdaDt[j]; } frictionCoeff = 0.7f; - for(int j=0; j<4; j++) + for (int j = 0; j < 4; j++) { - maxRambdaDt[j] = frictionCoeff*sum; + maxRambdaDt[j] = frictionCoeff * sum; minRambdaDt[j] = -maxRambdaDt[j]; } - solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, - (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld, - maxRambdaDt, minRambdaDt ); - + solveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld, + (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld, + maxRambdaDt, minRambdaDt); } } - offset+=numInBatch; - - + offset += numInBatch; } -/* for (int bb=0;bb<m_maxNumBatches;bb++) + /* for (int bb=0;bb<m_maxNumBatches;bb++) { //for(int ic=m_nConstraints-1; ic>=0; ic--) for(int ic=0; ic<m_nConstraints; ic++) @@ -491,9 +451,6 @@ struct SolveTask// : public ThreadPool::Task } } */ - - - } b3AlignedObjectArray<b3RigidBodyData>& m_bodies; @@ 
-508,11 +465,9 @@ struct SolveTask// : public ThreadPool::Task int m_maxNumBatches; }; - -void b3Solver::solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,b3AlignedObjectArray<int>* batchSizes) +void b3Solver::solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes) { - #if 0 { int nSplitX = B3_SOLVER_N_SPLIT_X; @@ -571,114 +526,105 @@ void b3Solver::solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* body //printf("------------------------\n"); b3AlignedObjectArray<unsigned int> offsetsHost; m_offsets->copyToHost(offsetsHost); - static int frame=0; - bool useBatches=true; + static int frame = 0; + bool useBatches = true; if (useBatches) { - for(int iter=0; iter<m_nIterations; iter++) + for (int iter = 0; iter < m_nIterations; iter++) { - for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++) + for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++) { - int nSplitX = B3_SOLVER_N_SPLIT_X; int nSplitY = B3_SOLVER_N_SPLIT_Y; - int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES; + int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES; //printf("cell Batch %d\n",cellBatch); b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS]; - for (int i=0;i<B3_SOLVER_N_CELLS;i++) + for (int i = 0; i < B3_SOLVER_N_CELLS; i++) { usedBodies[i].resize(0); } - - - //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--) - for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++) + for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++) { - int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2); - int remain= (wgIdx%((nSplitX*nSplitY)/4)); - int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); - int xIdx = 
(remain%(nSplitX/2))*2 + (cellBatch&1); - int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); - - - if( numConstraintsHost[cellIdx] == 0 ) + int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2); + int remain = (wgIdx % ((nSplitX * nSplitY) / 4)); + int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1); + int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1); + int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY); + + if (numConstraintsHost[cellIdx] == 0) continue; //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch); //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]); if (zIdx) { - //printf("?\n"); + //printf("?\n"); } - if (iter==0) + if (iter == 0) { //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx); //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]); } const int start = offsetsHost[cellIdx]; int numConstraintsInCell = numConstraintsHost[cellIdx]; - // const int end = start + numConstraintsInCell; + // const int end = start + numConstraintsInCell; - SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx,batchSizes,cellIdx); + SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, usedBodies, wgIdx, batchSizes, cellIdx); task.m_solveFriction = false; task.run(0); - } } } - for(int iter=0; iter<m_nIterations; iter++) + for (int iter = 0; iter < m_nIterations; iter++) { - for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++) + for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++) { int nSplitX = B3_SOLVER_N_SPLIT_X; int nSplitY = B3_SOLVER_N_SPLIT_Y; - - int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES; + int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES; - for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++) + 
for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++) { - int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2); - int remain= (wgIdx%((nSplitX*nSplitY)/4)); - int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1); - int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1); - - int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); - - if( numConstraintsHost[cellIdx] == 0 ) + int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2); + int remain = (wgIdx % ((nSplitX * nSplitY) / 4)); + int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1); + int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1); + + int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY); + + if (numConstraintsHost[cellIdx] == 0) continue; - + //printf("yIdx=%d\n",yIdx); - + const int start = offsetsHost[cellIdx]; int numConstraintsInCell = numConstraintsHost[cellIdx]; - // const int end = start + numConstraintsInCell; + // const int end = start + numConstraintsInCell; - SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0,batchSizes,cellIdx); + SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, 0, 0, batchSizes, cellIdx); task.m_solveFriction = true; task.run(0); - } } } - - - } else + } + else { - for(int iter=0; iter<m_nIterations; iter++) + for (int iter = 0; iter < m_nIterations; iter++) { - SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0); + SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0); task.m_solveFriction = false; task.run(0); } - for(int iter=0; iter<m_nIterations; iter++) + for (int iter = 0; iter < m_nIterations; iter++) { - SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0); + SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0); task.m_solveFriction = true; task.run(0); } @@ -688,23 
+634,21 @@ void b3Solver::solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* body shapeBuf->copyFromHost(shapeNative); constraint->copyFromHost(constraintNative); frame++; - } void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, - const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, - b3OpenCLArray<unsigned int>* m_numConstraints, - b3OpenCLArray<unsigned int>* m_offsets, - int batchId - ) + const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, + b3OpenCLArray<unsigned int>* m_numConstraints, + b3OpenCLArray<unsigned int>* m_offsets, + int batchId) { -// b3BufferInfoCL( m_numConstraints->getBufferCL() ), -// b3BufferInfoCL( m_offsets->getBufferCL() ) - + // b3BufferInfoCL( m_numConstraints->getBufferCL() ), + // b3BufferInfoCL( m_offsets->getBufferCL() ) + int cellBatch = batchId; const int nn = B3_SOLVER_N_CELLS; -// int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; + // int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; b3AlignedObjectArray<unsigned int> gN; m_numConstraints->copyToHost(gN); @@ -712,243 +656,220 @@ void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, m_offsets->copyToHost(gOffsets); int nSplitX = B3_SOLVER_N_SPLIT_X; int nSplitY = B3_SOLVER_N_SPLIT_Y; - -// int bIdx = batchId; + + // int bIdx = batchId; b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints; constraint->copyToHost(cpuConstraints); printf("batch = %d\n", batchId); - int numWorkgroups = nn/B3_SOLVER_N_BATCHES; + int numWorkgroups = nn / B3_SOLVER_N_BATCHES; b3AlignedObjectArray<int> usedBodies; - - for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++) + for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++) { printf("wgIdx = %d ", wgIdx); - int zIdx = (wgIdx/((nSplitX*nSplitY))/2)*2+((cellBatch&4)>>2); - int remain = wgIdx%((nSplitX*nSplitY)); - int yIdx = (remain%(nSplitX/2))*2 + ((cellBatch&2)>>1); - int xIdx = (remain/(nSplitX/2))*2 + (cellBatch&1); + int zIdx = (wgIdx / 
((nSplitX * nSplitY)) / 2) * 2 + ((cellBatch & 4) >> 2); + int remain = wgIdx % ((nSplitX * nSplitY)); + int yIdx = (remain % (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1); + int xIdx = (remain / (nSplitX / 2)) * 2 + (cellBatch & 1); - - int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY); - printf("cellIdx=%d\n",cellIdx); - if( gN[cellIdx] == 0 ) + int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY); + printf("cellIdx=%d\n", cellIdx); + if (gN[cellIdx] == 0) continue; const int start = gOffsets[cellIdx]; const int end = start + gN[cellIdx]; - for (int c=start;c<end;c++) + for (int c = start; c < end; c++) { b3GpuConstraint4& constraint = cpuConstraints[c]; //printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB); - if (usedBodies.findLinearSearch(constraint.m_bodyA)< usedBodies.size()) + if (usedBodies.findLinearSearch(constraint.m_bodyA) < usedBodies.size()) { printf("error?\n"); } - if (usedBodies.findLinearSearch(constraint.m_bodyB)< usedBodies.size()) + if (usedBodies.findLinearSearch(constraint.m_bodyB) < usedBodies.size()) { printf("error?\n"); } } - for (int c=start;c<end;c++) + for (int c = start; c < end; c++) { b3GpuConstraint4& constraint = cpuConstraints[c]; usedBodies.push_back(constraint.m_bodyA); usedBodies.push_back(constraint.m_bodyB); } - } } -static bool verify=false; +static bool verify = false; -void b3Solver::solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches) +void b3Solver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches) { - - - b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 ); + b3Int4 cdata = b3MakeInt4(n, 0, 0, 0); { - const int nn = B3_SOLVER_N_CELLS; cdata.x = 0; - cdata.y = maxNumBatches;//250; - + 
cdata.y = maxNumBatches; //250; - int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES; + int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES; #ifdef DEBUG_ME - SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; - adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems); + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems); #endif - - { - B3_PROFILE("m_batchSolveKernel iterations"); - for(int iter=0; iter<m_nIterations; iter++) + for (int iter = 0; iter < m_nIterations; iter++) { - for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++) { - if (verify) { - checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib); + checkConstraintBatch(bodyBuf, shapeBuf, constraint, m_numConstraints, m_offsets, ib); } #ifdef DEBUG_ME - memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems); - gpuDebugInfo.write(debugInfo,numWorkItems); + memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems); + gpuDebugInfo.write(debugInfo, numWorkItems); #endif - cdata.z = ib; - - b3LauncherCL launcher( m_queue, m_solveContactKernel ,"m_solveContactKernel"); + b3LauncherCL launcher(m_queue, m_solveContactKernel, "m_solveContactKernel"); #if 1 - - b3BufferInfoCL bInfo[] = { - - b3BufferInfoCL( bodyBuf->getBufferCL() ), - b3BufferInfoCL( shapeBuf->getBufferCL() ), - b3BufferInfoCL( constraint->getBufferCL() ), - b3BufferInfoCL( m_numConstraints->getBufferCL() ), - b3BufferInfoCL( m_offsets->getBufferCL() ) + + b3BufferInfoCL bInfo[] = { + + b3BufferInfoCL(bodyBuf->getBufferCL()), + b3BufferInfoCL(shapeBuf->getBufferCL()), + b3BufferInfoCL(constraint->getBufferCL()), + b3BufferInfoCL(m_numConstraints->getBufferCL()), + b3BufferInfoCL(m_offsets->getBufferCL()) #ifdef DEBUG_ME - , b3BufferInfoCL(&gpuDebugInfo) + , + b3BufferInfoCL(&gpuDebugInfo) #endif - }; - - + }; - launcher.setBuffers( bInfo, 
sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); //launcher.setConst( cdata.x ); - launcher.setConst( cdata.y ); - launcher.setConst( cdata.z ); - b3Int4 nSplit; + launcher.setConst(cdata.y); + launcher.setConst(cdata.z); + b3Int4 nSplit; nSplit.x = B3_SOLVER_N_SPLIT_X; nSplit.y = B3_SOLVER_N_SPLIT_Y; nSplit.z = B3_SOLVER_N_SPLIT_Z; - launcher.setConst( nSplit ); - launcher.launch1D( numWorkItems, 64 ); + launcher.setConst(nSplit); + launcher.launch1D(numWorkItems, 64); - #else - const char* fileName = "m_batchSolveKernel.bin"; - FILE* f = fopen(fileName,"rb"); - if (f) - { - int sizeInBytes=0; - if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) - { - printf("error, cannot get file size\n"); - exit(0); - } - - unsigned char* buf = (unsigned char*) malloc(sizeInBytes); - fread(buf,sizeInBytes,1,f); - int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); - int num = *(int*)&buf[serializedBytes]; - - launcher.launch1D( num); - - //this clFinish is for testing on errors - clFinish(m_queue); - } + const char* fileName = "m_batchSolveKernel.bin"; + FILE* f = fopen(fileName, "rb"); + if (f) + { + int sizeInBytes = 0; + if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) + { + printf("error, cannot get file size\n"); + exit(0); + } + + unsigned char* buf = (unsigned char*)malloc(sizeInBytes); + fread(buf, sizeInBytes, 1, f); + int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context); + int num = *(int*)&buf[serializedBytes]; + + launcher.launch1D(num); + + //this clFinish is for testing on errors + clFinish(m_queue); + } #endif - #ifdef DEBUG_ME clFinish(m_queue); - gpuDebugInfo.read(debugInfo,numWorkItems); + gpuDebugInfo.read(debugInfo, numWorkItems); clFinish(m_queue); - for (int i=0;i<numWorkItems;i++) + for (int i = 0; i < numWorkItems; i++) { - if (debugInfo[i].m_valInt2>0) + if 
(debugInfo[i].m_valInt2 > 0) { - printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2); + printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2); } - if (debugInfo[i].m_valInt3>0) + if (debugInfo[i].m_valInt3 > 0) { - printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3); + printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3); } } -#endif //DEBUG_ME - - +#endif //DEBUG_ME } } - - clFinish(m_queue); - + clFinish(m_queue); } cdata.x = 1; - bool applyFriction=true; + bool applyFriction = true; if (applyFriction) - { + { B3_PROFILE("m_batchSolveKernel iterations2"); - for(int iter=0; iter<m_nIterations; iter++) + for (int iter = 0; iter < m_nIterations; iter++) { - for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++) + for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++) { cdata.z = ib; - - - b3BufferInfoCL bInfo[] = { - b3BufferInfoCL( bodyBuf->getBufferCL() ), - b3BufferInfoCL( shapeBuf->getBufferCL() ), - b3BufferInfoCL( constraint->getBufferCL() ), - b3BufferInfoCL( m_numConstraints->getBufferCL() ), - b3BufferInfoCL( m_offsets->getBufferCL() ) + + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL(bodyBuf->getBufferCL()), + b3BufferInfoCL(shapeBuf->getBufferCL()), + b3BufferInfoCL(constraint->getBufferCL()), + b3BufferInfoCL(m_numConstraints->getBufferCL()), + b3BufferInfoCL(m_offsets->getBufferCL()) #ifdef DEBUG_ME - ,b3BufferInfoCL(&gpuDebugInfo) -#endif //DEBUG_ME + , + b3BufferInfoCL(&gpuDebugInfo) +#endif //DEBUG_ME }; - b3LauncherCL launcher( m_queue, m_solveFrictionKernel,"m_solveFrictionKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + b3LauncherCL launcher(m_queue, m_solveFrictionKernel, "m_solveFrictionKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); //launcher.setConst( cdata.x ); - launcher.setConst( cdata.y ); - launcher.setConst( cdata.z ); - b3Int4 nSplit; + launcher.setConst(cdata.y); + launcher.setConst(cdata.z); + b3Int4 nSplit; nSplit.x = 
B3_SOLVER_N_SPLIT_X; nSplit.y = B3_SOLVER_N_SPLIT_Y; nSplit.z = B3_SOLVER_N_SPLIT_Z; - launcher.setConst( nSplit ); - - launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 ); + launcher.setConst(nSplit); + + launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64); } } clFinish(m_queue); - } #ifdef DEBUG_ME delete[] debugInfo; -#endif //DEBUG_ME +#endif //DEBUG_ME } - - } -void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, - const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, - int nContacts, const ConstraintCfg& cfg ) +void b3Solver::convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, + int nContacts, const ConstraintCfg& cfg) { -// b3OpenCLArray<b3GpuConstraint4>* constraintNative =0; + // b3OpenCLArray<b3GpuConstraint4>* constraintNative =0; contactCOut->resize(nContacts); struct CB { @@ -959,30 +880,28 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyB }; { - CB cdata; cdata.m_nContacts = nContacts; cdata.m_dt = cfg.m_dt; cdata.m_positionDrift = cfg.m_positionDrift; cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff; - if (gConvertConstraintOnCpu) { b3AlignedObjectArray<b3RigidBodyData> gBodies; - bodyBuf->copyToHost(gBodies); + bodyBuf->copyToHost(gBodies); - b3AlignedObjectArray<b3Contact4> gContact; - contactsIn->copyToHost(gContact); + b3AlignedObjectArray<b3Contact4> gContact; + contactsIn->copyToHost(gContact); + + b3AlignedObjectArray<b3InertiaData> gShapes; + shapeBuf->copyToHost(gShapes); + + b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut; + gConstraintOut.resize(nContacts); - b3AlignedObjectArray<b3InertiaData> gShapes; - shapeBuf->copyToHost(gShapes); - - b3AlignedObjectArray<b3GpuConstraint4> 
gConstraintOut; - gConstraintOut.resize(nContacts); - B3_PROFILE("cpu contactToConstraintKernel"); - for (int gIdx=0;gIdx<nContacts;gIdx++) + for (int gIdx = 0; gIdx < nContacts; gIdx++) { int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); @@ -1001,40 +920,36 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyB b3ContactConstraint4_t cs; - setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, - &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff, - &cs ); - + setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff, + &cs); + cs.m_batchIdx = gContact[gIdx].m_batchIdx; gConstraintOut[gIdx] = (b3GpuConstraint4&)cs; } contactCOut->copyFromHost(gConstraintOut); - - } else + } + else { B3_PROFILE("gpu m_contactToConstraintKernel"); - - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()), - b3BufferInfoCL( contactCOut->getBufferCL() )}; - b3LauncherCL launcher( m_queue, m_contactToConstraintKernel,"m_contactToConstraintKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(contactsIn->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(shapeBuf->getBufferCL()), + b3BufferInfoCL(contactCOut->getBufferCL())}; + b3LauncherCL launcher(m_queue, m_contactToConstraintKernel, "m_contactToConstraintKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); //launcher.setConst( cdata ); - + launcher.setConst(cdata.m_nContacts); launcher.setConst(cdata.m_dt); launcher.setConst(cdata.m_positionDrift); launcher.setConst(cdata.m_positionConstraintCoeff); - - 
launcher.launch1D( nContacts, 64 ); - clFinish(m_queue); + launcher.launch1D(nContacts, 64); + clFinish(m_queue); } } - - } /* @@ -1115,28 +1030,24 @@ void b3Solver::sortContacts( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, } */ -void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx ) +void b3Solver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx) { - - int numWorkItems = 64*B3_SOLVER_N_CELLS; + int numWorkItems = 64 * B3_SOLVER_N_CELLS; { B3_PROFILE("batch generation"); - + b3Int4 cdata; cdata.x = nContacts; cdata.y = 0; cdata.z = staticIdx; - #ifdef BATCH_DEBUG - SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; - adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems); - memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems); - gpuDebugInfo.write(debugInfo,numWorkItems); + SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems]; + adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems); + memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems); + gpuDebugInfo.write(debugInfo, numWorkItems); #endif - - #if 0 b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contacts->getBufferCL() ), @@ -1148,8 +1059,6 @@ void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContact #endif }; #endif - - { m_batchSizes.resize(nNative->size()); @@ -1157,22 +1066,21 @@ void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContact //b3LauncherCL launcher( m_queue, m_batchingKernel); cl_kernel k = useNewBatchingKernel ? 
m_batchingKernelNew : m_batchingKernel; - b3LauncherCL launcher( m_queue, k,"*batchingKernel"); - if (!useNewBatchingKernel ) + b3LauncherCL launcher(m_queue, k, "*batchingKernel"); + if (!useNewBatchingKernel) { - launcher.setBuffer( contacts->getBufferCL() ); + launcher.setBuffer(contacts->getBufferCL()); } - launcher.setBuffer( m_contactBuffer2->getBufferCL() ); - launcher.setBuffer( nNative->getBufferCL()); - launcher.setBuffer( offsetsNative->getBufferCL()); - + launcher.setBuffer(m_contactBuffer2->getBufferCL()); + launcher.setBuffer(nNative->getBufferCL()); + launcher.setBuffer(offsetsNative->getBufferCL()); + launcher.setBuffer(m_batchSizes.getBufferCL()); - //launcher.setConst( cdata ); - launcher.setConst(staticIdx); - - launcher.launch1D( numWorkItems, 64 ); + launcher.setConst(staticIdx); + + launcher.launch1D(numWorkItems, 64); //clFinish(m_queue); //b3AlignedObjectArray<int> batchSizesCPU; //m_batchSizes.copyToHost(batchSizesCPU); @@ -1180,46 +1088,41 @@ void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContact } #ifdef BATCH_DEBUG - aaaa - b3Contact4* hostContacts = new b3Contact4[nContacts]; - m_contactBuffer->read(hostContacts,nContacts); + aaaa + b3Contact4* hostContacts = new b3Contact4[nContacts]; + m_contactBuffer->read(hostContacts, nContacts); clFinish(m_queue); - gpuDebugInfo.read(debugInfo,numWorkItems); + gpuDebugInfo.read(debugInfo, numWorkItems); clFinish(m_queue); - for (int i=0;i<numWorkItems;i++) + for (int i = 0; i < numWorkItems; i++) { - if (debugInfo[i].m_valInt1>0) + if (debugInfo[i].m_valInt1 > 0) { printf("catch\n"); } - if (debugInfo[i].m_valInt2>0) + if (debugInfo[i].m_valInt2 > 0) { printf("catch22\n"); } - if (debugInfo[i].m_valInt3>0) + if (debugInfo[i].m_valInt3 > 0) { printf("catch666\n"); } - if (debugInfo[i].m_valInt4>0) + if (debugInfo[i].m_valInt4 > 0) { printf("catch777\n"); } } delete[] debugInfo; -#endif //BATCH_DEBUG - +#endif //BATCH_DEBUG } -// copy buffer to buffer + // copy buffer to 
buffer //b3Assert(m_contactBuffer->size()==nContacts); //contacts->copyFromOpenCLArray( *m_contactBuffer); //clFinish(m_queue);//needed? - - - } - diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.h index b37f2f1bec..ee63531d78 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.h @@ -13,7 +13,6 @@ subject to the following restrictions: */ //Originally written by Takahiro Harada - #ifndef __ADL_SOLVER_H #define __ADL_SOLVER_H @@ -29,98 +28,83 @@ subject to the following restrictions: #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" - -#define B3NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment)) +#define B3NEXTMULTIPLEOF(num, alignment) (((num) / (alignment) + (((num) % (alignment) == 0) ? 0 : 1)) * (alignment)) enum { - B3_SOLVER_N_SPLIT_X = 8,//16,//4, - B3_SOLVER_N_SPLIT_Y = 4,//16,//4, - B3_SOLVER_N_SPLIT_Z = 8,//, - B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X*B3_SOLVER_N_SPLIT_Y*B3_SOLVER_N_SPLIT_Z, - B3_SOLVER_N_BATCHES = 8,//4,//8,//4, + B3_SOLVER_N_SPLIT_X = 8, //16,//4, + B3_SOLVER_N_SPLIT_Y = 4, //16,//4, + B3_SOLVER_N_SPLIT_Z = 8, //, + B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X * B3_SOLVER_N_SPLIT_Y * B3_SOLVER_N_SPLIT_Z, + B3_SOLVER_N_BATCHES = 8, //4,//8,//4, B3_MAX_NUM_BATCHES = 128, }; class b3SolverBase { - public: - - - struct ConstraintCfg - { - ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(-1) {} - - float m_positionDrift; - float m_positionConstraintCoeff; - float m_dt; - bool m_enableParallelSolve; - float m_batchCellSize; - int m_staticIdx; - }; - +public: + struct ConstraintCfg + { + ConstraintCfg(float dt = 0.f) : m_positionDrift(0.005f), m_positionConstraintCoeff(0.2f), m_dt(dt), m_staticIdx(-1) {} + + float m_positionDrift; + float m_positionConstraintCoeff; + float m_dt; + bool 
m_enableParallelSolve; + float m_batchCellSize; + int m_staticIdx; + }; }; class b3Solver : public b3SolverBase { - public: - - cl_context m_context; - cl_device_id m_device; - cl_command_queue m_queue; - - - b3OpenCLArray<unsigned int>* m_numConstraints; - b3OpenCLArray<unsigned int>* m_offsets; - b3OpenCLArray<int> m_batchSizes; - - - int m_nIterations; - cl_kernel m_batchingKernel; - cl_kernel m_batchingKernelNew; - cl_kernel m_solveContactKernel; - cl_kernel m_solveFrictionKernel; - cl_kernel m_contactToConstraintKernel; - cl_kernel m_setSortDataKernel; - cl_kernel m_reorderContactKernel; - cl_kernel m_copyConstraintKernel; +public: + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; - class b3RadixSort32CL* m_sort32; - class b3BoundSearchCL* m_search; - class b3PrefixScanCL* m_scan; + b3OpenCLArray<unsigned int>* m_numConstraints; + b3OpenCLArray<unsigned int>* m_offsets; + b3OpenCLArray<int> m_batchSizes; - b3OpenCLArray<b3SortData>* m_sortDataBuffer; - b3OpenCLArray<b3Contact4>* m_contactBuffer2; + int m_nIterations; + cl_kernel m_batchingKernel; + cl_kernel m_batchingKernelNew; + cl_kernel m_solveContactKernel; + cl_kernel m_solveFrictionKernel; + cl_kernel m_contactToConstraintKernel; + cl_kernel m_setSortDataKernel; + cl_kernel m_reorderContactKernel; + cl_kernel m_copyConstraintKernel; - enum - { - DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000, - }; + class b3RadixSort32CL* m_sort32; + class b3BoundSearchCL* m_search; + class b3PrefixScanCL* m_scan; - + b3OpenCLArray<b3SortData>* m_sortDataBuffer; + b3OpenCLArray<b3Contact4>* m_contactBuffer2; - - b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity); + enum + { + DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000, + }; - virtual ~b3Solver(); - - void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int 
maxNumBatches); + b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity); - void solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, b3AlignedObjectArray<int>* batchSizes); + virtual ~b3Solver(); + void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches); - void convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, - const b3OpenCLArray<b3InertiaData>* shapeBuf, - b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, - int nContacts, const ConstraintCfg& cfg ); + void solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes); - void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx ); + void convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, + const b3OpenCLArray<b3InertiaData>* shapeBuf, + b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, + int nContacts, const ConstraintCfg& cfg); + void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx); }; - - - -#endif //__ADL_SOLVER_H +#endif //__ADL_SOLVER_H diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h index 150eedc94b..7c73c96baa 100644 --- 
a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h @@ -1,388 +1,387 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* batchingKernelsCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. 
This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = 
dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return (int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile __global int*\n" -"#endif\n" -"typedef unsigned int u32;\n" -"typedef unsigned short u16;\n" -"typedef unsigned char u8;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define 
GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_float4 (float4)\n" -"#define make_float2 (float2)\n" -"#define make_uint4 (uint4)\n" -"#define make_int4 (int4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"#define max2 max\n" -"#define min2 min\n" -"#define WG_SIZE 64\n" -"typedef struct \n" -"{\n" -" int m_n;\n" -" int m_start;\n" -" int m_staticIdx;\n" -" int m_paddings[1];\n" -"} ConstBuffer;\n" -"typedef struct \n" -"{\n" -" int m_a;\n" -" int m_b;\n" -" u32 m_idx;\n" -"}Elem;\n" -"#define STACK_SIZE (WG_SIZE*10)\n" -"//#define STACK_SIZE (WG_SIZE)\n" -"#define RING_SIZE 1024\n" -"#define RING_SIZE_MASK (RING_SIZE-1)\n" -"#define CHECK_SIZE (WG_SIZE)\n" -"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n" -"#define RING_END ldsTmp\n" -"u32 readBuf(__local u32* buff, int idx)\n" -"{\n" -" idx = idx % (32*CHECK_SIZE);\n" -" int bitIdx = idx%32;\n" -" int bufIdx = idx/32;\n" -" return buff[bufIdx] & (1<<bitIdx);\n" -"}\n" -"void writeBuf(__local u32* buff, int idx)\n" -"{\n" -" idx = idx % (32*CHECK_SIZE);\n" -" int bitIdx = idx%32;\n" -" int bufIdx = idx/32;\n" -"// buff[bufIdx] |= (1<<bitIdx);\n" -" atom_or( &buff[bufIdx], (1<<bitIdx) );\n" -"}\n" -"u32 tryWrite(__local u32* buff, int idx)\n" -"{\n" -" idx = idx % (32*CHECK_SIZE);\n" -" int bitIdx = idx%32;\n" -" int bufIdx = idx/32;\n" -" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n" -" return ((ans >> bitIdx)&1) == 0;\n" -"}\n" -"// batching on the GPU\n" -"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* 
gConstraintsOut,\n" -" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n" -" int m_staticIdx )\n" -"{\n" -" __local u32 ldsStackIdx[STACK_SIZE];\n" -" __local u32 ldsStackEnd;\n" -" __local Elem ldsRingElem[RING_SIZE];\n" -" __local u32 ldsRingEnd;\n" -" __local u32 ldsTmp;\n" -" __local u32 ldsCheckBuffer[CHECK_SIZE];\n" -" __local u32 ldsFixedBuffer[CHECK_SIZE];\n" -" __local u32 ldsGEnd;\n" -" __local u32 ldsDstEnd;\n" -" int wgIdx = GET_GROUP_IDX;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" \n" -" const int m_n = gN[wgIdx];\n" -" const int m_start = gStart[wgIdx];\n" -" \n" -" if( lIdx == 0 )\n" -" {\n" -" ldsRingEnd = 0;\n" -" ldsGEnd = 0;\n" -" ldsStackEnd = 0;\n" -" ldsDstEnd = m_start;\n" -" }\n" -" \n" -" \n" -" \n" -"// while(1)\n" -"//was 250\n" -" int ie=0;\n" -" int maxBatch = 0;\n" -" for(ie=0; ie<50; ie++)\n" -" {\n" -" ldsFixedBuffer[lIdx] = 0;\n" -" for(int giter=0; giter<4; giter++)\n" -" {\n" -" int ringCap = GET_RING_CAPACITY;\n" -" \n" -" // 1. fill ring\n" -" if( ldsGEnd < m_n )\n" -" {\n" -" while( ringCap > WG_SIZE )\n" -" {\n" -" if( ldsGEnd >= m_n ) break;\n" -" if( lIdx < ringCap - WG_SIZE )\n" -" {\n" -" int srcIdx;\n" -" AtomInc1( ldsGEnd, srcIdx );\n" -" if( srcIdx < m_n )\n" -" {\n" -" int dstIdx;\n" -" AtomInc1( ldsRingEnd, dstIdx );\n" -" \n" -" int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n" -" int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n" -" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n" -" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n" -" ldsRingElem[dstIdx].m_idx = srcIdx;\n" -" }\n" -" }\n" -" ringCap = GET_RING_CAPACITY;\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" \n" -" // 2. 
fill stack\n" -" __local Elem* dst = ldsRingElem;\n" -" if( lIdx == 0 ) RING_END = 0;\n" -" int srcIdx=lIdx;\n" -" int end = ldsRingEnd;\n" -" {\n" -" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n" -" {\n" -" Elem e;\n" -" if(srcIdx<end) e = ldsRingElem[srcIdx];\n" -" bool done = (srcIdx<end)?false:true;\n" -" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n" -" \n" -" if( !done )\n" -" {\n" -" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n" -" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n" -" if( aUsed==0 && bUsed==0 )\n" -" {\n" -" int aAvailable=1;\n" -" int bAvailable=1;\n" -" int ea = abs(e.m_a);\n" -" int eb = abs(e.m_b);\n" -" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n" -" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n" -" \n" -" if (!aStatic)\n" -" aAvailable = tryWrite( ldsCheckBuffer, ea );\n" -" if (!bStatic)\n" -" bAvailable = tryWrite( ldsCheckBuffer, eb );\n" -" \n" -" //aAvailable = aStatic? 1: aAvailable;\n" -" //bAvailable = bStatic? 
1: bAvailable;\n" -" bool success = (aAvailable && bAvailable);\n" -" if(success)\n" -" {\n" -" \n" -" if (!aStatic)\n" -" writeBuf( ldsFixedBuffer, ea );\n" -" if (!bStatic)\n" -" writeBuf( ldsFixedBuffer, eb );\n" -" }\n" -" done = success;\n" -" }\n" -" }\n" -" // put it aside\n" -" if(srcIdx<end)\n" -" {\n" -" if( done )\n" -" {\n" -" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n" -" if( dstIdx < STACK_SIZE )\n" -" ldsStackIdx[dstIdx] = e.m_idx;\n" -" else{\n" -" done = false;\n" -" AtomAdd( ldsStackEnd, -1 );\n" -" }\n" -" }\n" -" if( !done )\n" -" {\n" -" int dstIdx; AtomInc1( RING_END, dstIdx );\n" -" dst[dstIdx] = e;\n" -" }\n" -" }\n" -" // if filled, flush\n" -" if( ldsStackEnd == STACK_SIZE )\n" -" {\n" -" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n" -" {\n" -" int idx = m_start + ldsStackIdx[i];\n" -" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" -" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" -" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" -" }\n" -" if( lIdx == 0 ) ldsStackEnd = 0;\n" -" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n" -" ldsFixedBuffer[lIdx] = 0;\n" -" }\n" -" }\n" -" }\n" -" if( lIdx == 0 ) ldsRingEnd = RING_END;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n" -" {\n" -" int idx = m_start + ldsStackIdx[i];\n" -" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" -" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" -" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" -" }\n" -" // in case it couldn't consume any pair. Flush them\n" -" // todo. 
Serial batch worth while?\n" -" if( ldsStackEnd == 0 )\n" -" {\n" -" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n" -" {\n" -" int idx = m_start + ldsRingElem[i].m_idx;\n" -" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" -" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" -" int curBatch = 100+i;\n" -" if (maxBatch < curBatch)\n" -" maxBatch = curBatch;\n" -" \n" -" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n" -" \n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" if( lIdx == 0 ) ldsRingEnd = 0;\n" -" }\n" -" if( lIdx == 0 ) ldsStackEnd = 0;\n" -" GROUP_LDS_BARRIER;\n" -" // termination\n" -" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n" -" break;\n" -" }\n" -" if( lIdx == 0 )\n" -" {\n" -" if (maxBatch < ie)\n" -" maxBatch=ie;\n" -" batchSizes[wgIdx]=maxBatch;\n" -" }\n" -"}\n" -; +static const char* batchingKernelsCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. 
This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = 
b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + "inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile __global int*\n" + "#endif\n" + "typedef unsigned int u32;\n" + "typedef unsigned short u16;\n" + "typedef unsigned char u8;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE 
get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_float4 (float4)\n" + "#define make_float2 (float2)\n" + "#define make_uint4 (uint4)\n" + "#define make_int4 (int4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "#define max2 max\n" + "#define min2 min\n" + "#define WG_SIZE 64\n" + "typedef struct \n" + "{\n" + " int m_n;\n" + " int m_start;\n" + " int m_staticIdx;\n" + " int m_paddings[1];\n" + "} ConstBuffer;\n" + "typedef struct \n" + "{\n" + " int m_a;\n" + " int m_b;\n" + " u32 m_idx;\n" + "}Elem;\n" + "#define STACK_SIZE (WG_SIZE*10)\n" + "//#define STACK_SIZE (WG_SIZE)\n" + "#define RING_SIZE 1024\n" + "#define RING_SIZE_MASK (RING_SIZE-1)\n" + "#define CHECK_SIZE (WG_SIZE)\n" + "#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n" + "#define RING_END ldsTmp\n" + "u32 readBuf(__local u32* buff, int idx)\n" + "{\n" + " idx = idx % (32*CHECK_SIZE);\n" + " int bitIdx = idx%32;\n" + " int bufIdx = idx/32;\n" + " return buff[bufIdx] & (1<<bitIdx);\n" + "}\n" + "void writeBuf(__local u32* buff, int idx)\n" + "{\n" + " idx = idx % (32*CHECK_SIZE);\n" + " int bitIdx = idx%32;\n" + " int bufIdx = idx/32;\n" + "// buff[bufIdx] |= (1<<bitIdx);\n" + " atom_or( &buff[bufIdx], (1<<bitIdx) );\n" + "}\n" + "u32 tryWrite(__local u32* buff, int idx)\n" + "{\n" + " idx = idx % (32*CHECK_SIZE);\n" + " int bitIdx = idx%32;\n" + " int bufIdx = idx/32;\n" + " u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) 
);\n" + " return ((ans >> bitIdx)&1) == 0;\n" + "}\n" + "// batching on the GPU\n" + "__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n" + " __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n" + " int m_staticIdx )\n" + "{\n" + " __local u32 ldsStackIdx[STACK_SIZE];\n" + " __local u32 ldsStackEnd;\n" + " __local Elem ldsRingElem[RING_SIZE];\n" + " __local u32 ldsRingEnd;\n" + " __local u32 ldsTmp;\n" + " __local u32 ldsCheckBuffer[CHECK_SIZE];\n" + " __local u32 ldsFixedBuffer[CHECK_SIZE];\n" + " __local u32 ldsGEnd;\n" + " __local u32 ldsDstEnd;\n" + " int wgIdx = GET_GROUP_IDX;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " \n" + " const int m_n = gN[wgIdx];\n" + " const int m_start = gStart[wgIdx];\n" + " \n" + " if( lIdx == 0 )\n" + " {\n" + " ldsRingEnd = 0;\n" + " ldsGEnd = 0;\n" + " ldsStackEnd = 0;\n" + " ldsDstEnd = m_start;\n" + " }\n" + " \n" + " \n" + " \n" + "// while(1)\n" + "//was 250\n" + " int ie=0;\n" + " int maxBatch = 0;\n" + " for(ie=0; ie<50; ie++)\n" + " {\n" + " ldsFixedBuffer[lIdx] = 0;\n" + " for(int giter=0; giter<4; giter++)\n" + " {\n" + " int ringCap = GET_RING_CAPACITY;\n" + " \n" + " // 1. fill ring\n" + " if( ldsGEnd < m_n )\n" + " {\n" + " while( ringCap > WG_SIZE )\n" + " {\n" + " if( ldsGEnd >= m_n ) break;\n" + " if( lIdx < ringCap - WG_SIZE )\n" + " {\n" + " int srcIdx;\n" + " AtomInc1( ldsGEnd, srcIdx );\n" + " if( srcIdx < m_n )\n" + " {\n" + " int dstIdx;\n" + " AtomInc1( ldsRingEnd, dstIdx );\n" + " \n" + " int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n" + " int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n" + " ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n" + " ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n" + " ldsRingElem[dstIdx].m_idx = srcIdx;\n" + " }\n" + " }\n" + " ringCap = GET_RING_CAPACITY;\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " \n" + " // 2. 
fill stack\n" + " __local Elem* dst = ldsRingElem;\n" + " if( lIdx == 0 ) RING_END = 0;\n" + " int srcIdx=lIdx;\n" + " int end = ldsRingEnd;\n" + " {\n" + " for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n" + " {\n" + " Elem e;\n" + " if(srcIdx<end) e = ldsRingElem[srcIdx];\n" + " bool done = (srcIdx<end)?false:true;\n" + " for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n" + " \n" + " if( !done )\n" + " {\n" + " int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n" + " int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n" + " if( aUsed==0 && bUsed==0 )\n" + " {\n" + " int aAvailable=1;\n" + " int bAvailable=1;\n" + " int ea = abs(e.m_a);\n" + " int eb = abs(e.m_b);\n" + " bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n" + " bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n" + " \n" + " if (!aStatic)\n" + " aAvailable = tryWrite( ldsCheckBuffer, ea );\n" + " if (!bStatic)\n" + " bAvailable = tryWrite( ldsCheckBuffer, eb );\n" + " \n" + " //aAvailable = aStatic? 1: aAvailable;\n" + " //bAvailable = bStatic? 
1: bAvailable;\n" + " bool success = (aAvailable && bAvailable);\n" + " if(success)\n" + " {\n" + " \n" + " if (!aStatic)\n" + " writeBuf( ldsFixedBuffer, ea );\n" + " if (!bStatic)\n" + " writeBuf( ldsFixedBuffer, eb );\n" + " }\n" + " done = success;\n" + " }\n" + " }\n" + " // put it aside\n" + " if(srcIdx<end)\n" + " {\n" + " if( done )\n" + " {\n" + " int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n" + " if( dstIdx < STACK_SIZE )\n" + " ldsStackIdx[dstIdx] = e.m_idx;\n" + " else{\n" + " done = false;\n" + " AtomAdd( ldsStackEnd, -1 );\n" + " }\n" + " }\n" + " if( !done )\n" + " {\n" + " int dstIdx; AtomInc1( RING_END, dstIdx );\n" + " dst[dstIdx] = e;\n" + " }\n" + " }\n" + " // if filled, flush\n" + " if( ldsStackEnd == STACK_SIZE )\n" + " {\n" + " for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n" + " {\n" + " int idx = m_start + ldsStackIdx[i];\n" + " int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" + " gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" + " gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" + " }\n" + " if( lIdx == 0 ) ldsStackEnd = 0;\n" + " //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n" + " ldsFixedBuffer[lIdx] = 0;\n" + " }\n" + " }\n" + " }\n" + " if( lIdx == 0 ) ldsRingEnd = RING_END;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n" + " {\n" + " int idx = m_start + ldsStackIdx[i];\n" + " int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" + " gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" + " gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" + " }\n" + " // in case it couldn't consume any pair. Flush them\n" + " // todo. 
Serial batch worth while?\n" + " if( ldsStackEnd == 0 )\n" + " {\n" + " for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n" + " {\n" + " int idx = m_start + ldsRingElem[i].m_idx;\n" + " int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" + " gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" + " int curBatch = 100+i;\n" + " if (maxBatch < curBatch)\n" + " maxBatch = curBatch;\n" + " \n" + " gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n" + " \n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " if( lIdx == 0 ) ldsRingEnd = 0;\n" + " }\n" + " if( lIdx == 0 ) ldsStackEnd = 0;\n" + " GROUP_LDS_BARRIER;\n" + " // termination\n" + " if( ldsGEnd == m_n && ldsRingEnd == 0 )\n" + " break;\n" + " }\n" + " if( lIdx == 0 )\n" + " {\n" + " if (maxBatch < ie)\n" + " maxBatch=ie;\n" + " batchSizes[wgIdx]=maxBatch;\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h index 1e5957adae..05800656cb 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h @@ -1,291 +1,290 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* batchingKernelsNewCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Erwin Coumans\n" -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* 
vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return (int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile __global int*\n" -"#endif\n" -"#define SIMD_WIDTH 64\n" -"typedef unsigned int u32;\n" -"typedef unsigned short u16;\n" -"typedef unsigned char u8;\n" -"#define 
GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_float4 (float4)\n" -"#define make_float2 (float2)\n" -"#define make_uint4 (uint4)\n" -"#define make_int4 (int4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"#define max2 max\n" -"#define min2 min\n" -"#define WG_SIZE 64\n" -"typedef struct \n" -"{\n" -" int m_n;\n" -" int m_start;\n" -" int m_staticIdx;\n" -" int m_paddings[1];\n" -"} ConstBuffer;\n" -"typedef struct \n" -"{\n" -" int m_a;\n" -" int m_b;\n" -" u32 m_idx;\n" -"}Elem;\n" -"// batching on the GPU\n" -"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n" -"{\n" -" int wgIdx = GET_GROUP_IDX;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" \n" -" const int m_n = gN[wgIdx];\n" -" const int m_start = gStart[wgIdx];\n" -" \n" -" if( lIdx == 0 )\n" -" {\n" -" for (int i=0;i<m_n;i++)\n" -" {\n" -" int srcIdx = i+m_start;\n" -" int batchIndex = i;\n" -" gConstraints[ srcIdx ].m_batchIdx = batchIndex; \n" -" }\n" -" }\n" -"}\n" -"#define CHECK_SIZE (WG_SIZE)\n" -"u32 readBuf(__local u32* buff, int idx)\n" -"{\n" -" idx = idx % (32*CHECK_SIZE);\n" -" int bitIdx = idx%32;\n" -" int bufIdx = idx/32;\n" -" return buff[bufIdx] & (1<<bitIdx);\n" -"}\n" -"void writeBuf(__local 
u32* buff, int idx)\n" -"{\n" -" idx = idx % (32*CHECK_SIZE);\n" -" int bitIdx = idx%32;\n" -" int bufIdx = idx/32;\n" -" buff[bufIdx] |= (1<<bitIdx);\n" -" //atom_or( &buff[bufIdx], (1<<bitIdx) );\n" -"}\n" -"u32 tryWrite(__local u32* buff, int idx)\n" -"{\n" -" idx = idx % (32*CHECK_SIZE);\n" -" int bitIdx = idx%32;\n" -" int bufIdx = idx/32;\n" -" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n" -" return ((ans >> bitIdx)&1) == 0;\n" -"}\n" -"// batching on the GPU\n" -"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n" -"{\n" -" int wgIdx = GET_GROUP_IDX;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" const int numConstraints = gN[wgIdx];\n" -" const int m_start = gStart[wgIdx];\n" -" b3Contact4Data_t tmp;\n" -" \n" -" __local u32 ldsFixedBuffer[CHECK_SIZE];\n" -" \n" -" \n" -" \n" -" \n" -" \n" -" if( lIdx == 0 )\n" -" {\n" -" \n" -" \n" -" __global struct b3Contact4Data* cs = &gConstraints[m_start]; \n" -" \n" -" \n" -" int numValidConstraints = 0;\n" -" int batchIdx = 0;\n" -" while( numValidConstraints < numConstraints)\n" -" {\n" -" int nCurrentBatch = 0;\n" -" // clear flag\n" -" \n" -" for(int i=0; i<CHECK_SIZE; i++) \n" -" ldsFixedBuffer[i] = 0; \n" -" for(int i=numValidConstraints; i<numConstraints; i++)\n" -" {\n" -" int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n" -" int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n" -" int bodyA = abs(bodyAS);\n" -" int bodyB = abs(bodyBS);\n" -" bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n" -" bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n" -" int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n" -" int bUnavailable = bIsStatic ? 
0 : readBuf( ldsFixedBuffer, bodyB);\n" -" \n" -" if( aUnavailable==0 && bUnavailable==0 ) // ok\n" -" {\n" -" if (!aIsStatic)\n" -" {\n" -" writeBuf( ldsFixedBuffer, bodyA );\n" -" }\n" -" if (!bIsStatic)\n" -" {\n" -" writeBuf( ldsFixedBuffer, bodyB );\n" -" }\n" -" cs[i].m_batchIdx = batchIdx;\n" -" if (i!=numValidConstraints)\n" -" {\n" -" tmp = cs[i];\n" -" cs[i] = cs[numValidConstraints];\n" -" cs[numValidConstraints] = tmp;\n" -" }\n" -" numValidConstraints++;\n" -" \n" -" nCurrentBatch++;\n" -" if( nCurrentBatch == SIMD_WIDTH)\n" -" {\n" -" nCurrentBatch = 0;\n" -" for(int i=0; i<CHECK_SIZE; i++) \n" -" ldsFixedBuffer[i] = 0;\n" -" \n" -" }\n" -" }\n" -" }//for\n" -" batchIdx ++;\n" -" }//while\n" -" \n" -" batchSizes[wgIdx] = batchIdx;\n" -" }//if( lIdx == 0 )\n" -" \n" -" //return batchIdx;\n" -"}\n" -; +static const char* batchingKernelsNewCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. 
This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Erwin Coumans\n" + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = 
b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + "inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile __global int*\n" + "#endif\n" + "#define SIMD_WIDTH 64\n" + "typedef unsigned int u32;\n" + "typedef unsigned short u16;\n" + "typedef unsigned char u8;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + 
"#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_float4 (float4)\n" + "#define make_float2 (float2)\n" + "#define make_uint4 (uint4)\n" + "#define make_int4 (int4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "#define max2 max\n" + "#define min2 min\n" + "#define WG_SIZE 64\n" + "typedef struct \n" + "{\n" + " int m_n;\n" + " int m_start;\n" + " int m_staticIdx;\n" + " int m_paddings[1];\n" + "} ConstBuffer;\n" + "typedef struct \n" + "{\n" + " int m_a;\n" + " int m_b;\n" + " u32 m_idx;\n" + "}Elem;\n" + "// batching on the GPU\n" + "__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n" + "{\n" + " int wgIdx = GET_GROUP_IDX;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " \n" + " const int m_n = gN[wgIdx];\n" + " const int m_start = gStart[wgIdx];\n" + " \n" + " if( lIdx == 0 )\n" + " {\n" + " for (int i=0;i<m_n;i++)\n" + " {\n" + " int srcIdx = i+m_start;\n" + " int batchIndex = i;\n" + " gConstraints[ srcIdx ].m_batchIdx = batchIndex; \n" + " }\n" + " }\n" + "}\n" + "#define CHECK_SIZE (WG_SIZE)\n" + "u32 readBuf(__local u32* buff, int idx)\n" + "{\n" + " idx = idx % (32*CHECK_SIZE);\n" + " int bitIdx = idx%32;\n" + " int bufIdx = idx/32;\n" + " return buff[bufIdx] & (1<<bitIdx);\n" + "}\n" + "void writeBuf(__local u32* buff, int idx)\n" + "{\n" + " idx = idx % 
(32*CHECK_SIZE);\n" + " int bitIdx = idx%32;\n" + " int bufIdx = idx/32;\n" + " buff[bufIdx] |= (1<<bitIdx);\n" + " //atom_or( &buff[bufIdx], (1<<bitIdx) );\n" + "}\n" + "u32 tryWrite(__local u32* buff, int idx)\n" + "{\n" + " idx = idx % (32*CHECK_SIZE);\n" + " int bitIdx = idx%32;\n" + " int bufIdx = idx/32;\n" + " u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n" + " return ((ans >> bitIdx)&1) == 0;\n" + "}\n" + "// batching on the GPU\n" + "__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n" + "{\n" + " int wgIdx = GET_GROUP_IDX;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " const int numConstraints = gN[wgIdx];\n" + " const int m_start = gStart[wgIdx];\n" + " b3Contact4Data_t tmp;\n" + " \n" + " __local u32 ldsFixedBuffer[CHECK_SIZE];\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " if( lIdx == 0 )\n" + " {\n" + " \n" + " \n" + " __global struct b3Contact4Data* cs = &gConstraints[m_start]; \n" + " \n" + " \n" + " int numValidConstraints = 0;\n" + " int batchIdx = 0;\n" + " while( numValidConstraints < numConstraints)\n" + " {\n" + " int nCurrentBatch = 0;\n" + " // clear flag\n" + " \n" + " for(int i=0; i<CHECK_SIZE; i++) \n" + " ldsFixedBuffer[i] = 0; \n" + " for(int i=numValidConstraints; i<numConstraints; i++)\n" + " {\n" + " int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n" + " int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n" + " int bodyA = abs(bodyAS);\n" + " int bodyB = abs(bodyBS);\n" + " bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n" + " bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n" + " int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n" + " int bUnavailable = bIsStatic ? 
0 : readBuf( ldsFixedBuffer, bodyB);\n" + " \n" + " if( aUnavailable==0 && bUnavailable==0 ) // ok\n" + " {\n" + " if (!aIsStatic)\n" + " {\n" + " writeBuf( ldsFixedBuffer, bodyA );\n" + " }\n" + " if (!bIsStatic)\n" + " {\n" + " writeBuf( ldsFixedBuffer, bodyB );\n" + " }\n" + " cs[i].m_batchIdx = batchIdx;\n" + " if (i!=numValidConstraints)\n" + " {\n" + " tmp = cs[i];\n" + " cs[i] = cs[numValidConstraints];\n" + " cs[numValidConstraints] = tmp;\n" + " }\n" + " numValidConstraints++;\n" + " \n" + " nCurrentBatch++;\n" + " if( nCurrentBatch == SIMD_WIDTH)\n" + " {\n" + " nCurrentBatch = 0;\n" + " for(int i=0; i<CHECK_SIZE; i++) \n" + " ldsFixedBuffer[i] = 0;\n" + " \n" + " }\n" + " }\n" + " }//for\n" + " batchIdx ++;\n" + " }//while\n" + " \n" + " batchSizes[wgIdx] = batchIdx;\n" + " }//if( lIdx == 0 )\n" + " \n" + " //return batchIdx;\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h index a5a432947c..6e9c53e161 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h @@ -1,433 +1,432 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* integrateKernelCL= \ -"/*\n" -"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Erwin Coumans\n" -"#ifndef B3_RIGIDBODY_DATA_H\n" -"#define B3_RIGIDBODY_DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* 
vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#define B3_QUAT_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Quat;\n" -" #define b3QuatConstArg const b3Quat\n" -" \n" -" \n" -"inline float4 b3FastNormalize4(float4 v)\n" -"{\n" -" v = (float4)(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -" \n" -"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" -"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" -"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" -"{\n" -" b3Quat ans;\n" -" ans = b3Cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" -"{\n" -" b3Quat q;\n" -" q=in;\n" -" //return b3FastNormalize4(in);\n" -" float len = native_sqrt(dot(q, q));\n" -" if(len > 0.f)\n" -" {\n" -" q *= 1.f / len;\n" -" }\n" -" else\n" -" {\n" -" q.x = q.y = q.z = 0.f;\n" -" q.w = 1.f;\n" -" }\n" -" return q;\n" -"}\n" -"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" b3Quat qInv = b3QuatInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" 
float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" return b3QuatRotate( b3QuatInvert( q ), vec );\n" -"}\n" -"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" -"{\n" -" return b3QuatRotate( orientation, point ) + (translation);\n" -"}\n" -" \n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifndef B3_MAT3x3_H\n" -"#define B3_MAT3x3_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"typedef struct\n" -"{\n" -" b3Float4 m_row[3];\n" -"}b3Mat3x3;\n" -"#define b3Mat3x3ConstArg const b3Mat3x3\n" -"#define b3GetRow(m,row) (m.m_row[row])\n" -"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" -"{\n" -" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" -" b3Mat3x3 out;\n" -" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" -" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" -" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" -" out.m_row[0].w = 0.f;\n" -" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" -" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" -" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" -" out.m_row[1].w = 0.f;\n" -" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" -" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" -" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" -" out.m_row[2].w = 0.f;\n" -" return out;\n" -"}\n" -"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = fabs(matIn.m_row[0]);\n" -" out.m_row[1] = fabs(matIn.m_row[1]);\n" -" out.m_row[2] = fabs(matIn.m_row[2]);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 
mtZero();\n" -"__inline\n" -"b3Mat3x3 mtIdentity();\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Mat3x3 mtZero()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(0.f);\n" -" m.m_row[1] = (b3Float4)(0.f);\n" -" m.m_row[2] = (b3Float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtIdentity()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(1,0,0,0);\n" -" m.m_row[1] = (b3Float4)(0,1,0,0);\n" -" m.m_row[2] = (b3Float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" -"{\n" -" b3Mat3x3 transB;\n" -" transB = mtTranspose( b );\n" -" b3Mat3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" -"{\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a.m_row[0], b );\n" -" ans.y = b3Dot3F4( a.m_row[1], b );\n" -" ans.z = b3Dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" -"{\n" -" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, 
b.m_row[2].x, 0);\n" -" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a, colx );\n" -" ans.y = b3Dot3F4( a, coly );\n" -" ans.z = b3Dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" -"struct b3RigidBodyData\n" -"{\n" -" b3Float4 m_pos;\n" -" b3Quat m_quat;\n" -" b3Float4 m_linVel;\n" -" b3Float4 m_angVel;\n" -" int m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"};\n" -"typedef struct b3InertiaData b3InertiaData_t;\n" -"struct b3InertiaData\n" -"{\n" -" b3Mat3x3 m_invInertiaWorld;\n" -" b3Mat3x3 m_initInvInertia;\n" -"};\n" -"#endif //B3_RIGIDBODY_DATA_H\n" -" \n" -"#ifndef B3_RIGIDBODY_DATA_H\n" -"#endif //B3_RIGIDBODY_DATA_H\n" -" \n" -"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" -"{\n" -" \n" -" if (bodies[nodeID].m_invMass != 0.f)\n" -" {\n" -" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" -" //angular velocity\n" -" {\n" -" b3Float4 axis;\n" -" //add some hardcoded angular damping\n" -" bodies[nodeID].m_angVel.x *= angularDamping;\n" -" bodies[nodeID].m_angVel.y *= angularDamping;\n" -" bodies[nodeID].m_angVel.z *= angularDamping;\n" -" \n" -" b3Float4 angvel = bodies[nodeID].m_angVel;\n" -" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" -" \n" -" //limit the angular motion\n" -" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" -" {\n" -" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" -" }\n" -" if(fAngle < 0.001f)\n" -" {\n" -" // use Taylor's expansions of sync function\n" -" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" -" }\n" -" else\n" -" {\n" -" 
// sync(fAngle) = sin(c*fAngle)/t\n" -" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" -" }\n" -" \n" -" b3Quat dorn;\n" -" dorn.x = axis.x;\n" -" dorn.y = axis.y;\n" -" dorn.z = axis.z;\n" -" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" -" b3Quat orn0 = bodies[nodeID].m_quat;\n" -" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" -" predictedOrn = b3QuatNormalized(predictedOrn);\n" -" bodies[nodeID].m_quat=predictedOrn;\n" -" }\n" -" //linear velocity \n" -" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n" -" \n" -" //apply gravity\n" -" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n" -" \n" -" }\n" -" \n" -"}\n" -"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" -"{\n" -" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" -" \n" -" if( (body->m_invMass != 0.f))\n" -" {\n" -" //angular velocity\n" -" {\n" -" b3Float4 axis;\n" -" //add some hardcoded angular damping\n" -" body->m_angVel.x *= angularDamping;\n" -" body->m_angVel.y *= angularDamping;\n" -" body->m_angVel.z *= angularDamping;\n" -" \n" -" b3Float4 angvel = body->m_angVel;\n" -" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" -" //limit the angular motion\n" -" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" -" {\n" -" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" -" }\n" -" if(fAngle < 0.001f)\n" -" {\n" -" // use Taylor's expansions of sync function\n" -" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" -" }\n" -" else\n" -" {\n" -" // sync(fAngle) = sin(c*fAngle)/t\n" -" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" -" }\n" -" b3Quat dorn;\n" -" dorn.x = axis.x;\n" -" dorn.y = axis.y;\n" -" dorn.z = axis.z;\n" -" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" -" b3Quat orn0 = body->m_quat;\n" -" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" -" predictedOrn 
= b3QuatNormalized(predictedOrn);\n" -" body->m_quat=predictedOrn;\n" -" }\n" -" //apply gravity\n" -" body->m_linVel += gravityAcceleration * timeStep;\n" -" //linear velocity \n" -" body->m_pos += body->m_linVel * timeStep;\n" -" \n" -" }\n" -" \n" -"}\n" -"__kernel void \n" -" integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n" -"{\n" -" int nodeID = get_global_id(0);\n" -" \n" -" if( nodeID < numNodes)\n" -" {\n" -" integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n" -" }\n" -"}\n" -; +static const char* integrateKernelCL = + "/*\n" + "Copyright (c) 2013 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. 
This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Erwin Coumans\n" + "#ifndef B3_RIGIDBODY_DATA_H\n" + "#define B3_RIGIDBODY_DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = 
b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#define B3_QUAT_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Quat;\n" + " #define b3QuatConstArg const b3Quat\n" + " \n" + " \n" + "inline float4 b3FastNormalize4(float4 v)\n" + "{\n" + " v = (float4)(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + " \n" + "inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" + "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" + "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" + "{\n" + " b3Quat ans;\n" + " ans = b3Cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - b3Dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" + "{\n" + " b3Quat q;\n" + " q=in;\n" + " //return b3FastNormalize4(in);\n" + " float len = native_sqrt(dot(q, q));\n" + " if(len > 0.f)\n" + " {\n" + " q *= 1.f / len;\n" + " }\n" + " else\n" + " {\n" + " q.x = q.y = q.z = 0.f;\n" + " q.w = 1.f;\n" + " }\n" + " return q;\n" + "}\n" + "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " b3Quat qInv = b3QuatInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "inline b3Quat 
b3QuatInverse(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " return b3QuatRotate( b3QuatInvert( q ), vec );\n" + "}\n" + "inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" + "{\n" + " return b3QuatRotate( orientation, point ) + (translation);\n" + "}\n" + " \n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifndef B3_MAT3x3_H\n" + "#define B3_MAT3x3_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "typedef struct\n" + "{\n" + " b3Float4 m_row[3];\n" + "}b3Mat3x3;\n" + "#define b3Mat3x3ConstArg const b3Mat3x3\n" + "#define b3GetRow(m,row) (m.m_row[row])\n" + "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" + "{\n" + " b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" + " b3Mat3x3 out;\n" + " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" + " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" + " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" + " out.m_row[0].w = 0.f;\n" + " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" + " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" + " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" + " out.m_row[1].w = 0.f;\n" + " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" + " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" + " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" + " out.m_row[2].w = 0.f;\n" + " return out;\n" + "}\n" + "inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = fabs(matIn.m_row[0]);\n" + " out.m_row[1] = fabs(matIn.m_row[1]);\n" + " out.m_row[2] = fabs(matIn.m_row[2]);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtZero();\n" + "__inline\n" + 
"b3Mat3x3 mtIdentity();\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Mat3x3 mtZero()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(0.f);\n" + " m.m_row[1] = (b3Float4)(0.f);\n" + " m.m_row[2] = (b3Float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtIdentity()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(1,0,0,0);\n" + " m.m_row[1] = (b3Float4)(0,1,0,0);\n" + " m.m_row[2] = (b3Float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" + "{\n" + " b3Mat3x3 transB;\n" + " transB = mtTranspose( b );\n" + " b3Mat3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" + "{\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a.m_row[0], b );\n" + " ans.y = b3Dot3F4( a.m_row[1], b );\n" + " ans.z = b3Dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" + "{\n" + " b3Float4 colx = 
b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a, colx );\n" + " ans.y = b3Dot3F4( a, coly );\n" + " ans.z = b3Dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3RigidBodyData b3RigidBodyData_t;\n" + "struct b3RigidBodyData\n" + "{\n" + " b3Float4 m_pos;\n" + " b3Quat m_quat;\n" + " b3Float4 m_linVel;\n" + " b3Float4 m_angVel;\n" + " int m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "};\n" + "typedef struct b3InertiaData b3InertiaData_t;\n" + "struct b3InertiaData\n" + "{\n" + " b3Mat3x3 m_invInertiaWorld;\n" + " b3Mat3x3 m_initInvInertia;\n" + "};\n" + "#endif //B3_RIGIDBODY_DATA_H\n" + " \n" + "#ifndef B3_RIGIDBODY_DATA_H\n" + "#endif //B3_RIGIDBODY_DATA_H\n" + " \n" + "inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" + "{\n" + " \n" + " if (bodies[nodeID].m_invMass != 0.f)\n" + " {\n" + " float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" + " //angular velocity\n" + " {\n" + " b3Float4 axis;\n" + " //add some hardcoded angular damping\n" + " bodies[nodeID].m_angVel.x *= angularDamping;\n" + " bodies[nodeID].m_angVel.y *= angularDamping;\n" + " bodies[nodeID].m_angVel.z *= angularDamping;\n" + " \n" + " b3Float4 angvel = bodies[nodeID].m_angVel;\n" + " float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" + " \n" + " //limit the angular motion\n" + " if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" + " {\n" + " fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" + " }\n" + " if(fAngle < 0.001f)\n" + " {\n" + " // use Taylor's expansions of sync function\n" + " axis = angvel * 
(0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" + " }\n" + " else\n" + " {\n" + " // sync(fAngle) = sin(c*fAngle)/t\n" + " axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" + " }\n" + " \n" + " b3Quat dorn;\n" + " dorn.x = axis.x;\n" + " dorn.y = axis.y;\n" + " dorn.z = axis.z;\n" + " dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" + " b3Quat orn0 = bodies[nodeID].m_quat;\n" + " b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" + " predictedOrn = b3QuatNormalized(predictedOrn);\n" + " bodies[nodeID].m_quat=predictedOrn;\n" + " }\n" + " //linear velocity \n" + " bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n" + " \n" + " //apply gravity\n" + " bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n" + " \n" + " }\n" + " \n" + "}\n" + "inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" + "{\n" + " float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" + " \n" + " if( (body->m_invMass != 0.f))\n" + " {\n" + " //angular velocity\n" + " {\n" + " b3Float4 axis;\n" + " //add some hardcoded angular damping\n" + " body->m_angVel.x *= angularDamping;\n" + " body->m_angVel.y *= angularDamping;\n" + " body->m_angVel.z *= angularDamping;\n" + " \n" + " b3Float4 angvel = body->m_angVel;\n" + " float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" + " //limit the angular motion\n" + " if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" + " {\n" + " fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" + " }\n" + " if(fAngle < 0.001f)\n" + " {\n" + " // use Taylor's expansions of sync function\n" + " axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" + " }\n" + " else\n" + " {\n" + " // sync(fAngle) = sin(c*fAngle)/t\n" + " axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" + " }\n" + " b3Quat dorn;\n" + " dorn.x = axis.x;\n" + " dorn.y = 
axis.y;\n" + " dorn.z = axis.z;\n" + " dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" + " b3Quat orn0 = body->m_quat;\n" + " b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" + " predictedOrn = b3QuatNormalized(predictedOrn);\n" + " body->m_quat=predictedOrn;\n" + " }\n" + " //apply gravity\n" + " body->m_linVel += gravityAcceleration * timeStep;\n" + " //linear velocity \n" + " body->m_pos += body->m_linVel * timeStep;\n" + " \n" + " }\n" + " \n" + "}\n" + "__kernel void \n" + " integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n" + "{\n" + " int nodeID = get_global_id(0);\n" + " \n" + " if( nodeID < numNodes)\n" + " {\n" + " integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.h index d48ecf6ea6..c94b55851e 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/jointSolver.h @@ -1,721 +1,720 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* solveConstraintRowsCL= \ -"/*\n" -"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Erwin Coumans\n" -"#define B3_CONSTRAINT_FLAG_ENABLED 1\n" -"#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3\n" -"#define B3_GPU_FIXED_CONSTRAINT_TYPE 4\n" -"#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails\n" -"#define B3_INFINITY 1e30f\n" -"#define mymake_float4 (float4)\n" -"__inline float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = mymake_float4(a.xyz,0.f);\n" -" float4 b1 = mymake_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"typedef float4 Quaternion;\n" -"typedef struct\n" -"{\n" -" float4 m_row[3];\n" -"}Matrix3x3;\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b);\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b);\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b)\n" -"{\n" -" float4 ans;\n" -" ans.x = dot3F4( a.m_row[0], b );\n" -" ans.y = dot3F4( a.m_row[1], b );\n" -" ans.z = dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b)\n" -"{\n" -" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" float4 ans;\n" -" ans.x = dot3F4( a, colx );\n" -" ans.y = dot3F4( a, coly );\n" -" ans.z = dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"typedef struct\n" -"{\n" -" Matrix3x3 m_invInertiaWorld;\n" -" Matrix3x3 m_initInvInertia;\n" -"} BodyInertia;\n" -"typedef struct\n" -"{\n" -" Matrix3x3 m_basis;//orientation\n" -" float4 m_origin;//transform\n" -"}b3Transform;\n" 
-"typedef struct\n" -"{\n" -"// b3Transform m_worldTransformUnused;\n" -" float4 m_deltaLinearVelocity;\n" -" float4 m_deltaAngularVelocity;\n" -" float4 m_angularFactor;\n" -" float4 m_linearFactor;\n" -" float4 m_invMass;\n" -" float4 m_pushVelocity;\n" -" float4 m_turnVelocity;\n" -" float4 m_linearVelocity;\n" -" float4 m_angularVelocity;\n" -" union \n" -" {\n" -" void* m_originalBody;\n" -" int m_originalBodyIndex;\n" -" };\n" -" int padding[3];\n" -"} b3GpuSolverBody;\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" Quaternion m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" unsigned int m_shapeIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} b3RigidBodyCL;\n" -"typedef struct\n" -"{\n" -" float4 m_relpos1CrossNormal;\n" -" float4 m_contactNormal;\n" -" float4 m_relpos2CrossNormal;\n" -" //float4 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal\n" -" float4 m_angularComponentA;\n" -" float4 m_angularComponentB;\n" -" \n" -" float m_appliedPushImpulse;\n" -" float m_appliedImpulse;\n" -" int m_padding1;\n" -" int m_padding2;\n" -" float m_friction;\n" -" float m_jacDiagABInv;\n" -" float m_rhs;\n" -" float m_cfm;\n" -" \n" -" float m_lowerLimit;\n" -" float m_upperLimit;\n" -" float m_rhsPenetration;\n" -" int m_originalConstraint;\n" -" int m_overrideNumSolverIterations;\n" -" int m_frictionIndex;\n" -" int m_solverBodyIdA;\n" -" int m_solverBodyIdB;\n" -"} b3SolverConstraint;\n" -"typedef struct \n" -"{\n" -" int m_bodyAPtrAndSignBit;\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_originalConstraintIndex;\n" -" int m_batchId;\n" -"} b3BatchConstraint;\n" -"typedef struct \n" -"{\n" -" int m_constraintType;\n" -" int m_rbA;\n" -" int m_rbB;\n" -" float m_breakingImpulseThreshold;\n" -" float4 m_pivotInA;\n" -" float4 m_pivotInB;\n" -" Quaternion m_relTargetAB;\n" -" int m_flags;\n" -" int m_padding[3];\n" -"} b3GpuGenericConstraint;\n" -"/*b3Transform getWorldTransform(b3RigidBodyCL* 
rb)\n" -"{\n" -" b3Transform newTrans;\n" -" newTrans.setOrigin(rb->m_pos);\n" -" newTrans.setRotation(rb->m_quat);\n" -" return newTrans;\n" -"}*/\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" v = mymake_float4(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec);\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in)\n" -"{\n" -" return fastNormalize4(in);\n" -"// in /= length( in );\n" -"// return in;\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline void internalApplyImpulse(__global b3GpuSolverBody* body, float4 linearComponent, float4 angularComponent,float impulseMagnitude)\n" -"{\n" -" body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;\n" -" body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);\n" -"}\n" -"void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c)\n" -"{\n" -" float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm;\n" -" float deltaVel1Dotn = 
dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) + dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);\n" -" float deltaVel2Dotn = -dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity);\n" -" deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv;\n" -" deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv;\n" -" float sum = c->m_appliedImpulse + deltaImpulse;\n" -" if (sum < c->m_lowerLimit)\n" -" {\n" -" deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse;\n" -" c->m_appliedImpulse = c->m_lowerLimit;\n" -" }\n" -" else if (sum > c->m_upperLimit) \n" -" {\n" -" deltaImpulse = c->m_upperLimit-c->m_appliedImpulse;\n" -" c->m_appliedImpulse = c->m_upperLimit;\n" -" }\n" -" else\n" -" {\n" -" c->m_appliedImpulse = sum;\n" -" }\n" -" internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n" -" internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n" -"}\n" -"__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,\n" -" __global b3BatchConstraint* batchConstraints,\n" -" __global b3SolverConstraint* rows,\n" -" __global unsigned int* numConstraintRowsInfo1, \n" -" __global unsigned int* rowOffsets,\n" -" __global b3GpuGenericConstraint* constraints,\n" -" int batchOffset,\n" -" int numConstraintsInBatch\n" -" )\n" -"{\n" -" int b = get_global_id(0);\n" -" if (b>=numConstraintsInBatch)\n" -" return;\n" -" __global b3BatchConstraint* c = &batchConstraints[b+batchOffset];\n" -" int originalConstraintIndex = c->m_originalConstraintIndex;\n" -" if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED)\n" -" {\n" -" int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex];\n" -" int rowOffset = rowOffsets[originalConstraintIndex];\n" -" for (int jj=0;jj<numConstraintRows;jj++)\n" -" {\n" -" __global b3SolverConstraint* constraint = &rows[rowOffset+jj];\n" -" 
resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint);\n" -" }\n" -" }\n" -"};\n" -"__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numBodies)\n" -" return;\n" -" __global b3GpuSolverBody* solverBody = &solverBodies[i];\n" -" __global b3RigidBodyCL* bodyCL = &bodiesCL[i];\n" -" solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);\n" -" solverBody->m_deltaAngularVelocity = (float4)(0.f,0.f,0.f,0.f);\n" -" solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" -" solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" -" solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f);\n" -" solverBody->m_originalBodyIndex = i;\n" -" solverBody->m_angularFactor = (float4)(1,1,1,0);\n" -" solverBody->m_linearFactor = (float4) (1,1,1,0);\n" -" solverBody->m_linearVelocity = bodyCL->m_linVel;\n" -" solverBody->m_angularVelocity = bodyCL->m_angVel;\n" -"}\n" -"__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints)\n" -"{\n" -" int cid = get_global_id(0);\n" -" if (cid>=numConstraints)\n" -" return;\n" -" int numRows = numConstraintRows[cid];\n" -" if (numRows)\n" -" {\n" -" for (int i=0;i<numRows;i++)\n" -" {\n" -" int rowIndex = rowOffsets[cid]+i;\n" -" float breakingThreshold = constraints[cid].m_breakingImpulseThreshold;\n" -" if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold)\n" -" {\n" -" constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED;\n" -" }\n" -" }\n" -" }\n" -"}\n" -"__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints)\n" -"{\n" -" int i = get_global_id(0);\n" 
-" if (i>=numConstraints)\n" -" return;\n" -" __global b3GpuGenericConstraint* constraint = &constraints[i];\n" -" switch (constraint->m_constraintType)\n" -" {\n" -" case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" -" {\n" -" infos[i] = 3;\n" -" break;\n" -" }\n" -" case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" -" {\n" -" infos[i] = 6;\n" -" break;\n" -" }\n" -" default:\n" -" {\n" -" }\n" -" }\n" -"}\n" -"__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, \n" -" __global b3BatchConstraint* batchConstraints, \n" -" __global b3GpuGenericConstraint* constraints,\n" -" __global b3RigidBodyCL* bodies,\n" -" int numConstraints)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numConstraints)\n" -" return;\n" -" int rbA = constraints[i].m_rbA;\n" -" int rbB = constraints[i].m_rbB;\n" -" batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA;\n" -" batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? rbB : -rbB;\n" -" batchConstraints[i].m_batchId = -1;\n" -" batchConstraints[i].m_originalConstraintIndex = i;\n" -"}\n" -"typedef struct\n" -"{\n" -" // integrator parameters: frames per second (1/stepsize), default error\n" -" // reduction parameter (0..1).\n" -" float fps,erp;\n" -" // for the first and second body, pointers to two (linear and angular)\n" -" // n*3 jacobian sub matrices, stored by rows. these matrices will have\n" -" // been initialized to 0 on entry. 
if the second body is zero then the\n" -" // J2xx pointers may be 0.\n" -" union \n" -" {\n" -" __global float4* m_J1linearAxisFloat4;\n" -" __global float* m_J1linearAxis;\n" -" };\n" -" union\n" -" {\n" -" __global float4* m_J1angularAxisFloat4;\n" -" __global float* m_J1angularAxis;\n" -" };\n" -" union\n" -" {\n" -" __global float4* m_J2linearAxisFloat4;\n" -" __global float* m_J2linearAxis;\n" -" };\n" -" union\n" -" {\n" -" __global float4* m_J2angularAxisFloat4;\n" -" __global float* m_J2angularAxis;\n" -" };\n" -" // elements to jump from one row to the next in J's\n" -" int rowskip;\n" -" // right hand sides of the equation J*v = c + cfm * lambda. cfm is the\n" -" // \"constraint force mixing\" vector. c is set to zero on entry, cfm is\n" -" // set to a constant value (typically very small or zero) value on entry.\n" -" __global float* m_constraintError;\n" -" __global float* cfm;\n" -" // lo and hi limits for variables (set to -/+ infinity on entry).\n" -" __global float* m_lowerLimit;\n" -" __global float* m_upperLimit;\n" -" // findex vector for variables. see the LCP solver interface for a\n" -" // description of what this does. this is set to -1 on entry.\n" -" // note that the returned indexes are relative to the first index of\n" -" // the constraint.\n" -" __global int *findex;\n" -" // number of solver iterations\n" -" int m_numIterations;\n" -" //damping of the velocity\n" -" float m_damping;\n" -"} b3GpuConstraintInfo2;\n" -"void getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2)\n" -"{\n" -" *v0 = (float4)(0. ,-vecIn.z ,vecIn.y,0.f);\n" -" *v1 = (float4)(vecIn.z ,0. 
,-vecIn.x,0.f);\n" -" *v2 = (float4)(-vecIn.y ,vecIn.x ,0.f,0.f);\n" -"}\n" -"void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies)\n" -"{\n" -" float4 posA = bodies[constraint->m_rbA].m_pos;\n" -" Quaternion rotA = bodies[constraint->m_rbA].m_quat;\n" -" float4 posB = bodies[constraint->m_rbB].m_pos;\n" -" Quaternion rotB = bodies[constraint->m_rbB].m_quat;\n" -" // anchor points in global coordinates with respect to body PORs.\n" -" \n" -" // set jacobian\n" -" info->m_J1linearAxis[0] = 1;\n" -" info->m_J1linearAxis[info->rowskip+1] = 1;\n" -" info->m_J1linearAxis[2*info->rowskip+2] = 1;\n" -" float4 a1 = qtRotate(rotA,constraint->m_pivotInA);\n" -" {\n" -" __global float4* angular0 = (__global float4*)(info->m_J1angularAxis);\n" -" __global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);\n" -" __global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip);\n" -" float4 a1neg = -a1;\n" -" getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2);\n" -" }\n" -" if (info->m_J2linearAxis)\n" -" {\n" -" info->m_J2linearAxis[0] = -1;\n" -" info->m_J2linearAxis[info->rowskip+1] = -1;\n" -" info->m_J2linearAxis[2*info->rowskip+2] = -1;\n" -" }\n" -" \n" -" float4 a2 = qtRotate(rotB,constraint->m_pivotInB);\n" -" \n" -" {\n" -" // float4 a2n = -a2;\n" -" __global float4* angular0 = (__global float4*)(info->m_J2angularAxis);\n" -" __global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip);\n" -" __global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip);\n" -" getSkewSymmetricMatrix(a2,angular0,angular1,angular2);\n" -" }\n" -" \n" -" // set right hand side\n" -"// float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? 
m_erp : info->erp;\n" -" float currERP = info->erp;\n" -" float k = info->fps * currERP;\n" -" int j;\n" -" float4 result = a2 + posB - a1 - posA;\n" -" float* resultPtr = &result;\n" -" for (j=0; j<3; j++)\n" -" {\n" -" info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]);\n" -" }\n" -"}\n" -"Quaternion nearest( Quaternion first, Quaternion qd)\n" -"{\n" -" Quaternion diff,sum;\n" -" diff = first- qd;\n" -" sum = first + qd;\n" -" \n" -" if( dot(diff,diff) < dot(sum,sum) )\n" -" return qd;\n" -" return (-qd);\n" -"}\n" -"float b3Acos(float x) \n" -"{ \n" -" if (x<-1) \n" -" x=-1; \n" -" if (x>1) \n" -" x=1;\n" -" return acos(x); \n" -"}\n" -"float getAngle(Quaternion orn)\n" -"{\n" -" if (orn.w>=1.f)\n" -" orn.w=1.f;\n" -" float s = 2.f * b3Acos(orn.w);\n" -" return s;\n" -"}\n" -"void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle)\n" -"{\n" -" Quaternion orn1 = nearest(orn0,orn1a);\n" -" \n" -" Quaternion dorn = qtMul(orn1,qtInvert(orn0));\n" -" *angle = getAngle(dorn);\n" -" *axis = (float4)(dorn.x,dorn.y,dorn.z,0.f);\n" -" \n" -" //check for axis length\n" -" float len = dot3F4(*axis,*axis);\n" -" if (len < FLT_EPSILON*FLT_EPSILON)\n" -" *axis = (float4)(1,0,0,0);\n" -" else\n" -" *axis /= sqrt(len);\n" -"}\n" -"void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row)\n" -"{\n" -" Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;\n" -" Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat;\n" -" int s = info->rowskip;\n" -" int start_index = start_row * s;\n" -" // 3 rows to make body rotations equal\n" -" info->m_J1angularAxis[start_index] = 1;\n" -" info->m_J1angularAxis[start_index + s + 1] = 1;\n" -" info->m_J1angularAxis[start_index + s*2+2] = 1;\n" -" if ( info->m_J2angularAxis)\n" -" {\n" -" info->m_J2angularAxis[start_index] = -1;\n" -" info->m_J2angularAxis[start_index + s+1] = -1;\n" -" 
info->m_J2angularAxis[start_index + s*2+2] = -1;\n" -" }\n" -" \n" -" float currERP = info->erp;\n" -" float k = info->fps * currERP;\n" -" float4 diff;\n" -" float angle;\n" -" float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB));\n" -" \n" -" calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle);\n" -" diff*=-angle;\n" -" \n" -" float* resultPtr = &diff;\n" -" \n" -" for (int j=0; j<3; j++)\n" -" {\n" -" info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j];\n" -" }\n" -" \n" -"}\n" -"__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numBodies)\n" -" return;\n" -" if (bodies[i].m_invMass)\n" -" {\n" -"// if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)\n" -" {\n" -" bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity;\n" -" }\n" -"// if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP)\n" -" {\n" -" bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity;\n" -" } \n" -" }\n" -"}\n" -"__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, \n" -" __global unsigned int* infos, \n" -" __global unsigned int* constraintRowOffsets, \n" -" __global b3GpuGenericConstraint* constraints, \n" -" __global b3BatchConstraint* batchConstraints, \n" -" __global b3RigidBodyCL* bodies,\n" -" __global BodyInertia* inertias,\n" -" __global b3GpuSolverBody* solverBodies,\n" -" float timeStep,\n" -" float globalErp,\n" -" float globalCfm,\n" -" float globalDamping,\n" -" int globalNumIterations,\n" -" int numConstraints)\n" -"{\n" -" int i = get_global_id(0);\n" -" if (i>=numConstraints)\n" -" return;\n" -" \n" -" //for now, always initialize the batch info\n" -" int info1 = infos[i];\n" -" \n" -" __global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];\n" -" __global b3GpuGenericConstraint* constraint = 
&constraints[i];\n" -" __global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];\n" -" __global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];\n" -" int solverBodyIdA = constraint->m_rbA;\n" -" int solverBodyIdB = constraint->m_rbB;\n" -" __global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];\n" -" __global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];\n" -" if (rbA->m_invMass)\n" -" {\n" -" batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;\n" -" } else\n" -" {\n" -"// if (!solverBodyIdA)\n" -"// m_staticIdx = 0;\n" -" batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;\n" -" }\n" -" if (rbB->m_invMass)\n" -" {\n" -" batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;\n" -" } else\n" -" {\n" -"// if (!solverBodyIdB)\n" -"// m_staticIdx = 0;\n" -" batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;\n" -" }\n" -" if (info1)\n" -" {\n" -" int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;\n" -"// if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)\n" -" // m_maxOverrideNumSolverIterations = overrideNumSolverIterations;\n" -" int j;\n" -" for ( j=0;j<info1;j++)\n" -" {\n" -"// memset(¤tConstraintRow[j],0,sizeof(b3SolverConstraint));\n" -" currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0);\n" -" currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0);\n" -" currentConstraintRow[j].m_appliedImpulse = 0.f;\n" -" currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" -" currentConstraintRow[j].m_cfm = 0.f;\n" -" currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0);\n" -" currentConstraintRow[j].m_friction = 0.f;\n" -" currentConstraintRow[j].m_frictionIndex = 0;\n" -" currentConstraintRow[j].m_jacDiagABInv = 0.f;\n" -" currentConstraintRow[j].m_lowerLimit = 0.f;\n" -" currentConstraintRow[j].m_upperLimit = 0.f;\n" -" currentConstraintRow[j].m_originalConstraint = 
i;\n" -" currentConstraintRow[j].m_overrideNumSolverIterations = 0;\n" -" currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);\n" -" currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0);\n" -" currentConstraintRow[j].m_rhs = 0.f;\n" -" currentConstraintRow[j].m_rhsPenetration = 0.f;\n" -" currentConstraintRow[j].m_solverBodyIdA = 0;\n" -" currentConstraintRow[j].m_solverBodyIdB = 0;\n" -" \n" -" currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;\n" -" currentConstraintRow[j].m_upperLimit = B3_INFINITY;\n" -" currentConstraintRow[j].m_appliedImpulse = 0.f;\n" -" currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" -" currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;\n" -" currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;\n" -" currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; \n" -" }\n" -" bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" -" bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" -" bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);\n" -" bodyAPtr->m_turnVelocity = (float4)(0,0,0,0);\n" -" bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" -" bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" -" bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);\n" -" bodyBPtr->m_turnVelocity = (float4)(0,0,0,0);\n" -" int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" -" \n" -" b3GpuConstraintInfo2 info2;\n" -" info2.fps = 1.f/timeStep;\n" -" info2.erp = globalErp;\n" -" info2.m_J1linearAxisFloat4 = ¤tConstraintRow->m_contactNormal;\n" -" info2.m_J1angularAxisFloat4 = ¤tConstraintRow->m_relpos1CrossNormal;\n" -" info2.m_J2linearAxisFloat4 = 0;\n" -" info2.m_J2angularAxisFloat4 = ¤tConstraintRow->m_relpos2CrossNormal;\n" -" info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" -" ///the size of b3SolverConstraint needs be a multiple of float\n" -"// b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));\n" -" 
info2.m_constraintError = ¤tConstraintRow->m_rhs;\n" -" currentConstraintRow->m_cfm = globalCfm;\n" -" info2.m_damping = globalDamping;\n" -" info2.cfm = ¤tConstraintRow->m_cfm;\n" -" info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit;\n" -" info2.m_upperLimit = ¤tConstraintRow->m_upperLimit;\n" -" info2.m_numIterations = globalNumIterations;\n" -" switch (constraint->m_constraintType)\n" -" {\n" -" case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" -" {\n" -" getInfo2Point2Point(constraint,&info2,bodies);\n" -" break;\n" -" }\n" -" case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" -" {\n" -" getInfo2Point2Point(constraint,&info2,bodies);\n" -" getInfo2FixedOrientation(constraint,&info2,bodies,3);\n" -" break;\n" -" }\n" -" default:\n" -" {\n" -" }\n" -" }\n" -" ///finalize the constraint setup\n" -" for ( j=0;j<info1;j++)\n" -" {\n" -" __global b3SolverConstraint* solverConstraint = ¤tConstraintRow[j];\n" -" if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)\n" -" {\n" -" solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;\n" -" }\n" -" if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)\n" -" {\n" -" solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;\n" -" }\n" -"// solverConstraint->m_originalContactPoint = constraint;\n" -" \n" -" Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;\n" -" {\n" -" //float4 angularFactorA(1,1,1);\n" -" float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;\n" -" solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;\n" -" }\n" -" \n" -" Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;\n" -" {\n" -" float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;\n" -" solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();\n" -" }\n" -" {\n" -" //it is ok to use solverConstraint->m_contactNormal 
instead of -solverConstraint->m_contactNormal\n" -" //because it gets multiplied iMJlB\n" -" float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass;\n" -" float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);\n" -" float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?\n" -" float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);\n" -" float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);\n" -" sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);\n" -" sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);\n" -" sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal);\n" -" float fsum = fabs(sum);\n" -" if (fsum>FLT_EPSILON)\n" -" {\n" -" solverConstraint->m_jacDiagABInv = 1.f/sum;\n" -" } else\n" -" {\n" -" solverConstraint->m_jacDiagABInv = 0.f;\n" -" }\n" -" }\n" -" ///fix rhs\n" -" ///todo: add force/torque accelerators\n" -" {\n" -" float rel_vel;\n" -" float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);\n" -" float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);\n" -" rel_vel = vel1Dotn+vel2Dotn;\n" -" float restitution = 0.f;\n" -" float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2\n" -" float velocityError = restitution - rel_vel * info2.m_damping;\n" -" float penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv;\n" -" float velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;\n" -" solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;\n" -" solverConstraint->m_appliedImpulse = 0.f;\n" -" }\n" -" }\n" -" }\n" -"}\n" -; +static const char* solveConstraintRowsCL = + "/*\n" + "Copyright (c) 2013 Advanced Micro Devices, Inc. 
\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Erwin Coumans\n" + "#define B3_CONSTRAINT_FLAG_ENABLED 1\n" + "#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3\n" + "#define B3_GPU_FIXED_CONSTRAINT_TYPE 4\n" + "#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails\n" + "#define B3_INFINITY 1e30f\n" + "#define mymake_float4 (float4)\n" + "__inline float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = mymake_float4(a.xyz,0.f);\n" + " float4 b1 = mymake_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "typedef float4 Quaternion;\n" + "typedef struct\n" + "{\n" + " float4 m_row[3];\n" + "}Matrix3x3;\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b);\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b);\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b)\n" + "{\n" + " float4 ans;\n" + " ans.x = dot3F4( a.m_row[0], b );\n" + " ans.y = dot3F4( a.m_row[1], b );\n" + " ans.z = dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b)\n" + "{\n" + " float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, 
b.m_row[2].x, 0);\n" + " float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " float4 ans;\n" + " ans.x = dot3F4( a, colx );\n" + " ans.y = dot3F4( a, coly );\n" + " ans.z = dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "typedef struct\n" + "{\n" + " Matrix3x3 m_invInertiaWorld;\n" + " Matrix3x3 m_initInvInertia;\n" + "} BodyInertia;\n" + "typedef struct\n" + "{\n" + " Matrix3x3 m_basis;//orientation\n" + " float4 m_origin;//transform\n" + "}b3Transform;\n" + "typedef struct\n" + "{\n" + "// b3Transform m_worldTransformUnused;\n" + " float4 m_deltaLinearVelocity;\n" + " float4 m_deltaAngularVelocity;\n" + " float4 m_angularFactor;\n" + " float4 m_linearFactor;\n" + " float4 m_invMass;\n" + " float4 m_pushVelocity;\n" + " float4 m_turnVelocity;\n" + " float4 m_linearVelocity;\n" + " float4 m_angularVelocity;\n" + " union \n" + " {\n" + " void* m_originalBody;\n" + " int m_originalBodyIndex;\n" + " };\n" + " int padding[3];\n" + "} b3GpuSolverBody;\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " Quaternion m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " unsigned int m_shapeIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} b3RigidBodyCL;\n" + "typedef struct\n" + "{\n" + " float4 m_relpos1CrossNormal;\n" + " float4 m_contactNormal;\n" + " float4 m_relpos2CrossNormal;\n" + " //float4 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal\n" + " float4 m_angularComponentA;\n" + " float4 m_angularComponentB;\n" + " \n" + " float m_appliedPushImpulse;\n" + " float m_appliedImpulse;\n" + " int m_padding1;\n" + " int m_padding2;\n" + " float m_friction;\n" + " float m_jacDiagABInv;\n" + " float m_rhs;\n" + " float m_cfm;\n" + " \n" + " float m_lowerLimit;\n" + " float m_upperLimit;\n" + " float m_rhsPenetration;\n" + " int m_originalConstraint;\n" + " int 
m_overrideNumSolverIterations;\n" + " int m_frictionIndex;\n" + " int m_solverBodyIdA;\n" + " int m_solverBodyIdB;\n" + "} b3SolverConstraint;\n" + "typedef struct \n" + "{\n" + " int m_bodyAPtrAndSignBit;\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_originalConstraintIndex;\n" + " int m_batchId;\n" + "} b3BatchConstraint;\n" + "typedef struct \n" + "{\n" + " int m_constraintType;\n" + " int m_rbA;\n" + " int m_rbB;\n" + " float m_breakingImpulseThreshold;\n" + " float4 m_pivotInA;\n" + " float4 m_pivotInB;\n" + " Quaternion m_relTargetAB;\n" + " int m_flags;\n" + " int m_padding[3];\n" + "} b3GpuGenericConstraint;\n" + "/*b3Transform getWorldTransform(b3RigidBodyCL* rb)\n" + "{\n" + " b3Transform newTrans;\n" + " newTrans.setOrigin(rb->m_pos);\n" + " newTrans.setRotation(rb->m_quat);\n" + " return newTrans;\n" + "}*/\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " v = mymake_float4(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b);\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec);\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fastNormalize4(in);\n" + "// in /= length( in );\n" + "// return in;\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + 
"Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline void internalApplyImpulse(__global b3GpuSolverBody* body, float4 linearComponent, float4 angularComponent,float impulseMagnitude)\n" + "{\n" + " body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;\n" + " body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);\n" + "}\n" + "void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c)\n" + "{\n" + " float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm;\n" + " float deltaVel1Dotn = dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) + dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);\n" + " float deltaVel2Dotn = -dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity);\n" + " deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv;\n" + " deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv;\n" + " float sum = c->m_appliedImpulse + deltaImpulse;\n" + " if (sum < c->m_lowerLimit)\n" + " {\n" + " deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse;\n" + " c->m_appliedImpulse = c->m_lowerLimit;\n" + " }\n" + " else if (sum > c->m_upperLimit) \n" + " {\n" + " deltaImpulse = c->m_upperLimit-c->m_appliedImpulse;\n" + " c->m_appliedImpulse = c->m_upperLimit;\n" + " }\n" + " else\n" + " {\n" + " c->m_appliedImpulse = sum;\n" + " }\n" + " internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n" + " internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n" + "}\n" + "__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,\n" + " __global b3BatchConstraint* batchConstraints,\n" + " __global b3SolverConstraint* rows,\n" + " __global unsigned int* numConstraintRowsInfo1, \n" + " __global unsigned 
int* rowOffsets,\n" + " __global b3GpuGenericConstraint* constraints,\n" + " int batchOffset,\n" + " int numConstraintsInBatch\n" + " )\n" + "{\n" + " int b = get_global_id(0);\n" + " if (b>=numConstraintsInBatch)\n" + " return;\n" + " __global b3BatchConstraint* c = &batchConstraints[b+batchOffset];\n" + " int originalConstraintIndex = c->m_originalConstraintIndex;\n" + " if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED)\n" + " {\n" + " int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex];\n" + " int rowOffset = rowOffsets[originalConstraintIndex];\n" + " for (int jj=0;jj<numConstraintRows;jj++)\n" + " {\n" + " __global b3SolverConstraint* constraint = &rows[rowOffset+jj];\n" + " resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint);\n" + " }\n" + " }\n" + "};\n" + "__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numBodies)\n" + " return;\n" + " __global b3GpuSolverBody* solverBody = &solverBodies[i];\n" + " __global b3RigidBodyCL* bodyCL = &bodiesCL[i];\n" + " solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);\n" + " solverBody->m_deltaAngularVelocity = (float4)(0.f,0.f,0.f,0.f);\n" + " solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" + " solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" + " solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f);\n" + " solverBody->m_originalBodyIndex = i;\n" + " solverBody->m_angularFactor = (float4)(1,1,1,0);\n" + " solverBody->m_linearFactor = (float4) (1,1,1,0);\n" + " solverBody->m_linearVelocity = bodyCL->m_linVel;\n" + " solverBody->m_angularVelocity = bodyCL->m_angVel;\n" + "}\n" + "__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* 
numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints)\n" + "{\n" + " int cid = get_global_id(0);\n" + " if (cid>=numConstraints)\n" + " return;\n" + " int numRows = numConstraintRows[cid];\n" + " if (numRows)\n" + " {\n" + " for (int i=0;i<numRows;i++)\n" + " {\n" + " int rowIndex = rowOffsets[cid]+i;\n" + " float breakingThreshold = constraints[cid].m_breakingImpulseThreshold;\n" + " if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold)\n" + " {\n" + " constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED;\n" + " }\n" + " }\n" + " }\n" + "}\n" + "__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numConstraints)\n" + " return;\n" + " __global b3GpuGenericConstraint* constraint = &constraints[i];\n" + " switch (constraint->m_constraintType)\n" + " {\n" + " case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" + " {\n" + " infos[i] = 3;\n" + " break;\n" + " }\n" + " case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" + " {\n" + " infos[i] = 6;\n" + " break;\n" + " }\n" + " default:\n" + " {\n" + " }\n" + " }\n" + "}\n" + "__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, \n" + " __global b3BatchConstraint* batchConstraints, \n" + " __global b3GpuGenericConstraint* constraints,\n" + " __global b3RigidBodyCL* bodies,\n" + " int numConstraints)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numConstraints)\n" + " return;\n" + " int rbA = constraints[i].m_rbA;\n" + " int rbB = constraints[i].m_rbB;\n" + " batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA;\n" + " batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? 
rbB : -rbB;\n" + " batchConstraints[i].m_batchId = -1;\n" + " batchConstraints[i].m_originalConstraintIndex = i;\n" + "}\n" + "typedef struct\n" + "{\n" + " // integrator parameters: frames per second (1/stepsize), default error\n" + " // reduction parameter (0..1).\n" + " float fps,erp;\n" + " // for the first and second body, pointers to two (linear and angular)\n" + " // n*3 jacobian sub matrices, stored by rows. these matrices will have\n" + " // been initialized to 0 on entry. if the second body is zero then the\n" + " // J2xx pointers may be 0.\n" + " union \n" + " {\n" + " __global float4* m_J1linearAxisFloat4;\n" + " __global float* m_J1linearAxis;\n" + " };\n" + " union\n" + " {\n" + " __global float4* m_J1angularAxisFloat4;\n" + " __global float* m_J1angularAxis;\n" + " };\n" + " union\n" + " {\n" + " __global float4* m_J2linearAxisFloat4;\n" + " __global float* m_J2linearAxis;\n" + " };\n" + " union\n" + " {\n" + " __global float4* m_J2angularAxisFloat4;\n" + " __global float* m_J2angularAxis;\n" + " };\n" + " // elements to jump from one row to the next in J's\n" + " int rowskip;\n" + " // right hand sides of the equation J*v = c + cfm * lambda. cfm is the\n" + " // \"constraint force mixing\" vector. c is set to zero on entry, cfm is\n" + " // set to a constant value (typically very small or zero) value on entry.\n" + " __global float* m_constraintError;\n" + " __global float* cfm;\n" + " // lo and hi limits for variables (set to -/+ infinity on entry).\n" + " __global float* m_lowerLimit;\n" + " __global float* m_upperLimit;\n" + " // findex vector for variables. see the LCP solver interface for a\n" + " // description of what this does. 
this is set to -1 on entry.\n" + " // note that the returned indexes are relative to the first index of\n" + " // the constraint.\n" + " __global int *findex;\n" + " // number of solver iterations\n" + " int m_numIterations;\n" + " //damping of the velocity\n" + " float m_damping;\n" + "} b3GpuConstraintInfo2;\n" + "void getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2)\n" + "{\n" + " *v0 = (float4)(0. ,-vecIn.z ,vecIn.y,0.f);\n" + " *v1 = (float4)(vecIn.z ,0. ,-vecIn.x,0.f);\n" + " *v2 = (float4)(-vecIn.y ,vecIn.x ,0.f,0.f);\n" + "}\n" + "void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies)\n" + "{\n" + " float4 posA = bodies[constraint->m_rbA].m_pos;\n" + " Quaternion rotA = bodies[constraint->m_rbA].m_quat;\n" + " float4 posB = bodies[constraint->m_rbB].m_pos;\n" + " Quaternion rotB = bodies[constraint->m_rbB].m_quat;\n" + " // anchor points in global coordinates with respect to body PORs.\n" + " \n" + " // set jacobian\n" + " info->m_J1linearAxis[0] = 1;\n" + " info->m_J1linearAxis[info->rowskip+1] = 1;\n" + " info->m_J1linearAxis[2*info->rowskip+2] = 1;\n" + " float4 a1 = qtRotate(rotA,constraint->m_pivotInA);\n" + " {\n" + " __global float4* angular0 = (__global float4*)(info->m_J1angularAxis);\n" + " __global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);\n" + " __global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip);\n" + " float4 a1neg = -a1;\n" + " getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2);\n" + " }\n" + " if (info->m_J2linearAxis)\n" + " {\n" + " info->m_J2linearAxis[0] = -1;\n" + " info->m_J2linearAxis[info->rowskip+1] = -1;\n" + " info->m_J2linearAxis[2*info->rowskip+2] = -1;\n" + " }\n" + " \n" + " float4 a2 = qtRotate(rotB,constraint->m_pivotInB);\n" + " \n" + " {\n" + " // float4 a2n = -a2;\n" + " __global float4* angular0 = (__global 
float4*)(info->m_J2angularAxis);\n" + " __global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip);\n" + " __global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip);\n" + " getSkewSymmetricMatrix(a2,angular0,angular1,angular2);\n" + " }\n" + " \n" + " // set right hand side\n" + "// float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;\n" + " float currERP = info->erp;\n" + " float k = info->fps * currERP;\n" + " int j;\n" + " float4 result = a2 + posB - a1 - posA;\n" + " float* resultPtr = &result;\n" + " for (j=0; j<3; j++)\n" + " {\n" + " info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]);\n" + " }\n" + "}\n" + "Quaternion nearest( Quaternion first, Quaternion qd)\n" + "{\n" + " Quaternion diff,sum;\n" + " diff = first- qd;\n" + " sum = first + qd;\n" + " \n" + " if( dot(diff,diff) < dot(sum,sum) )\n" + " return qd;\n" + " return (-qd);\n" + "}\n" + "float b3Acos(float x) \n" + "{ \n" + " if (x<-1) \n" + " x=-1; \n" + " if (x>1) \n" + " x=1;\n" + " return acos(x); \n" + "}\n" + "float getAngle(Quaternion orn)\n" + "{\n" + " if (orn.w>=1.f)\n" + " orn.w=1.f;\n" + " float s = 2.f * b3Acos(orn.w);\n" + " return s;\n" + "}\n" + "void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle)\n" + "{\n" + " Quaternion orn1 = nearest(orn0,orn1a);\n" + " \n" + " Quaternion dorn = qtMul(orn1,qtInvert(orn0));\n" + " *angle = getAngle(dorn);\n" + " *axis = (float4)(dorn.x,dorn.y,dorn.z,0.f);\n" + " \n" + " //check for axis length\n" + " float len = dot3F4(*axis,*axis);\n" + " if (len < FLT_EPSILON*FLT_EPSILON)\n" + " *axis = (float4)(1,0,0,0);\n" + " else\n" + " *axis /= sqrt(len);\n" + "}\n" + "void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row)\n" + "{\n" + " Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;\n" + " Quaternion worldOrnB = 
bodies[constraint->m_rbB].m_quat;\n" + " int s = info->rowskip;\n" + " int start_index = start_row * s;\n" + " // 3 rows to make body rotations equal\n" + " info->m_J1angularAxis[start_index] = 1;\n" + " info->m_J1angularAxis[start_index + s + 1] = 1;\n" + " info->m_J1angularAxis[start_index + s*2+2] = 1;\n" + " if ( info->m_J2angularAxis)\n" + " {\n" + " info->m_J2angularAxis[start_index] = -1;\n" + " info->m_J2angularAxis[start_index + s+1] = -1;\n" + " info->m_J2angularAxis[start_index + s*2+2] = -1;\n" + " }\n" + " \n" + " float currERP = info->erp;\n" + " float k = info->fps * currERP;\n" + " float4 diff;\n" + " float angle;\n" + " float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB));\n" + " \n" + " calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle);\n" + " diff*=-angle;\n" + " \n" + " float* resultPtr = &diff;\n" + " \n" + " for (int j=0; j<3; j++)\n" + " {\n" + " info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j];\n" + " }\n" + " \n" + "}\n" + "__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numBodies)\n" + " return;\n" + " if (bodies[i].m_invMass)\n" + " {\n" + "// if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)\n" + " {\n" + " bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity;\n" + " }\n" + "// if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP)\n" + " {\n" + " bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity;\n" + " } \n" + " }\n" + "}\n" + "__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, \n" + " __global unsigned int* infos, \n" + " __global unsigned int* constraintRowOffsets, \n" + " __global b3GpuGenericConstraint* constraints, \n" + " __global b3BatchConstraint* batchConstraints, \n" + " __global b3RigidBodyCL* bodies,\n" + " __global BodyInertia* inertias,\n" + " __global b3GpuSolverBody* 
solverBodies,\n" + " float timeStep,\n" + " float globalErp,\n" + " float globalCfm,\n" + " float globalDamping,\n" + " int globalNumIterations,\n" + " int numConstraints)\n" + "{\n" + " int i = get_global_id(0);\n" + " if (i>=numConstraints)\n" + " return;\n" + " \n" + " //for now, always initialize the batch info\n" + " int info1 = infos[i];\n" + " \n" + " __global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];\n" + " __global b3GpuGenericConstraint* constraint = &constraints[i];\n" + " __global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];\n" + " __global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];\n" + " int solverBodyIdA = constraint->m_rbA;\n" + " int solverBodyIdB = constraint->m_rbB;\n" + " __global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];\n" + " __global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];\n" + " if (rbA->m_invMass)\n" + " {\n" + " batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;\n" + " } else\n" + " {\n" + "// if (!solverBodyIdA)\n" + "// m_staticIdx = 0;\n" + " batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;\n" + " }\n" + " if (rbB->m_invMass)\n" + " {\n" + " batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;\n" + " } else\n" + " {\n" + "// if (!solverBodyIdB)\n" + "// m_staticIdx = 0;\n" + " batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;\n" + " }\n" + " if (info1)\n" + " {\n" + " int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? 
constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;\n" + "// if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)\n" + " // m_maxOverrideNumSolverIterations = overrideNumSolverIterations;\n" + " int j;\n" + " for ( j=0;j<info1;j++)\n" + " {\n" + "// memset(&currentConstraintRow[j],0,sizeof(b3SolverConstraint));\n" + " currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0);\n" + " currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0);\n" + " currentConstraintRow[j].m_appliedImpulse = 0.f;\n" + " currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" + " currentConstraintRow[j].m_cfm = 0.f;\n" + " currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0);\n" + " currentConstraintRow[j].m_friction = 0.f;\n" + " currentConstraintRow[j].m_frictionIndex = 0;\n" + " currentConstraintRow[j].m_jacDiagABInv = 0.f;\n" + " currentConstraintRow[j].m_lowerLimit = 0.f;\n" + " currentConstraintRow[j].m_upperLimit = 0.f;\n" + " currentConstraintRow[j].m_originalConstraint = i;\n" + " currentConstraintRow[j].m_overrideNumSolverIterations = 0;\n" + " currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);\n" + " currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0);\n" + " currentConstraintRow[j].m_rhs = 0.f;\n" + " currentConstraintRow[j].m_rhsPenetration = 0.f;\n" + " currentConstraintRow[j].m_solverBodyIdA = 0;\n" + " currentConstraintRow[j].m_solverBodyIdB = 0;\n" + " \n" + " currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;\n" + " currentConstraintRow[j].m_upperLimit = B3_INFINITY;\n" + " currentConstraintRow[j].m_appliedImpulse = 0.f;\n" + " currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" + " currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;\n" + " currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;\n" + " currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; \n" + " }\n" + " bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" + " 
bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" + " bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);\n" + " bodyAPtr->m_turnVelocity = (float4)(0,0,0,0);\n" + " bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" + " bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" + " bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);\n" + " bodyBPtr->m_turnVelocity = (float4)(0,0,0,0);\n" + " int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" + " \n" + " b3GpuConstraintInfo2 info2;\n" + " info2.fps = 1.f/timeStep;\n" + " info2.erp = globalErp;\n" + " info2.m_J1linearAxisFloat4 = &currentConstraintRow->m_contactNormal;\n" + " info2.m_J1angularAxisFloat4 = &currentConstraintRow->m_relpos1CrossNormal;\n" + " info2.m_J2linearAxisFloat4 = 0;\n" + " info2.m_J2angularAxisFloat4 = &currentConstraintRow->m_relpos2CrossNormal;\n" + " info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" + " ///the size of b3SolverConstraint needs be a multiple of float\n" + "// b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));\n" + " info2.m_constraintError = &currentConstraintRow->m_rhs;\n" + " currentConstraintRow->m_cfm = globalCfm;\n" + " info2.m_damping = globalDamping;\n" + " info2.cfm = &currentConstraintRow->m_cfm;\n" + " info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;\n" + " info2.m_upperLimit = &currentConstraintRow->m_upperLimit;\n" + " info2.m_numIterations = globalNumIterations;\n" + " switch (constraint->m_constraintType)\n" + " {\n" + " case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" + " {\n" + " getInfo2Point2Point(constraint,&info2,bodies);\n" + " break;\n" + " }\n" + " case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" + " {\n" + " getInfo2Point2Point(constraint,&info2,bodies);\n" + " getInfo2FixedOrientation(constraint,&info2,bodies,3);\n" + " break;\n" + " }\n" + " default:\n" + " {\n" + " }\n" + " }\n" + " ///finalize the constraint setup\n" + " for ( j=0;j<info1;j++)\n" + " {\n" + " __global b3SolverConstraint* solverConstraint = &currentConstraintRow[j];\n" + " if 
(solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)\n" + " {\n" + " solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;\n" + " }\n" + " if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)\n" + " {\n" + " solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;\n" + " }\n" + "// solverConstraint->m_originalContactPoint = constraint;\n" + " \n" + " Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;\n" + " {\n" + " //float4 angularFactorA(1,1,1);\n" + " float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;\n" + " solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;\n" + " }\n" + " \n" + " Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;\n" + " {\n" + " float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;\n" + " solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();\n" + " }\n" + " {\n" + " //it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal\n" + " //because it gets multiplied iMJlB\n" + " float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass;\n" + " float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);\n" + " float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?\n" + " float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);\n" + " float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);\n" + " sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);\n" + " sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);\n" + " sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal);\n" + " float fsum = fabs(sum);\n" + " if (fsum>FLT_EPSILON)\n" + " {\n" + " solverConstraint->m_jacDiagABInv = 1.f/sum;\n" + " } else\n" + " {\n" + " solverConstraint->m_jacDiagABInv = 0.f;\n" + " 
}\n" + " }\n" + " ///fix rhs\n" + " ///todo: add force/torque accelerators\n" + " {\n" + " float rel_vel;\n" + " float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);\n" + " float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);\n" + " rel_vel = vel1Dotn+vel2Dotn;\n" + " float restitution = 0.f;\n" + " float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2\n" + " float velocityError = restitution - rel_vel * info2.m_damping;\n" + " float penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv;\n" + " float velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;\n" + " solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;\n" + " solverConstraint->m_appliedImpulse = 0.f;\n" + " }\n" + " }\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.h index 15a049992b..6e14ad51fc 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveContact.h @@ -1,393 +1,392 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* solveContactCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. 
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile global int*\n" -"#endif\n" -"typedef unsigned int u32;\n" -"typedef unsigned short u16;\n" -"typedef unsigned char u8;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define mymake_float4 (float4)\n" -"//#define make_float2 (float2)\n" -"//#define make_uint4 (uint4)\n" -"//#define make_int4 (int4)\n" 
-"//#define make_uint2 (uint2)\n" -"//#define make_int2 (int2)\n" -"#define max2 max\n" -"#define min2 min\n" -"///////////////////////////////////////\n" -"// Vector\n" -"///////////////////////////////////////\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" return fast_normalize(v);\n" -"}\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = mymake_float4(a.xyz,0.f);\n" -" float4 b1 = mymake_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"// float length = sqrtf(dot3F4(a, a));\n" -"// return 1.f/length * a;\n" -"}\n" -"///////////////////////////////////////\n" -"// Matrix3x3\n" -"///////////////////////////////////////\n" -"typedef struct\n" -"{\n" -" float4 m_row[3];\n" -"}Matrix3x3;\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b);\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b);\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b)\n" -"{\n" -" float4 ans;\n" -" ans.x = dot3F4( a.m_row[0], b );\n" -" ans.y = dot3F4( a.m_row[1], b );\n" -" ans.z = dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b)\n" -"{\n" -" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" float4 ans;\n" -" ans.x = dot3F4( a, colx );\n" -" ans.y = dot3F4( a, coly );\n" -" ans.z = dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"#define WG_SIZE 64\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" 
-" Quaternion m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_shapeIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} Body;\n" -"typedef struct\n" -"{\n" -" Matrix3x3 m_invInertia;\n" -" Matrix3x3 m_initInvInertia;\n" -"} Shape;\n" -"typedef struct\n" -"{\n" -" float4 m_linear;\n" -" float4 m_worldPos[4];\n" -" float4 m_center; \n" -" float m_jacCoeffInv[4];\n" -" float m_b[4];\n" -" float m_appliedRambdaDt[4];\n" -" float m_fJacCoeffInv[2]; \n" -" float m_fAppliedRambdaDt[2]; \n" -" u32 m_bodyA;\n" -" u32 m_bodyB;\n" -" int m_batchIdx;\n" -" u32 m_paddings[1];\n" -"} Constraint4;\n" -"typedef struct\n" -"{\n" -" int m_nConstraints;\n" -" int m_start;\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBuffer;\n" -"typedef struct\n" -"{\n" -" int m_solveFriction;\n" -" int m_maxBatch; // long batch really kills the performance\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBufferBatchSolve;\n" -"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" -"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" -"{\n" -" *linear = mymake_float4(-n.xyz,0.f);\n" -" *angular0 = -cross3(r0, n);\n" -" *angular1 = cross3(r1, n);\n" -"}\n" -"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" -"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" -"{\n" -" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" -"}\n" -"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" -" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" -"float 
calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" -" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" -"{\n" -" // linear0,1 are normlized\n" -" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" -" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" -" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" -" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" -" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" -"}\n" -"void solveContact(__global Constraint4* cs,\n" -" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" -" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n" -"void solveContact(__global Constraint4* cs,\n" -" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" -" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n" -"{\n" -" float minRambdaDt = 0;\n" -" float maxRambdaDt = FLT_MAX;\n" -" for(int ic=0; ic<4; ic++)\n" -" {\n" -" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" -" float4 angular0, angular1, linear;\n" -" float4 r0 = cs->m_worldPos[ic] - posA;\n" -" float4 r1 = cs->m_worldPos[ic] - posB;\n" -" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" -" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" -" *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n" -" rambdaDt *= cs->m_jacCoeffInv[ic];\n" -" {\n" -" float prevSum = cs->m_appliedRambdaDt[ic];\n" -" float updated = prevSum;\n" -" updated += rambdaDt;\n" -" updated = max2( updated, minRambdaDt );\n" -" updated = min2( updated, maxRambdaDt );\n" -" rambdaDt = updated - prevSum;\n" -" cs->m_appliedRambdaDt[ic] = updated;\n" -" }\n" -" float4 linImp0 = invMassA*linear*rambdaDt;\n" -" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" -" float4 
angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" -" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" -" *linVelA += linImp0;\n" -" *angVelA += angImp0;\n" -" *linVelB += linImp1;\n" -" *angVelB += angImp1;\n" -" }\n" -"}\n" -"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" -" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" -"{\n" -" if (fabs(n[0].z) > 0.70710678f) {\n" -" // choose p in y-z plane\n" -" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = 0;\n" -" p[0].y = -n[0].z*k;\n" -" p[0].z = n[0].y*k;\n" -" // set q = n x p\n" -" q[0].x = a*k;\n" -" q[0].y = -n[0].x*p[0].z;\n" -" q[0].z = n[0].x*p[0].y;\n" -" }\n" -" else {\n" -" // choose p in x-y plane\n" -" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = -n[0].y*k;\n" -" p[0].y = n[0].x*k;\n" -" p[0].z = 0;\n" -" // set q = n x p\n" -" q[0].x = -n[0].z*p[0].y;\n" -" q[0].y = n[0].z*p[0].x;\n" -" q[0].z = a*k;\n" -" }\n" -"}\n" -"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" -"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" -"{\n" -" //float frictionCoeff = ldsCs[0].m_linear.w;\n" -" int aIdx = ldsCs[0].m_bodyA;\n" -" int bIdx = ldsCs[0].m_bodyB;\n" -" float4 posA = gBodies[aIdx].m_pos;\n" -" float4 linVelA = gBodies[aIdx].m_linVel;\n" -" float4 angVelA = gBodies[aIdx].m_angVel;\n" -" float invMassA = gBodies[aIdx].m_invMass;\n" -" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" -" float4 posB = gBodies[bIdx].m_pos;\n" -" float4 linVelB = gBodies[bIdx].m_linVel;\n" -" float4 angVelB = gBodies[bIdx].m_angVel;\n" -" float invMassB = gBodies[bIdx].m_invMass;\n" -" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" -" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" -" posB, &linVelB, &angVelB, invMassB, invInertiaB );\n" -" if 
(gBodies[aIdx].m_invMass)\n" -" {\n" -" gBodies[aIdx].m_linVel = linVelA;\n" -" gBodies[aIdx].m_angVel = angVelA;\n" -" } else\n" -" {\n" -" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" -" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" -" \n" -" }\n" -" if (gBodies[bIdx].m_invMass)\n" -" {\n" -" gBodies[bIdx].m_linVel = linVelB;\n" -" gBodies[bIdx].m_angVel = angVelB;\n" -" } else\n" -" {\n" -" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" -" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" -" \n" -" }\n" -"}\n" -"typedef struct \n" -"{\n" -" int m_valInt0;\n" -" int m_valInt1;\n" -" int m_valInt2;\n" -" int m_valInt3;\n" -" float m_val0;\n" -" float m_val1;\n" -" float m_val2;\n" -" float m_val3;\n" -"} SolverDebugInfo;\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void BatchSolveKernelContact(__global Body* gBodies,\n" -" __global Shape* gShapes,\n" -" __global Constraint4* gConstraints,\n" -" __global int* gN,\n" -" __global int* gOffsets,\n" -" __global int* batchSizes,\n" -" int maxBatch1,\n" -" int cellBatch,\n" -" int4 nSplit\n" -" )\n" -"{\n" -" //__local int ldsBatchIdx[WG_SIZE+1];\n" -" __local int ldsCurBatch;\n" -" __local int ldsNextBatch;\n" -" __local int ldsStart;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" int wgIdx = GET_GROUP_IDX;\n" -"// int gIdx = GET_GLOBAL_IDX;\n" -"// debugInfo[gIdx].m_valInt0 = gIdx;\n" -" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" -" \n" -" \n" -" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" -" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" -" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" -" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" -" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" -" //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" -" //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" -" //int cellIdx = xIdx+yIdx*nSplit;\n" -" \n" -" if( gN[cellIdx] == 0 ) \n" -" return;\n" -" int maxBatch = 
batchSizes[cellIdx];\n" -" \n" -" \n" -" const int start = gOffsets[cellIdx];\n" -" const int end = start + gN[cellIdx];\n" -" \n" -" \n" -" \n" -" if( lIdx == 0 )\n" -" {\n" -" ldsCurBatch = 0;\n" -" ldsNextBatch = 0;\n" -" ldsStart = start;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" int idx=ldsStart+lIdx;\n" -" while (ldsCurBatch < maxBatch)\n" -" {\n" -" for(; idx<end; )\n" -" {\n" -" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" -" {\n" -" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" -" idx+=64;\n" -" } else\n" -" {\n" -" break;\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" \n" -" if( lIdx == 0 )\n" -" {\n" -" ldsCurBatch++;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -" \n" -" \n" -"}\n" -"__kernel void solveSingleContactKernel(__global Body* gBodies,\n" -" __global Shape* gShapes,\n" -" __global Constraint4* gConstraints,\n" -" int cellIdx,\n" -" int batchOffset,\n" -" int numConstraintsInBatch\n" -" )\n" -"{\n" -" int index = get_global_id(0);\n" -" if (index < numConstraintsInBatch)\n" -" {\n" -" int idx=batchOffset+index;\n" -" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" -" } \n" -"}\n" -; +static const char* solveContactCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. 
Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile global int*\n" + "#endif\n" + "typedef unsigned int u32;\n" + "typedef unsigned short u16;\n" + "typedef unsigned char u8;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define mymake_float4 (float4)\n" + "//#define make_float2 (float2)\n" + "//#define make_uint4 (uint4)\n" + "//#define make_int4 (int4)\n" + "//#define make_uint2 (uint2)\n" + "//#define make_int2 (int2)\n" + "#define max2 max\n" + "#define min2 min\n" + "///////////////////////////////////////\n" + "// Vector\n" + "///////////////////////////////////////\n" 
+ "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " return fast_normalize(v);\n" + "}\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + "}\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = mymake_float4(a.xyz,0.f);\n" + " float4 b1 = mymake_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "// float length = sqrtf(dot3F4(a, a));\n" + "// return 1.f/length * a;\n" + "}\n" + "///////////////////////////////////////\n" + "// Matrix3x3\n" + "///////////////////////////////////////\n" + "typedef struct\n" + "{\n" + " float4 m_row[3];\n" + "}Matrix3x3;\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b);\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b);\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b)\n" + "{\n" + " float4 ans;\n" + " ans.x = dot3F4( a.m_row[0], b );\n" + " ans.y = dot3F4( a.m_row[1], b );\n" + " ans.z = dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b)\n" + "{\n" + " float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " float4 ans;\n" + " ans.x = dot3F4( a, colx );\n" + " ans.y = dot3F4( a, coly );\n" + " ans.z = dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "#define WG_SIZE 64\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " Quaternion m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_shapeIdx;\n" + " float m_invMass;\n" + " float 
m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} Body;\n" + "typedef struct\n" + "{\n" + " Matrix3x3 m_invInertia;\n" + " Matrix3x3 m_initInvInertia;\n" + "} Shape;\n" + "typedef struct\n" + "{\n" + " float4 m_linear;\n" + " float4 m_worldPos[4];\n" + " float4 m_center; \n" + " float m_jacCoeffInv[4];\n" + " float m_b[4];\n" + " float m_appliedRambdaDt[4];\n" + " float m_fJacCoeffInv[2]; \n" + " float m_fAppliedRambdaDt[2]; \n" + " u32 m_bodyA;\n" + " u32 m_bodyB;\n" + " int m_batchIdx;\n" + " u32 m_paddings[1];\n" + "} Constraint4;\n" + "typedef struct\n" + "{\n" + " int m_nConstraints;\n" + " int m_start;\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBuffer;\n" + "typedef struct\n" + "{\n" + " int m_solveFriction;\n" + " int m_maxBatch; // long batch really kills the performance\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBufferBatchSolve;\n" + "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" + "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" + "{\n" + " *linear = mymake_float4(-n.xyz,0.f);\n" + " *angular0 = -cross3(r0, n);\n" + " *angular1 = cross3(r1, n);\n" + "}\n" + "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" + "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" + "{\n" + " return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" + "}\n" + "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" + " float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" + "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, 
const float4 angular1,\n" + " float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" + "{\n" + " // linear0,1 are normlized\n" + " float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" + " float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" + " float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" + " float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" + " return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" + "}\n" + "void solveContact(__global Constraint4* cs,\n" + " float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" + " float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n" + "void solveContact(__global Constraint4* cs,\n" + " float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" + " float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n" + "{\n" + " float minRambdaDt = 0;\n" + " float maxRambdaDt = FLT_MAX;\n" + " for(int ic=0; ic<4; ic++)\n" + " {\n" + " if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" + " float4 angular0, angular1, linear;\n" + " float4 r0 = cs->m_worldPos[ic] - posA;\n" + " float4 r1 = cs->m_worldPos[ic] - posB;\n" + " setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" + " float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" + " *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n" + " rambdaDt *= cs->m_jacCoeffInv[ic];\n" + " {\n" + " float prevSum = cs->m_appliedRambdaDt[ic];\n" + " float updated = prevSum;\n" + " updated += rambdaDt;\n" + " updated = max2( updated, minRambdaDt );\n" + " updated = min2( updated, maxRambdaDt );\n" + " rambdaDt = updated - prevSum;\n" + " cs->m_appliedRambdaDt[ic] = updated;\n" + " }\n" + " float4 linImp0 = invMassA*linear*rambdaDt;\n" + " float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" + " float4 angImp0 = mtMul1(invInertiaA, 
angular0)*rambdaDt;\n" + " float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" + " *linVelA += linImp0;\n" + " *angVelA += angImp0;\n" + " *linVelB += linImp1;\n" + " *angVelB += angImp1;\n" + " }\n" + "}\n" + "void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" + " void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" + "{\n" + " if (fabs(n[0].z) > 0.70710678f) {\n" + " // choose p in y-z plane\n" + " float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = 0;\n" + " p[0].y = -n[0].z*k;\n" + " p[0].z = n[0].y*k;\n" + " // set q = n x p\n" + " q[0].x = a*k;\n" + " q[0].y = -n[0].x*p[0].z;\n" + " q[0].z = n[0].x*p[0].y;\n" + " }\n" + " else {\n" + " // choose p in x-y plane\n" + " float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = -n[0].y*k;\n" + " p[0].y = n[0].x*k;\n" + " p[0].z = 0;\n" + " // set q = n x p\n" + " q[0].x = -n[0].z*p[0].y;\n" + " q[0].y = n[0].z*p[0].x;\n" + " q[0].z = a*k;\n" + " }\n" + "}\n" + "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" + "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" + "{\n" + " //float frictionCoeff = ldsCs[0].m_linear.w;\n" + " int aIdx = ldsCs[0].m_bodyA;\n" + " int bIdx = ldsCs[0].m_bodyB;\n" + " float4 posA = gBodies[aIdx].m_pos;\n" + " float4 linVelA = gBodies[aIdx].m_linVel;\n" + " float4 angVelA = gBodies[aIdx].m_angVel;\n" + " float invMassA = gBodies[aIdx].m_invMass;\n" + " Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" + " float4 posB = gBodies[bIdx].m_pos;\n" + " float4 linVelB = gBodies[bIdx].m_linVel;\n" + " float4 angVelB = gBodies[bIdx].m_angVel;\n" + " float invMassB = gBodies[bIdx].m_invMass;\n" + " Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" + " solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" + " posB, &linVelB, &angVelB, invMassB, invInertiaB 
);\n" + " if (gBodies[aIdx].m_invMass)\n" + " {\n" + " gBodies[aIdx].m_linVel = linVelA;\n" + " gBodies[aIdx].m_angVel = angVelA;\n" + " } else\n" + " {\n" + " gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" + " gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" + " \n" + " }\n" + " if (gBodies[bIdx].m_invMass)\n" + " {\n" + " gBodies[bIdx].m_linVel = linVelB;\n" + " gBodies[bIdx].m_angVel = angVelB;\n" + " } else\n" + " {\n" + " gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" + " gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" + " \n" + " }\n" + "}\n" + "typedef struct \n" + "{\n" + " int m_valInt0;\n" + " int m_valInt1;\n" + " int m_valInt2;\n" + " int m_valInt3;\n" + " float m_val0;\n" + " float m_val1;\n" + " float m_val2;\n" + " float m_val3;\n" + "} SolverDebugInfo;\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void BatchSolveKernelContact(__global Body* gBodies,\n" + " __global Shape* gShapes,\n" + " __global Constraint4* gConstraints,\n" + " __global int* gN,\n" + " __global int* gOffsets,\n" + " __global int* batchSizes,\n" + " int maxBatch1,\n" + " int cellBatch,\n" + " int4 nSplit\n" + " )\n" + "{\n" + " //__local int ldsBatchIdx[WG_SIZE+1];\n" + " __local int ldsCurBatch;\n" + " __local int ldsNextBatch;\n" + " __local int ldsStart;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " int wgIdx = GET_GROUP_IDX;\n" + "// int gIdx = GET_GLOBAL_IDX;\n" + "// debugInfo[gIdx].m_valInt0 = gIdx;\n" + " //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" + " \n" + " \n" + " int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" + " int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" + " int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" + " int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" + " int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" + " //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" + " //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" + " //int cellIdx = xIdx+yIdx*nSplit;\n" + " \n" + 
" if( gN[cellIdx] == 0 ) \n" + " return;\n" + " int maxBatch = batchSizes[cellIdx];\n" + " \n" + " \n" + " const int start = gOffsets[cellIdx];\n" + " const int end = start + gN[cellIdx];\n" + " \n" + " \n" + " \n" + " if( lIdx == 0 )\n" + " {\n" + " ldsCurBatch = 0;\n" + " ldsNextBatch = 0;\n" + " ldsStart = start;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " int idx=ldsStart+lIdx;\n" + " while (ldsCurBatch < maxBatch)\n" + " {\n" + " for(; idx<end; )\n" + " {\n" + " if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" + " {\n" + " solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" + " idx+=64;\n" + " } else\n" + " {\n" + " break;\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " \n" + " if( lIdx == 0 )\n" + " {\n" + " ldsCurBatch++;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + " \n" + " \n" + "}\n" + "__kernel void solveSingleContactKernel(__global Body* gBodies,\n" + " __global Shape* gShapes,\n" + " __global Constraint4* gConstraints,\n" + " int cellIdx,\n" + " int batchOffset,\n" + " int numConstraintsInBatch\n" + " )\n" + "{\n" + " int index = get_global_id(0);\n" + " if (index < numConstraintsInBatch)\n" + " {\n" + " int idx=batchOffset+index;\n" + " solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" + " } \n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.h index eb58674f22..9707cdb25d 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solveFriction.h @@ -1,421 +1,420 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* solveFrictionCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. 
\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile global int*\n" -"#endif\n" -"typedef unsigned int u32;\n" -"typedef unsigned short u16;\n" -"typedef unsigned char u8;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = 
atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define mymake_float4 (float4)\n" -"//#define make_float2 (float2)\n" -"//#define make_uint4 (uint4)\n" -"//#define make_int4 (int4)\n" -"//#define make_uint2 (uint2)\n" -"//#define make_int2 (int2)\n" -"#define max2 max\n" -"#define min2 min\n" -"///////////////////////////////////////\n" -"// Vector\n" -"///////////////////////////////////////\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" return fast_normalize(v);\n" -"}\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = mymake_float4(a.xyz,0.f);\n" -" float4 b1 = mymake_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"// float length = sqrtf(dot3F4(a, a));\n" -"// return 1.f/length * a;\n" -"}\n" -"///////////////////////////////////////\n" -"// Matrix3x3\n" -"///////////////////////////////////////\n" -"typedef struct\n" -"{\n" -" float4 m_row[3];\n" -"}Matrix3x3;\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b);\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b);\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b)\n" -"{\n" -" float4 ans;\n" -" ans.x = dot3F4( a.m_row[0], b );\n" -" ans.y = dot3F4( a.m_row[1], b );\n" -" ans.z = dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b)\n" -"{\n" -" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" float4 colz = 
mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" float4 ans;\n" -" ans.x = dot3F4( a, colx );\n" -" ans.y = dot3F4( a, coly );\n" -" ans.z = dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"#define WG_SIZE 64\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" Quaternion m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_shapeIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} Body;\n" -"typedef struct\n" -"{\n" -" Matrix3x3 m_invInertia;\n" -" Matrix3x3 m_initInvInertia;\n" -"} Shape;\n" -"typedef struct\n" -"{\n" -" float4 m_linear;\n" -" float4 m_worldPos[4];\n" -" float4 m_center; \n" -" float m_jacCoeffInv[4];\n" -" float m_b[4];\n" -" float m_appliedRambdaDt[4];\n" -" float m_fJacCoeffInv[2]; \n" -" float m_fAppliedRambdaDt[2]; \n" -" u32 m_bodyA;\n" -" u32 m_bodyB;\n" -" int m_batchIdx;\n" -" u32 m_paddings[1];\n" -"} Constraint4;\n" -"typedef struct\n" -"{\n" -" int m_nConstraints;\n" -" int m_start;\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBuffer;\n" -"typedef struct\n" -"{\n" -" int m_solveFriction;\n" -" int m_maxBatch; // long batch really kills the performance\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBufferBatchSolve;\n" -"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" -"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" -"{\n" -" *linear = mymake_float4(-n.xyz,0.f);\n" -" *angular0 = -cross3(r0, n);\n" -" *angular1 = cross3(r1, n);\n" -"}\n" -"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" -"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 
a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" -"{\n" -" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" -"}\n" -"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" -" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" -"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" -" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" -"{\n" -" // linear0,1 are normlized\n" -" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" -" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" -" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" -" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" -" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" -"}\n" -"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" -" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" -"{\n" -" if (fabs(n[0].z) > 0.70710678f) {\n" -" // choose p in y-z plane\n" -" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = 0;\n" -" p[0].y = -n[0].z*k;\n" -" p[0].z = n[0].y*k;\n" -" // set q = n x p\n" -" q[0].x = a*k;\n" -" q[0].y = -n[0].x*p[0].z;\n" -" q[0].z = n[0].x*p[0].y;\n" -" }\n" -" else {\n" -" // choose p in x-y plane\n" -" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = -n[0].y*k;\n" -" p[0].y = n[0].x*k;\n" -" p[0].z = 0;\n" -" // set q = n x p\n" -" q[0].x = -n[0].z*p[0].y;\n" -" q[0].y = n[0].z*p[0].x;\n" -" q[0].z = a*k;\n" -" }\n" -"}\n" -"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" -"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" -"{\n" -" float frictionCoeff = 
ldsCs[0].m_linear.w;\n" -" int aIdx = ldsCs[0].m_bodyA;\n" -" int bIdx = ldsCs[0].m_bodyB;\n" -" float4 posA = gBodies[aIdx].m_pos;\n" -" float4 linVelA = gBodies[aIdx].m_linVel;\n" -" float4 angVelA = gBodies[aIdx].m_angVel;\n" -" float invMassA = gBodies[aIdx].m_invMass;\n" -" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" -" float4 posB = gBodies[bIdx].m_pos;\n" -" float4 linVelB = gBodies[bIdx].m_linVel;\n" -" float4 angVelB = gBodies[bIdx].m_angVel;\n" -" float invMassB = gBodies[bIdx].m_invMass;\n" -" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" -" \n" -" {\n" -" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" -" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" -" float sum = 0;\n" -" for(int j=0; j<4; j++)\n" -" {\n" -" sum +=ldsCs[0].m_appliedRambdaDt[j];\n" -" }\n" -" frictionCoeff = 0.7f;\n" -" for(int j=0; j<4; j++)\n" -" {\n" -" maxRambdaDt[j] = frictionCoeff*sum;\n" -" minRambdaDt[j] = -maxRambdaDt[j];\n" -" }\n" -" \n" -"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" -"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" -" \n" -" \n" -" {\n" -" \n" -" __global Constraint4* cs = ldsCs;\n" -" \n" -" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" -" const float4 center = cs->m_center;\n" -" \n" -" float4 n = -cs->m_linear;\n" -" \n" -" float4 tangent[2];\n" -" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n" -" float4 angular0, angular1, linear;\n" -" float4 r0 = center - posA;\n" -" float4 r1 = center - posB;\n" -" for(int i=0; i<2; i++)\n" -" {\n" -" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" -" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" -" linVelA, angVelA, linVelB, angVelB );\n" -" rambdaDt *= cs->m_fJacCoeffInv[i];\n" -" \n" -" {\n" -" float prevSum = cs->m_fAppliedRambdaDt[i];\n" -" float updated = prevSum;\n" -" updated += rambdaDt;\n" -" updated = max2( updated, minRambdaDt[i] 
);\n" -" updated = min2( updated, maxRambdaDt[i] );\n" -" rambdaDt = updated - prevSum;\n" -" cs->m_fAppliedRambdaDt[i] = updated;\n" -" }\n" -" \n" -" float4 linImp0 = invMassA*linear*rambdaDt;\n" -" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" -" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" -" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" -" \n" -" linVelA += linImp0;\n" -" angVelA += angImp0;\n" -" linVelB += linImp1;\n" -" angVelB += angImp1;\n" -" }\n" -" { // angular damping for point constraint\n" -" float4 ab = normalize3( posB - posA );\n" -" float4 ac = normalize3( center - posA );\n" -" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" -" {\n" -" float angNA = dot3F4( n, angVelA );\n" -" float angNB = dot3F4( n, angVelB );\n" -" \n" -" angVelA -= (angNA*0.1f)*n;\n" -" angVelB -= (angNB*0.1f)*n;\n" -" }\n" -" }\n" -" }\n" -" \n" -" \n" -" }\n" -" if (gBodies[aIdx].m_invMass)\n" -" {\n" -" gBodies[aIdx].m_linVel = linVelA;\n" -" gBodies[aIdx].m_angVel = angVelA;\n" -" } else\n" -" {\n" -" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" -" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" -" }\n" -" if (gBodies[bIdx].m_invMass)\n" -" {\n" -" gBodies[bIdx].m_linVel = linVelB;\n" -" gBodies[bIdx].m_angVel = angVelB;\n" -" } else\n" -" {\n" -" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" -" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" -" }\n" -" \n" -"}\n" -"typedef struct \n" -"{\n" -" int m_valInt0;\n" -" int m_valInt1;\n" -" int m_valInt2;\n" -" int m_valInt3;\n" -" float m_val0;\n" -" float m_val1;\n" -" float m_val2;\n" -" float m_val3;\n" -"} SolverDebugInfo;\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void BatchSolveKernelFriction(__global Body* gBodies,\n" -" __global Shape* gShapes,\n" -" __global Constraint4* gConstraints,\n" -" __global int* gN,\n" -" __global int* gOffsets,\n" -" __global int* batchSizes,\n" -" int maxBatch1,\n" -" int 
cellBatch,\n" -" int4 nSplit\n" -" )\n" -"{\n" -" //__local int ldsBatchIdx[WG_SIZE+1];\n" -" __local int ldsCurBatch;\n" -" __local int ldsNextBatch;\n" -" __local int ldsStart;\n" -" int lIdx = GET_LOCAL_IDX;\n" -" int wgIdx = GET_GROUP_IDX;\n" -"// int gIdx = GET_GLOBAL_IDX;\n" -"// debugInfo[gIdx].m_valInt0 = gIdx;\n" -" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" -" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" -" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" -" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" -" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" -" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" -" \n" -" if( gN[cellIdx] == 0 ) \n" -" return;\n" -" int maxBatch = batchSizes[cellIdx];\n" -" const int start = gOffsets[cellIdx];\n" -" const int end = start + gN[cellIdx];\n" -" \n" -" if( lIdx == 0 )\n" -" {\n" -" ldsCurBatch = 0;\n" -" ldsNextBatch = 0;\n" -" ldsStart = start;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" int idx=ldsStart+lIdx;\n" -" while (ldsCurBatch < maxBatch)\n" -" {\n" -" for(; idx<end; )\n" -" {\n" -" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" -" {\n" -" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" -" idx+=64;\n" -" } else\n" -" {\n" -" break;\n" -" }\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" if( lIdx == 0 )\n" -" {\n" -" ldsCurBatch++;\n" -" }\n" -" GROUP_LDS_BARRIER;\n" -" }\n" -" \n" -" \n" -"}\n" -"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n" -" __global Shape* gShapes,\n" -" __global Constraint4* gConstraints,\n" -" int cellIdx,\n" -" int batchOffset,\n" -" int numConstraintsInBatch\n" -" )\n" -"{\n" -" int index = get_global_id(0);\n" -" if (index < numConstraintsInBatch)\n" -" {\n" -" \n" -" int idx=batchOffset+index;\n" -" \n" -" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" -" } \n" -"}\n" -; +static const char* solveFrictionCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. 
\n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile global int*\n" + "#endif\n" + "typedef unsigned int u32;\n" + "typedef unsigned short u16;\n" + "typedef unsigned char u8;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define 
AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define mymake_float4 (float4)\n" + "//#define make_float2 (float2)\n" + "//#define make_uint4 (uint4)\n" + "//#define make_int4 (int4)\n" + "//#define make_uint2 (uint2)\n" + "//#define make_int2 (int2)\n" + "#define max2 max\n" + "#define min2 min\n" + "///////////////////////////////////////\n" + "// Vector\n" + "///////////////////////////////////////\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " return fast_normalize(v);\n" + "}\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + "}\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = mymake_float4(a.xyz,0.f);\n" + " float4 b1 = mymake_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "// float length = sqrtf(dot3F4(a, a));\n" + "// return 1.f/length * a;\n" + "}\n" + "///////////////////////////////////////\n" + "// Matrix3x3\n" + "///////////////////////////////////////\n" + "typedef struct\n" + "{\n" + " float4 m_row[3];\n" + "}Matrix3x3;\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b);\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b);\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b)\n" + "{\n" + " float4 ans;\n" + " ans.x = dot3F4( a.m_row[0], b );\n" + " ans.y = dot3F4( a.m_row[1], b );\n" + " ans.z = dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b)\n" + "{\n" + " float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " float4 coly = 
mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " float4 ans;\n" + " ans.x = dot3F4( a, colx );\n" + " ans.y = dot3F4( a, coly );\n" + " ans.z = dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "#define WG_SIZE 64\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " Quaternion m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_shapeIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} Body;\n" + "typedef struct\n" + "{\n" + " Matrix3x3 m_invInertia;\n" + " Matrix3x3 m_initInvInertia;\n" + "} Shape;\n" + "typedef struct\n" + "{\n" + " float4 m_linear;\n" + " float4 m_worldPos[4];\n" + " float4 m_center; \n" + " float m_jacCoeffInv[4];\n" + " float m_b[4];\n" + " float m_appliedRambdaDt[4];\n" + " float m_fJacCoeffInv[2]; \n" + " float m_fAppliedRambdaDt[2]; \n" + " u32 m_bodyA;\n" + " u32 m_bodyB;\n" + " int m_batchIdx;\n" + " u32 m_paddings[1];\n" + "} Constraint4;\n" + "typedef struct\n" + "{\n" + " int m_nConstraints;\n" + " int m_start;\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBuffer;\n" + "typedef struct\n" + "{\n" + " int m_solveFriction;\n" + " int m_maxBatch; // long batch really kills the performance\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBufferBatchSolve;\n" + "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" + "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" + "{\n" + " *linear = mymake_float4(-n.xyz,0.f);\n" + " *angular0 = -cross3(r0, n);\n" + " *angular1 = cross3(r1, n);\n" + "}\n" + "float calcRelVel( float4 l0, float4 l1, 
float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" + "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" + "{\n" + " return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" + "}\n" + "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" + " float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" + "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" + " float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" + "{\n" + " // linear0,1 are normlized\n" + " float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" + " float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" + " float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" + " float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" + " return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" + "}\n" + "void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" + " void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" + "{\n" + " if (fabs(n[0].z) > 0.70710678f) {\n" + " // choose p in y-z plane\n" + " float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = 0;\n" + " p[0].y = -n[0].z*k;\n" + " p[0].z = n[0].y*k;\n" + " // set q = n x p\n" + " q[0].x = a*k;\n" + " q[0].y = -n[0].x*p[0].z;\n" + " q[0].z = n[0].x*p[0].y;\n" + " }\n" + " else {\n" + " // choose p in x-y plane\n" + " float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = -n[0].y*k;\n" + " p[0].y = n[0].x*k;\n" + " p[0].z = 0;\n" + " // set q = n x p\n" + " q[0].x = -n[0].z*p[0].y;\n" + " q[0].y = n[0].z*p[0].x;\n" + " q[0].z = a*k;\n" + " }\n" + "}\n" + "void solveFrictionConstraint(__global Body* gBodies, __global 
Shape* gShapes, __global Constraint4* ldsCs);\n" + "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" + "{\n" + " float frictionCoeff = ldsCs[0].m_linear.w;\n" + " int aIdx = ldsCs[0].m_bodyA;\n" + " int bIdx = ldsCs[0].m_bodyB;\n" + " float4 posA = gBodies[aIdx].m_pos;\n" + " float4 linVelA = gBodies[aIdx].m_linVel;\n" + " float4 angVelA = gBodies[aIdx].m_angVel;\n" + " float invMassA = gBodies[aIdx].m_invMass;\n" + " Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" + " float4 posB = gBodies[bIdx].m_pos;\n" + " float4 linVelB = gBodies[bIdx].m_linVel;\n" + " float4 angVelB = gBodies[bIdx].m_angVel;\n" + " float invMassB = gBodies[bIdx].m_invMass;\n" + " Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" + " \n" + " {\n" + " float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" + " float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" + " float sum = 0;\n" + " for(int j=0; j<4; j++)\n" + " {\n" + " sum +=ldsCs[0].m_appliedRambdaDt[j];\n" + " }\n" + " frictionCoeff = 0.7f;\n" + " for(int j=0; j<4; j++)\n" + " {\n" + " maxRambdaDt[j] = frictionCoeff*sum;\n" + " minRambdaDt[j] = -maxRambdaDt[j];\n" + " }\n" + " \n" + "// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" + "// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" + " \n" + " \n" + " {\n" + " \n" + " __global Constraint4* cs = ldsCs;\n" + " \n" + " if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" + " const float4 center = cs->m_center;\n" + " \n" + " float4 n = -cs->m_linear;\n" + " \n" + " float4 tangent[2];\n" + " btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n" + " float4 angular0, angular1, linear;\n" + " float4 r0 = center - posA;\n" + " float4 r1 = center - posB;\n" + " for(int i=0; i<2; i++)\n" + " {\n" + " setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" + " float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" + " 
linVelA, angVelA, linVelB, angVelB );\n" + " rambdaDt *= cs->m_fJacCoeffInv[i];\n" + " \n" + " {\n" + " float prevSum = cs->m_fAppliedRambdaDt[i];\n" + " float updated = prevSum;\n" + " updated += rambdaDt;\n" + " updated = max2( updated, minRambdaDt[i] );\n" + " updated = min2( updated, maxRambdaDt[i] );\n" + " rambdaDt = updated - prevSum;\n" + " cs->m_fAppliedRambdaDt[i] = updated;\n" + " }\n" + " \n" + " float4 linImp0 = invMassA*linear*rambdaDt;\n" + " float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" + " float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" + " float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" + " \n" + " linVelA += linImp0;\n" + " angVelA += angImp0;\n" + " linVelB += linImp1;\n" + " angVelB += angImp1;\n" + " }\n" + " { // angular damping for point constraint\n" + " float4 ab = normalize3( posB - posA );\n" + " float4 ac = normalize3( center - posA );\n" + " if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" + " {\n" + " float angNA = dot3F4( n, angVelA );\n" + " float angNB = dot3F4( n, angVelB );\n" + " \n" + " angVelA -= (angNA*0.1f)*n;\n" + " angVelB -= (angNB*0.1f)*n;\n" + " }\n" + " }\n" + " }\n" + " \n" + " \n" + " }\n" + " if (gBodies[aIdx].m_invMass)\n" + " {\n" + " gBodies[aIdx].m_linVel = linVelA;\n" + " gBodies[aIdx].m_angVel = angVelA;\n" + " } else\n" + " {\n" + " gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" + " gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" + " }\n" + " if (gBodies[bIdx].m_invMass)\n" + " {\n" + " gBodies[bIdx].m_linVel = linVelB;\n" + " gBodies[bIdx].m_angVel = angVelB;\n" + " } else\n" + " {\n" + " gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" + " gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" + " }\n" + " \n" + "}\n" + "typedef struct \n" + "{\n" + " int m_valInt0;\n" + " int m_valInt1;\n" + " int m_valInt2;\n" + " int m_valInt3;\n" + " float m_val0;\n" + " float m_val1;\n" + " float m_val2;\n" + " float m_val3;\n" + "} SolverDebugInfo;\n" + 
"__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void BatchSolveKernelFriction(__global Body* gBodies,\n" + " __global Shape* gShapes,\n" + " __global Constraint4* gConstraints,\n" + " __global int* gN,\n" + " __global int* gOffsets,\n" + " __global int* batchSizes,\n" + " int maxBatch1,\n" + " int cellBatch,\n" + " int4 nSplit\n" + " )\n" + "{\n" + " //__local int ldsBatchIdx[WG_SIZE+1];\n" + " __local int ldsCurBatch;\n" + " __local int ldsNextBatch;\n" + " __local int ldsStart;\n" + " int lIdx = GET_LOCAL_IDX;\n" + " int wgIdx = GET_GROUP_IDX;\n" + "// int gIdx = GET_GLOBAL_IDX;\n" + "// debugInfo[gIdx].m_valInt0 = gIdx;\n" + " //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" + " int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" + " int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" + " int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" + " int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" + " int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" + " \n" + " if( gN[cellIdx] == 0 ) \n" + " return;\n" + " int maxBatch = batchSizes[cellIdx];\n" + " const int start = gOffsets[cellIdx];\n" + " const int end = start + gN[cellIdx];\n" + " \n" + " if( lIdx == 0 )\n" + " {\n" + " ldsCurBatch = 0;\n" + " ldsNextBatch = 0;\n" + " ldsStart = start;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " int idx=ldsStart+lIdx;\n" + " while (ldsCurBatch < maxBatch)\n" + " {\n" + " for(; idx<end; )\n" + " {\n" + " if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" + " {\n" + " solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" + " idx+=64;\n" + " } else\n" + " {\n" + " break;\n" + " }\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " if( lIdx == 0 )\n" + " {\n" + " ldsCurBatch++;\n" + " }\n" + " GROUP_LDS_BARRIER;\n" + " }\n" + " \n" + " \n" + "}\n" + "__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n" + " __global Shape* gShapes,\n" + " __global Constraint4* gConstraints,\n" + " int cellIdx,\n" + " int 
batchOffset,\n" + " int numConstraintsInBatch\n" + " )\n" + "{\n" + " int index = get_global_id(0);\n" + " if (index < numConstraintsInBatch)\n" + " {\n" + " \n" + " int idx=batchOffset+index;\n" + " \n" + " solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" + " } \n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.h index eb1834ee00..d53db03181 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup.h @@ -1,703 +1,702 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* solverSetupCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. 
This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = 
dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return (int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#ifndef B3_CONTACT_CONSTRAINT5_H\n" -"#define B3_CONTACT_CONSTRAINT5_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3ContactConstraint4 b3ContactConstraint4_t;\n" -"struct b3ContactConstraint4\n" -"{\n" -" b3Float4 m_linear;//normal?\n" -" b3Float4 m_worldPos[4];\n" -" b3Float4 m_center; // friction\n" -" float m_jacCoeffInv[4];\n" -" float m_b[4];\n" -" float m_appliedRambdaDt[4];\n" -" float m_fJacCoeffInv[2]; // friction\n" -" float m_fAppliedRambdaDt[2]; // friction\n" -" unsigned int m_bodyA;\n" -" unsigned int m_bodyB;\n" -" int m_batchIdx;\n" -" unsigned int m_paddings;\n" -"};\n" -"//inline void setFrictionCoeff(float value) { m_linear[3] = value; }\n" -"inline float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) \n" -"{\n" -" return constraint->m_linear.w; \n" -"}\n" -"#endif //B3_CONTACT_CONSTRAINT5_H\n" -"#ifndef 
B3_RIGIDBODY_DATA_H\n" -"#define B3_RIGIDBODY_DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#define B3_QUAT_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Quat;\n" -" #define b3QuatConstArg const b3Quat\n" -" \n" -" \n" -"inline float4 b3FastNormalize4(float4 v)\n" -"{\n" -" v = (float4)(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -" \n" -"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" -"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" -"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" -"{\n" -" b3Quat ans;\n" -" ans = b3Cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" -"{\n" -" b3Quat q;\n" -" q=in;\n" -" //return b3FastNormalize4(in);\n" -" float len = native_sqrt(dot(q, q));\n" -" if(len > 0.f)\n" -" {\n" -" q *= 1.f / len;\n" -" }\n" -" else\n" -" {\n" -" q.x = q.y = q.z = 0.f;\n" -" q.w = 1.f;\n" -" }\n" -" return q;\n" -"}\n" -"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" b3Quat qInv = b3QuatInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline float4 
b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" return b3QuatRotate( b3QuatInvert( q ), vec );\n" -"}\n" -"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" -"{\n" -" return b3QuatRotate( orientation, point ) + (translation);\n" -"}\n" -" \n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifndef B3_MAT3x3_H\n" -"#define B3_MAT3x3_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"typedef struct\n" -"{\n" -" b3Float4 m_row[3];\n" -"}b3Mat3x3;\n" -"#define b3Mat3x3ConstArg const b3Mat3x3\n" -"#define b3GetRow(m,row) (m.m_row[row])\n" -"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" -"{\n" -" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" -" b3Mat3x3 out;\n" -" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" -" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" -" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" -" out.m_row[0].w = 0.f;\n" -" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" -" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" -" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" -" out.m_row[1].w = 0.f;\n" -" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" -" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" -" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" -" out.m_row[2].w = 0.f;\n" -" return out;\n" -"}\n" -"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = fabs(matIn.m_row[0]);\n" -" out.m_row[1] = fabs(matIn.m_row[1]);\n" -" out.m_row[2] = fabs(matIn.m_row[2]);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtZero();\n" -"__inline\n" -"b3Mat3x3 mtIdentity();\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" 
-"__inline\n" -"b3Mat3x3 mtZero()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(0.f);\n" -" m.m_row[1] = (b3Float4)(0.f);\n" -" m.m_row[2] = (b3Float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtIdentity()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(1,0,0,0);\n" -" m.m_row[1] = (b3Float4)(0,1,0,0);\n" -" m.m_row[2] = (b3Float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" -"{\n" -" b3Mat3x3 transB;\n" -" transB = mtTranspose( b );\n" -" b3Mat3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" -"{\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a.m_row[0], b );\n" -" ans.y = b3Dot3F4( a.m_row[1], b );\n" -" ans.z = b3Dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" -"{\n" -" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a, colx );\n" -" ans.y = b3Dot3F4( a, coly );\n" -" ans.z = b3Dot3F4( 
a, colz );\n" -" return ans;\n" -"}\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" -"struct b3RigidBodyData\n" -"{\n" -" b3Float4 m_pos;\n" -" b3Quat m_quat;\n" -" b3Float4 m_linVel;\n" -" b3Float4 m_angVel;\n" -" int m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"};\n" -"typedef struct b3InertiaData b3InertiaData_t;\n" -"struct b3InertiaData\n" -"{\n" -" b3Mat3x3 m_invInertiaWorld;\n" -" b3Mat3x3 m_initInvInertia;\n" -"};\n" -"#endif //B3_RIGIDBODY_DATA_H\n" -" \n" -"void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q);\n" -" void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q)\n" -"{\n" -" if (b3Fabs(n.z) > 0.70710678f) {\n" -" // choose p in y-z plane\n" -" float a = n.y*n.y + n.z*n.z;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = 0;\n" -" p[0].y = -n.z*k;\n" -" p[0].z = n.y*k;\n" -" // set q = n x p\n" -" q[0].x = a*k;\n" -" q[0].y = -n.x*p[0].z;\n" -" q[0].z = n.x*p[0].y;\n" -" }\n" -" else {\n" -" // choose p in x-y plane\n" -" float a = n.x*n.x + n.y*n.y;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = -n.y*k;\n" -" p[0].y = n.x*k;\n" -" p[0].z = 0;\n" -" // set q = n x p\n" -" q[0].x = -n.z*p[0].y;\n" -" q[0].y = n.z*p[0].x;\n" -" q[0].z = a*k;\n" -" }\n" -"}\n" -" \n" -"void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)\n" -"{\n" -" *linear = b3MakeFloat4(n.x,n.y,n.z,0.f);\n" -" *angular0 = b3Cross3(r0, n);\n" -" *angular1 = -b3Cross3(r1, n);\n" -"}\n" -"float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,\n" -" b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 )\n" -"{\n" -" return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1);\n" -"}\n" -"float calcJacCoeff(b3Float4ConstArg linear0, 
b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,\n" -" float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)\n" -"{\n" -" // linear0,1 are normlized\n" -" float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;\n" -" float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" -" float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;\n" -" float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" -" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" -"}\n" -"void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,\n" -" b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, \n" -" __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n" -" b3ContactConstraint4_t* dstC )\n" -"{\n" -" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" -" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" -" float dtInv = 1.f/dt;\n" -" for(int ic=0; ic<4; ic++)\n" -" {\n" -" dstC->m_appliedRambdaDt[ic] = 0.f;\n" -" }\n" -" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" -" dstC->m_linear = src->m_worldNormalOnB;\n" -" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" -" for(int ic=0; ic<4; ic++)\n" -" {\n" -" b3Float4 r0 = src->m_worldPosB[ic] - posA;\n" -" b3Float4 r1 = src->m_worldPosB[ic] - posB;\n" -" if( ic >= src->m_worldNormalOnB.w )//npoints\n" -" {\n" -" dstC->m_jacCoeffInv[ic] = 0.f;\n" -" continue;\n" -" }\n" -" float relVelN;\n" -" {\n" -" b3Float4 linear, angular0, angular1;\n" -" setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" -" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" -" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" -" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" -" linVelA, 
angVelA, linVelB, angVelB);\n" -" float e = 0.f;//src->getRestituitionCoeff();\n" -" if( relVelN*relVelN < 0.004f ) e = 0.f;\n" -" dstC->m_b[ic] = e*relVelN;\n" -" //float penetration = src->m_worldPosB[ic].w;\n" -" dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" -" dstC->m_appliedRambdaDt[ic] = 0.f;\n" -" }\n" -" }\n" -" if( src->m_worldNormalOnB.w > 0 )//npoints\n" -" { // prepare friction\n" -" b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" -" for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" -" center += src->m_worldPosB[i];\n" -" center /= (float)src->m_worldNormalOnB.w;\n" -" b3Float4 tangent[2];\n" -" b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" -" \n" -" b3Float4 r[2];\n" -" r[0] = center - posA;\n" -" r[1] = center - posB;\n" -" for(int i=0; i<2; i++)\n" -" {\n" -" b3Float4 linear, angular0, angular1;\n" -" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" -" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" -" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" -" dstC->m_fAppliedRambdaDt[i] = 0.f;\n" -" }\n" -" dstC->m_center = center;\n" -" }\n" -" for(int i=0; i<4; i++)\n" -" {\n" -" if( i<src->m_worldNormalOnB.w )\n" -" {\n" -" dstC->m_worldPos[i] = src->m_worldPosB[i];\n" -" }\n" -" else\n" -" {\n" -" dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" -" }\n" -" }\n" -"}\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile global int*\n" -"#endif\n" -"typedef unsigned int u32;\n" 
-"typedef unsigned short u16;\n" -"typedef unsigned char u8;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_float4 (float4)\n" -"#define make_float2 (float2)\n" -"#define make_uint4 (uint4)\n" -"#define make_int4 (int4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"#define max2 max\n" -"#define min2 min\n" -"///////////////////////////////////////\n" -"// Vector\n" -"///////////////////////////////////////\n" -"__inline\n" -"float fastDiv(float numerator, float denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"// return numerator/denominator; \n" -"}\n" -"__inline\n" -"float4 fastDiv4(float4 numerator, float4 denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"}\n" -"__inline\n" -"float fastSqrtf(float f2)\n" -"{\n" -" return native_sqrt(f2);\n" -"// return sqrt(f2);\n" -"}\n" -"__inline\n" -"float fastRSqrt(float f2)\n" -"{\n" -" return native_rsqrt(f2);\n" -"}\n" -"__inline\n" -"float fastLength4(float4 v)\n" -"{\n" -" return fast_length(v);\n" -"}\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" return fast_normalize(v);\n" -"}\n" -"__inline\n" -"float sqrtf(float a)\n" -"{\n" -"// return sqrt(a);\n" -" return native_sqrt(a);\n" -"}\n" -"__inline\n" -"float4 cross3(float4 a, float4 
b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float length3(const float4 a)\n" -"{\n" -" return sqrtf(dot3F4(a,a));\n" -"}\n" -"__inline\n" -"float dot4(const float4 a, const float4 b)\n" -"{\n" -" return dot( a, b );\n" -"}\n" -"// for height\n" -"__inline\n" -"float dot3w1(const float4 point, const float4 eqn)\n" -"{\n" -" return dot3F4(point,eqn) + eqn.w;\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"// float length = sqrtf(dot3F4(a, a));\n" -"// return 1.f/length * a;\n" -"}\n" -"__inline\n" -"float4 normalize4(const float4 a)\n" -"{\n" -" float length = sqrtf(dot4(a, a));\n" -" return 1.f/length * a;\n" -"}\n" -"__inline\n" -"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" -"{\n" -" float4 eqn;\n" -" float4 ab = b-a;\n" -" float4 ac = c-a;\n" -" eqn = normalize3( cross3(ab, ac) );\n" -" eqn.w = -dot3F4(eqn,a);\n" -" return eqn;\n" -"}\n" -"#define WG_SIZE 64\n" -"typedef struct\n" -"{\n" -" int m_nConstraints;\n" -" int m_start;\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBuffer;\n" -"typedef struct\n" -"{\n" -" int m_solveFriction;\n" -" int m_maxBatch; // long batch really kills the performance\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBufferBatchSolve;\n" -" \n" -"typedef struct \n" -"{\n" -" int m_valInt0;\n" -" int m_valInt1;\n" -" int m_valInt2;\n" -" int m_valInt3;\n" -" float m_val0;\n" -" float m_val1;\n" -" float m_val2;\n" -" float m_val3;\n" -"} SolverDebugInfo;\n" -"typedef struct\n" -"{\n" -" int m_nContacts;\n" -" float m_dt;\n" -" float m_positionDrift;\n" -" float m_positionConstraintCoeff;\n" -"} ConstBufferCTC;\n" -"__kernel\n" 
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, \n" -"int nContacts,\n" -"float dt,\n" -"float positionDrift,\n" -"float positionConstraintCoeff\n" -")\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" \n" -" if( gIdx < nContacts )\n" -" {\n" -" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" -" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" -" float4 posA = gBodies[aIdx].m_pos;\n" -" float4 linVelA = gBodies[aIdx].m_linVel;\n" -" float4 angVelA = gBodies[aIdx].m_angVel;\n" -" float invMassA = gBodies[aIdx].m_invMass;\n" -" b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;\n" -" float4 posB = gBodies[bIdx].m_pos;\n" -" float4 linVelB = gBodies[bIdx].m_linVel;\n" -" float4 angVelB = gBodies[bIdx].m_angVel;\n" -" float invMassB = gBodies[bIdx].m_invMass;\n" -" b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;\n" -" b3ContactConstraint4_t cs;\n" -" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" -" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n" -" &cs );\n" -" \n" -" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" -" gConstraintOut[gIdx] = cs;\n" -" }\n" -"}\n" -; +static const char* solverSetupCL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int 
b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + "inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#ifndef B3_CONTACT_CONSTRAINT5_H\n" + "#define B3_CONTACT_CONSTRAINT5_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3ContactConstraint4 b3ContactConstraint4_t;\n" + "struct b3ContactConstraint4\n" + "{\n" + " b3Float4 m_linear;//normal?\n" + " b3Float4 m_worldPos[4];\n" + " b3Float4 m_center; // friction\n" + " float m_jacCoeffInv[4];\n" + " float m_b[4];\n" + " float m_appliedRambdaDt[4];\n" + " float m_fJacCoeffInv[2]; // friction\n" + " float 
m_fAppliedRambdaDt[2]; // friction\n" + " unsigned int m_bodyA;\n" + " unsigned int m_bodyB;\n" + " int m_batchIdx;\n" + " unsigned int m_paddings;\n" + "};\n" + "//inline void setFrictionCoeff(float value) { m_linear[3] = value; }\n" + "inline float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) \n" + "{\n" + " return constraint->m_linear.w; \n" + "}\n" + "#endif //B3_CONTACT_CONSTRAINT5_H\n" + "#ifndef B3_RIGIDBODY_DATA_H\n" + "#define B3_RIGIDBODY_DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#define B3_QUAT_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Quat;\n" + " #define b3QuatConstArg const b3Quat\n" + " \n" + " \n" + "inline float4 b3FastNormalize4(float4 v)\n" + "{\n" + " v = (float4)(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + " \n" + "inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" + "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" + "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" + "{\n" + " b3Quat ans;\n" + " ans = b3Cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - b3Dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" + "{\n" + " b3Quat q;\n" + " q=in;\n" + " //return b3FastNormalize4(in);\n" + " float len = native_sqrt(dot(q, q));\n" + " if(len > 0.f)\n" + " {\n" + " q *= 1.f / len;\n" + " }\n" + " else\n" + " {\n" + " q.x = q.y = q.z = 0.f;\n" + " q.w = 1.f;\n" + " }\n" + " return q;\n" + "}\n" + 
"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " b3Quat qInv = b3QuatInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " return b3QuatRotate( b3QuatInvert( q ), vec );\n" + "}\n" + "inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" + "{\n" + " return b3QuatRotate( orientation, point ) + (translation);\n" + "}\n" + " \n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifndef B3_MAT3x3_H\n" + "#define B3_MAT3x3_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "typedef struct\n" + "{\n" + " b3Float4 m_row[3];\n" + "}b3Mat3x3;\n" + "#define b3Mat3x3ConstArg const b3Mat3x3\n" + "#define b3GetRow(m,row) (m.m_row[row])\n" + "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" + "{\n" + " b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" + " b3Mat3x3 out;\n" + " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" + " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" + " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" + " out.m_row[0].w = 0.f;\n" + " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" + " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" + " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" + " out.m_row[1].w = 0.f;\n" + " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" + " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" + " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" + " out.m_row[2].w = 0.f;\n" + " return out;\n" + "}\n" + "inline b3Mat3x3 
b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = fabs(matIn.m_row[0]);\n" + " out.m_row[1] = fabs(matIn.m_row[1]);\n" + " out.m_row[2] = fabs(matIn.m_row[2]);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtZero();\n" + "__inline\n" + "b3Mat3x3 mtIdentity();\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Mat3x3 mtZero()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(0.f);\n" + " m.m_row[1] = (b3Float4)(0.f);\n" + " m.m_row[2] = (b3Float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtIdentity()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(1,0,0,0);\n" + " m.m_row[1] = (b3Float4)(0,1,0,0);\n" + " m.m_row[2] = (b3Float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" + "{\n" + " b3Mat3x3 transB;\n" + " transB = mtTranspose( b );\n" + " b3Mat3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" + "{\n" 
+ " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a.m_row[0], b );\n" + " ans.y = b3Dot3F4( a.m_row[1], b );\n" + " ans.z = b3Dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" + "{\n" + " b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a, colx );\n" + " ans.y = b3Dot3F4( a, coly );\n" + " ans.z = b3Dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3RigidBodyData b3RigidBodyData_t;\n" + "struct b3RigidBodyData\n" + "{\n" + " b3Float4 m_pos;\n" + " b3Quat m_quat;\n" + " b3Float4 m_linVel;\n" + " b3Float4 m_angVel;\n" + " int m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "};\n" + "typedef struct b3InertiaData b3InertiaData_t;\n" + "struct b3InertiaData\n" + "{\n" + " b3Mat3x3 m_invInertiaWorld;\n" + " b3Mat3x3 m_initInvInertia;\n" + "};\n" + "#endif //B3_RIGIDBODY_DATA_H\n" + " \n" + "void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q);\n" + " void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q)\n" + "{\n" + " if (b3Fabs(n.z) > 0.70710678f) {\n" + " // choose p in y-z plane\n" + " float a = n.y*n.y + n.z*n.z;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = 0;\n" + " p[0].y = -n.z*k;\n" + " p[0].z = n.y*k;\n" + " // set q = n x p\n" + " q[0].x = a*k;\n" + " q[0].y = -n.x*p[0].z;\n" + " q[0].z = n.x*p[0].y;\n" + " }\n" + " else {\n" + " // choose p in x-y plane\n" + " float a = n.x*n.x + n.y*n.y;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = -n.y*k;\n" + " p[0].y = n.x*k;\n" + " p[0].z = 0;\n" + " // set q = n x p\n" + " q[0].x = -n.z*p[0].y;\n" + " q[0].y = n.z*p[0].x;\n" + " q[0].z = a*k;\n" + " }\n" + "}\n" + " 
\n" + "void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)\n" + "{\n" + " *linear = b3MakeFloat4(n.x,n.y,n.z,0.f);\n" + " *angular0 = b3Cross3(r0, n);\n" + " *angular1 = -b3Cross3(r1, n);\n" + "}\n" + "float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,\n" + " b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 )\n" + "{\n" + " return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1);\n" + "}\n" + "float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,\n" + " float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)\n" + "{\n" + " // linear0,1 are normlized\n" + " float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;\n" + " float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" + " float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;\n" + " float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" + " return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" + "}\n" + "void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,\n" + " b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, \n" + " __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n" + " b3ContactConstraint4_t* dstC )\n" + "{\n" + " dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" + " dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" + " float dtInv = 1.f/dt;\n" + " for(int ic=0; ic<4; ic++)\n" + " {\n" + " dstC->m_appliedRambdaDt[ic] = 0.f;\n" + " }\n" + " dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" + " dstC->m_linear = 
src->m_worldNormalOnB;\n" + " dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" + " for(int ic=0; ic<4; ic++)\n" + " {\n" + " b3Float4 r0 = src->m_worldPosB[ic] - posA;\n" + " b3Float4 r1 = src->m_worldPosB[ic] - posB;\n" + " if( ic >= src->m_worldNormalOnB.w )//npoints\n" + " {\n" + " dstC->m_jacCoeffInv[ic] = 0.f;\n" + " continue;\n" + " }\n" + " float relVelN;\n" + " {\n" + " b3Float4 linear, angular0, angular1;\n" + " setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" + " dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" + " invMassA, &invInertiaA, invMassB, &invInertiaB );\n" + " relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" + " linVelA, angVelA, linVelB, angVelB);\n" + " float e = 0.f;//src->getRestituitionCoeff();\n" + " if( relVelN*relVelN < 0.004f ) e = 0.f;\n" + " dstC->m_b[ic] = e*relVelN;\n" + " //float penetration = src->m_worldPosB[ic].w;\n" + " dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" + " dstC->m_appliedRambdaDt[ic] = 0.f;\n" + " }\n" + " }\n" + " if( src->m_worldNormalOnB.w > 0 )//npoints\n" + " { // prepare friction\n" + " b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" + " for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" + " center += src->m_worldPosB[i];\n" + " center /= (float)src->m_worldNormalOnB.w;\n" + " b3Float4 tangent[2];\n" + " b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" + " \n" + " b3Float4 r[2];\n" + " r[0] = center - posA;\n" + " r[1] = center - posB;\n" + " for(int i=0; i<2; i++)\n" + " {\n" + " b3Float4 linear, angular0, angular1;\n" + " setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" + " dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" + " invMassA, &invInertiaA, invMassB, &invInertiaB );\n" + " dstC->m_fAppliedRambdaDt[i] = 0.f;\n" + " }\n" + " dstC->m_center = center;\n" + " }\n" + " for(int i=0; i<4; i++)\n" + 
" {\n" + " if( i<src->m_worldNormalOnB.w )\n" + " {\n" + " dstC->m_worldPos[i] = src->m_worldPosB[i];\n" + " }\n" + " else\n" + " {\n" + " dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" + " }\n" + " }\n" + "}\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile global int*\n" + "#endif\n" + "typedef unsigned int u32;\n" + "typedef unsigned short u16;\n" + "typedef unsigned char u8;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_float4 (float4)\n" + "#define make_float2 (float2)\n" + "#define make_uint4 (uint4)\n" + "#define make_int4 (int4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "#define max2 max\n" + "#define min2 min\n" + "///////////////////////////////////////\n" + "// Vector\n" + "///////////////////////////////////////\n" + "__inline\n" + "float fastDiv(float numerator, 
float denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "// return numerator/denominator; \n" + "}\n" + "__inline\n" + "float4 fastDiv4(float4 numerator, float4 denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "}\n" + "__inline\n" + "float fastSqrtf(float f2)\n" + "{\n" + " return native_sqrt(f2);\n" + "// return sqrt(f2);\n" + "}\n" + "__inline\n" + "float fastRSqrt(float f2)\n" + "{\n" + " return native_rsqrt(f2);\n" + "}\n" + "__inline\n" + "float fastLength4(float4 v)\n" + "{\n" + " return fast_length(v);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " return fast_normalize(v);\n" + "}\n" + "__inline\n" + "float sqrtf(float a)\n" + "{\n" + "// return sqrt(a);\n" + " return native_sqrt(a);\n" + "}\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + "}\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float length3(const float4 a)\n" + "{\n" + " return sqrtf(dot3F4(a,a));\n" + "}\n" + "__inline\n" + "float dot4(const float4 a, const float4 b)\n" + "{\n" + " return dot( a, b );\n" + "}\n" + "// for height\n" + "__inline\n" + "float dot3w1(const float4 point, const float4 eqn)\n" + "{\n" + " return dot3F4(point,eqn) + eqn.w;\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "// float length = sqrtf(dot3F4(a, a));\n" + "// return 1.f/length * a;\n" + "}\n" + "__inline\n" + "float4 normalize4(const float4 a)\n" + "{\n" + " float length = sqrtf(dot4(a, a));\n" + " return 1.f/length * a;\n" + "}\n" + "__inline\n" + "float4 createEquation(const float4 a, const float4 b, const float4 c)\n" + "{\n" + " float4 eqn;\n" + " float4 ab = b-a;\n" + " float4 ac = c-a;\n" + " eqn = 
normalize3( cross3(ab, ac) );\n" + " eqn.w = -dot3F4(eqn,a);\n" + " return eqn;\n" + "}\n" + "#define WG_SIZE 64\n" + "typedef struct\n" + "{\n" + " int m_nConstraints;\n" + " int m_start;\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBuffer;\n" + "typedef struct\n" + "{\n" + " int m_solveFriction;\n" + " int m_maxBatch; // long batch really kills the performance\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBufferBatchSolve;\n" + " \n" + "typedef struct \n" + "{\n" + " int m_valInt0;\n" + " int m_valInt1;\n" + " int m_valInt2;\n" + " int m_valInt3;\n" + " float m_val0;\n" + " float m_val1;\n" + " float m_val2;\n" + " float m_val3;\n" + "} SolverDebugInfo;\n" + "typedef struct\n" + "{\n" + " int m_nContacts;\n" + " float m_dt;\n" + " float m_positionDrift;\n" + " float m_positionConstraintCoeff;\n" + "} ConstBufferCTC;\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, \n" + "int nContacts,\n" + "float dt,\n" + "float positionDrift,\n" + "float positionConstraintCoeff\n" + ")\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " \n" + " if( gIdx < nContacts )\n" + " {\n" + " int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" + " int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" + " float4 posA = gBodies[aIdx].m_pos;\n" + " float4 linVelA = gBodies[aIdx].m_linVel;\n" + " float4 angVelA = gBodies[aIdx].m_angVel;\n" + " float invMassA = gBodies[aIdx].m_invMass;\n" + " b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;\n" + " float4 posB = gBodies[bIdx].m_pos;\n" + " float4 linVelB = gBodies[bIdx].m_linVel;\n" + " float4 angVelB = gBodies[bIdx].m_angVel;\n" + " float invMassB = gBodies[bIdx].m_invMass;\n" + " b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;\n" 
+ " b3ContactConstraint4_t cs;\n" + " setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" + " &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n" + " &cs );\n" + " \n" + " cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" + " gConstraintOut[gIdx] = cs;\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h index 1b5819f6cf..1e6e3579b6 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h @@ -1,601 +1,600 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* solverSetup2CL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. 
This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Takahiro Harada\n" -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = 
dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return (int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile global int*\n" -"#endif\n" -"typedef unsigned int u32;\n" -"typedef unsigned short u16;\n" -"typedef unsigned char u8;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE 
mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_float4 (float4)\n" -"#define make_float2 (float2)\n" -"#define make_uint4 (uint4)\n" -"#define make_int4 (int4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"#define max2 max\n" -"#define min2 min\n" -"///////////////////////////////////////\n" -"// Vector\n" -"///////////////////////////////////////\n" -"__inline\n" -"float fastDiv(float numerator, float denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"// return numerator/denominator; \n" -"}\n" -"__inline\n" -"float4 fastDiv4(float4 numerator, float4 denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"}\n" -"__inline\n" -"float fastSqrtf(float f2)\n" -"{\n" -" return native_sqrt(f2);\n" -"// return sqrt(f2);\n" -"}\n" -"__inline\n" -"float fastRSqrt(float f2)\n" -"{\n" -" return native_rsqrt(f2);\n" -"}\n" -"__inline\n" -"float fastLength4(float4 v)\n" -"{\n" -" return fast_length(v);\n" -"}\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" return fast_normalize(v);\n" -"}\n" -"__inline\n" -"float sqrtf(float a)\n" -"{\n" -"// return sqrt(a);\n" -" return native_sqrt(a);\n" -"}\n" -"__inline\n" -"float4 cross3(float4 a, float4 b)\n" -"{\n" -" return cross(a,b);\n" -"}\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float length3(const float4 a)\n" -"{\n" -" return sqrtf(dot3F4(a,a));\n" -"}\n" -"__inline\n" -"float dot4(const float4 a, const float4 
b)\n" -"{\n" -" return dot( a, b );\n" -"}\n" -"// for height\n" -"__inline\n" -"float dot3w1(const float4 point, const float4 eqn)\n" -"{\n" -" return dot3F4(point,eqn) + eqn.w;\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"// float length = sqrtf(dot3F4(a, a));\n" -"// return 1.f/length * a;\n" -"}\n" -"__inline\n" -"float4 normalize4(const float4 a)\n" -"{\n" -" float length = sqrtf(dot4(a, a));\n" -" return 1.f/length * a;\n" -"}\n" -"__inline\n" -"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" -"{\n" -" float4 eqn;\n" -" float4 ab = b-a;\n" -" float4 ac = c-a;\n" -" eqn = normalize3( cross3(ab, ac) );\n" -" eqn.w = -dot3F4(eqn,a);\n" -" return eqn;\n" -"}\n" -"///////////////////////////////////////\n" -"// Matrix3x3\n" -"///////////////////////////////////////\n" -"typedef struct\n" -"{\n" -" float4 m_row[3];\n" -"}Matrix3x3;\n" -"__inline\n" -"Matrix3x3 mtZero();\n" -"__inline\n" -"Matrix3x3 mtIdentity();\n" -"__inline\n" -"Matrix3x3 mtTranspose(Matrix3x3 m);\n" -"__inline\n" -"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b);\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b);\n" -"__inline\n" -"Matrix3x3 mtZero()\n" -"{\n" -" Matrix3x3 m;\n" -" m.m_row[0] = (float4)(0.f);\n" -" m.m_row[1] = (float4)(0.f);\n" -" m.m_row[2] = (float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"Matrix3x3 mtIdentity()\n" -"{\n" -" Matrix3x3 m;\n" -" m.m_row[0] = (float4)(1,0,0,0);\n" -" m.m_row[1] = (float4)(0,1,0,0);\n" -" m.m_row[2] = (float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"Matrix3x3 mtTranspose(Matrix3x3 m)\n" -"{\n" -" Matrix3x3 out;\n" -" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 
0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" -"{\n" -" Matrix3x3 transB;\n" -" transB = mtTranspose( b );\n" -" Matrix3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b)\n" -"{\n" -" float4 ans;\n" -" ans.x = dot3F4( a.m_row[0], b );\n" -" ans.y = dot3F4( a.m_row[1], b );\n" -" ans.z = dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b)\n" -"{\n" -" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" float4 ans;\n" -" ans.x = dot3F4( a, colx );\n" -" ans.y = dot3F4( a, coly );\n" -" ans.z = dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec);\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in)\n" 
-"{\n" -" return fastNormalize4(in);\n" -"// in /= length( in );\n" -"// return in;\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -"float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"#define WG_SIZE 64\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" Quaternion m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_shapeIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} Body;\n" -"typedef struct\n" -"{\n" -" Matrix3x3 m_invInertia;\n" -" Matrix3x3 m_initInvInertia;\n" -"} Shape;\n" -"typedef struct\n" -"{\n" -" float4 m_linear;\n" -" float4 m_worldPos[4];\n" -" float4 m_center; \n" -" float m_jacCoeffInv[4];\n" -" float m_b[4];\n" -" float m_appliedRambdaDt[4];\n" -" float m_fJacCoeffInv[2]; \n" -" float m_fAppliedRambdaDt[2]; \n" -" u32 m_bodyA;\n" -" u32 m_bodyB;\n" -" int m_batchIdx;\n" -" u32 m_paddings[1];\n" -"} Constraint4;\n" -"typedef struct\n" -"{\n" -" int m_nConstraints;\n" -" int m_start;\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBuffer;\n" -"typedef struct\n" -"{\n" -" int m_solveFriction;\n" -" int m_maxBatch; // long batch really kills the performance\n" -" int m_batchIdx;\n" -" int m_nSplit;\n" -"// int m_paddings[1];\n" -"} ConstBufferBatchSolve;\n" -" \n" -"typedef struct \n" -"{\n" -" int m_valInt0;\n" -" int m_valInt1;\n" -" int m_valInt2;\n" -" int m_valInt3;\n" -" float m_val0;\n" -" float m_val1;\n" -" float m_val2;\n" -" float m_val3;\n" -"} SolverDebugInfo;\n" -"// others\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void 
ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" -"{\n" -" int nContacts = cb.x;\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < nContacts )\n" -" {\n" -" int srcIdx = sortData[gIdx].y;\n" -" out[gIdx] = in[srcIdx];\n" -" }\n" -"}\n" -"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < nContacts )\n" -" {\n" -" int2 sd;\n" -" sd.x = contactsIn[gIdx].m_childIndexB;\n" -" sd.y = gIdx;\n" -" sortDataOut[gIdx] = sd;\n" -" }\n" -"}\n" -"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < nContacts )\n" -" {\n" -" int2 sdIn;\n" -" sdIn = sortDataInOut[gIdx];\n" -" int2 sdOut;\n" -" sdOut.x = contactsIn[sdIn.y].m_childIndexA;\n" -" sdOut.y = sdIn.y;\n" -" sortDataInOut[gIdx] = sdOut;\n" -" }\n" -"}\n" -"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < nContacts )\n" -" {\n" -" int2 sdIn;\n" -" sdIn = sortDataInOut[gIdx];\n" -" int2 sdOut;\n" -" sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;\n" -" sdOut.y = sdIn.y;\n" -" sortDataInOut[gIdx] = sdOut;\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < nContacts )\n" -" {\n" -" int2 sdIn;\n" -" sdIn = sortDataInOut[gIdx];\n" -" int2 sdOut;\n" -" 
sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;\n" -" sdOut.y = sdIn.y;\n" -" sortDataInOut[gIdx] = sdOut;\n" -" }\n" -"}\n" -"typedef struct\n" -"{\n" -" int m_nContacts;\n" -" int m_staticIdx;\n" -" float m_scale;\n" -" int m_nSplit;\n" -"} ConstBufferSSD;\n" -"__constant const int gridTable4x4[] = \n" -"{\n" -" 0,1,17,16,\n" -" 1,2,18,19,\n" -" 17,18,32,3,\n" -" 16,19,3,34\n" -"};\n" -"__constant const int gridTable8x8[] = \n" -"{\n" -" 0, 2, 3, 16, 17, 18, 19, 1,\n" -" 66, 64, 80, 67, 82, 81, 65, 83,\n" -" 131,144,128,130,147,129,145,146,\n" -" 208,195,194,192,193,211,210,209,\n" -" 21, 22, 23, 5, 4, 6, 7, 20,\n" -" 86, 85, 69, 87, 70, 68, 84, 71,\n" -" 151,133,149,150,135,148,132,134,\n" -" 197,27,214,213,212,199,198,196\n" -" \n" -"};\n" -"#define USE_SPATIAL_BATCHING 1\n" -"#define USE_4x4_GRID 1\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n" -"int nContacts,float scale,int4 nSplit,int staticIdx)\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" \n" -" if( gIdx < nContacts )\n" -" {\n" -" int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;\n" -" int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;\n" -" int aIdx = abs(aPtrAndSignBit );\n" -" int bIdx = abs(bPtrAndSignBit);\n" -" bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n" -" bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n" -"#if USE_SPATIAL_BATCHING \n" -" int idx = (aStatic)? 
bIdx: aIdx;\n" -" float4 p = gBodies[idx].m_pos;\n" -" int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);\n" -" int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);\n" -" int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);\n" -" int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);\n" -" \n" -"#else//USE_SPATIAL_BATCHING\n" -" #if USE_4x4_GRID\n" -" int aa = aIdx&3;\n" -" int bb = bIdx&3;\n" -" if (aStatic)\n" -" aa = bb;\n" -" if (bStatic)\n" -" bb = aa;\n" -" int gridIndex = aa + bb*4;\n" -" int newIndex = gridTable4x4[gridIndex];\n" -" #else//USE_4x4_GRID\n" -" int aa = aIdx&7;\n" -" int bb = bIdx&7;\n" -" if (aStatic)\n" -" aa = bb;\n" -" if (bStatic)\n" -" bb = aa;\n" -" int gridIndex = aa + bb*8;\n" -" int newIndex = gridTable8x8[gridIndex];\n" -" #endif//USE_4x4_GRID\n" -"#endif//USE_SPATIAL_BATCHING\n" -" gSortDataOut[gIdx].x = newIndex;\n" -" gSortDataOut[gIdx].y = gIdx;\n" -" }\n" -" else\n" -" {\n" -" gSortDataOut[gIdx].x = 0xffffffff;\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" if( gIdx < cb.x )\n" -" {\n" -" gOut[gIdx] = gIn[gIdx];\n" -" }\n" -"}\n" -; +static const char* solverSetup2CL = + "/*\n" + "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Takahiro Harada\n" + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int 
b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + "inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile global int*\n" + "#endif\n" + "typedef unsigned int 
u32;\n" + "typedef unsigned short u16;\n" + "typedef unsigned char u8;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_float4 (float4)\n" + "#define make_float2 (float2)\n" + "#define make_uint4 (uint4)\n" + "#define make_int4 (int4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "#define max2 max\n" + "#define min2 min\n" + "///////////////////////////////////////\n" + "// Vector\n" + "///////////////////////////////////////\n" + "__inline\n" + "float fastDiv(float numerator, float denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "// return numerator/denominator; \n" + "}\n" + "__inline\n" + "float4 fastDiv4(float4 numerator, float4 denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "}\n" + "__inline\n" + "float fastSqrtf(float f2)\n" + "{\n" + " return native_sqrt(f2);\n" + "// return sqrt(f2);\n" + "}\n" + "__inline\n" + "float fastRSqrt(float f2)\n" + "{\n" + " return native_rsqrt(f2);\n" + "}\n" + "__inline\n" + "float fastLength4(float4 v)\n" + "{\n" + " return fast_length(v);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " return fast_normalize(v);\n" + "}\n" + "__inline\n" + "float sqrtf(float a)\n" + "{\n" + "// return sqrt(a);\n" + " return 
native_sqrt(a);\n" + "}\n" + "__inline\n" + "float4 cross3(float4 a, float4 b)\n" + "{\n" + " return cross(a,b);\n" + "}\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float length3(const float4 a)\n" + "{\n" + " return sqrtf(dot3F4(a,a));\n" + "}\n" + "__inline\n" + "float dot4(const float4 a, const float4 b)\n" + "{\n" + " return dot( a, b );\n" + "}\n" + "// for height\n" + "__inline\n" + "float dot3w1(const float4 point, const float4 eqn)\n" + "{\n" + " return dot3F4(point,eqn) + eqn.w;\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "// float length = sqrtf(dot3F4(a, a));\n" + "// return 1.f/length * a;\n" + "}\n" + "__inline\n" + "float4 normalize4(const float4 a)\n" + "{\n" + " float length = sqrtf(dot4(a, a));\n" + " return 1.f/length * a;\n" + "}\n" + "__inline\n" + "float4 createEquation(const float4 a, const float4 b, const float4 c)\n" + "{\n" + " float4 eqn;\n" + " float4 ab = b-a;\n" + " float4 ac = c-a;\n" + " eqn = normalize3( cross3(ab, ac) );\n" + " eqn.w = -dot3F4(eqn,a);\n" + " return eqn;\n" + "}\n" + "///////////////////////////////////////\n" + "// Matrix3x3\n" + "///////////////////////////////////////\n" + "typedef struct\n" + "{\n" + " float4 m_row[3];\n" + "}Matrix3x3;\n" + "__inline\n" + "Matrix3x3 mtZero();\n" + "__inline\n" + "Matrix3x3 mtIdentity();\n" + "__inline\n" + "Matrix3x3 mtTranspose(Matrix3x3 m);\n" + "__inline\n" + "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b);\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b);\n" + "__inline\n" + "Matrix3x3 mtZero()\n" + "{\n" + " Matrix3x3 m;\n" + " m.m_row[0] = (float4)(0.f);\n" + " m.m_row[1] = (float4)(0.f);\n" + " m.m_row[2] = (float4)(0.f);\n" + " return 
m;\n" + "}\n" + "__inline\n" + "Matrix3x3 mtIdentity()\n" + "{\n" + " Matrix3x3 m;\n" + " m.m_row[0] = (float4)(1,0,0,0);\n" + " m.m_row[1] = (float4)(0,1,0,0);\n" + " m.m_row[2] = (float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "Matrix3x3 mtTranspose(Matrix3x3 m)\n" + "{\n" + " Matrix3x3 out;\n" + " out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" + "{\n" + " Matrix3x3 transB;\n" + " transB = mtTranspose( b );\n" + " Matrix3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b)\n" + "{\n" + " float4 ans;\n" + " ans.x = dot3F4( a.m_row[0], b );\n" + " ans.y = dot3F4( a.m_row[1], b );\n" + " ans.z = dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b)\n" + "{\n" + " float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " float4 ans;\n" + " ans.x = dot3F4( a, colx );\n" + " ans.y = dot3F4( a, coly );\n" + " ans.z = dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + 
"typedef float4 Quaternion;\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b);\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec);\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fastNormalize4(in);\n" + "// in /= length( in );\n" + "// return in;\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + "float4 qtInvRotate(const Quaternion q, float4 vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "#define WG_SIZE 64\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " Quaternion m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_shapeIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} Body;\n" + "typedef struct\n" + "{\n" + " Matrix3x3 m_invInertia;\n" + " Matrix3x3 m_initInvInertia;\n" + "} Shape;\n" + "typedef struct\n" + "{\n" + " float4 m_linear;\n" + " float4 m_worldPos[4];\n" + " float4 m_center; \n" + " float m_jacCoeffInv[4];\n" + " float m_b[4];\n" + " float m_appliedRambdaDt[4];\n" + " float m_fJacCoeffInv[2]; \n" + " float m_fAppliedRambdaDt[2]; \n" + " u32 m_bodyA;\n" + " u32 m_bodyB;\n" + " int m_batchIdx;\n" + " u32 m_paddings[1];\n" + "} Constraint4;\n" + "typedef struct\n" + "{\n" + " int 
m_nConstraints;\n" + " int m_start;\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBuffer;\n" + "typedef struct\n" + "{\n" + " int m_solveFriction;\n" + " int m_maxBatch; // long batch really kills the performance\n" + " int m_batchIdx;\n" + " int m_nSplit;\n" + "// int m_paddings[1];\n" + "} ConstBufferBatchSolve;\n" + " \n" + "typedef struct \n" + "{\n" + " int m_valInt0;\n" + " int m_valInt1;\n" + " int m_valInt2;\n" + " int m_valInt3;\n" + " float m_val0;\n" + " float m_val1;\n" + " float m_val2;\n" + " float m_val3;\n" + "} SolverDebugInfo;\n" + "// others\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" + "{\n" + " int nContacts = cb.x;\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < nContacts )\n" + " {\n" + " int srcIdx = sortData[gIdx].y;\n" + " out[gIdx] = in[srcIdx];\n" + " }\n" + "}\n" + "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < nContacts )\n" + " {\n" + " int2 sd;\n" + " sd.x = contactsIn[gIdx].m_childIndexB;\n" + " sd.y = gIdx;\n" + " sortDataOut[gIdx] = sd;\n" + " }\n" + "}\n" + "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < nContacts )\n" + " {\n" + " int2 sdIn;\n" + " sdIn = sortDataInOut[gIdx];\n" + " int2 sdOut;\n" + " sdOut.x = contactsIn[sdIn.y].m_childIndexA;\n" + " sdOut.y = sdIn.y;\n" + " sortDataInOut[gIdx] = sdOut;\n" + " }\n" + "}\n" + "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void 
SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < nContacts )\n" + " {\n" + " int2 sdIn;\n" + " sdIn = sortDataInOut[gIdx];\n" + " int2 sdOut;\n" + " sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;\n" + " sdOut.y = sdIn.y;\n" + " sortDataInOut[gIdx] = sdOut;\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < nContacts )\n" + " {\n" + " int2 sdIn;\n" + " sdIn = sortDataInOut[gIdx];\n" + " int2 sdOut;\n" + " sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;\n" + " sdOut.y = sdIn.y;\n" + " sortDataInOut[gIdx] = sdOut;\n" + " }\n" + "}\n" + "typedef struct\n" + "{\n" + " int m_nContacts;\n" + " int m_staticIdx;\n" + " float m_scale;\n" + " int m_nSplit;\n" + "} ConstBufferSSD;\n" + "__constant const int gridTable4x4[] = \n" + "{\n" + " 0,1,17,16,\n" + " 1,2,18,19,\n" + " 17,18,32,3,\n" + " 16,19,3,34\n" + "};\n" + "__constant const int gridTable8x8[] = \n" + "{\n" + " 0, 2, 3, 16, 17, 18, 19, 1,\n" + " 66, 64, 80, 67, 82, 81, 65, 83,\n" + " 131,144,128,130,147,129,145,146,\n" + " 208,195,194,192,193,211,210,209,\n" + " 21, 22, 23, 5, 4, 6, 7, 20,\n" + " 86, 85, 69, 87, 70, 68, 84, 71,\n" + " 151,133,149,150,135,148,132,134,\n" + " 197,27,214,213,212,199,198,196\n" + " \n" + "};\n" + "#define USE_SPATIAL_BATCHING 1\n" + "#define USE_4x4_GRID 1\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n" + "int nContacts,float scale,int4 nSplit,int staticIdx)\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " \n" + " if( gIdx < nContacts )\n" + " {\n" + " int aPtrAndSignBit = 
gContact[gIdx].m_bodyAPtrAndSignBit;\n" + " int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;\n" + " int aIdx = abs(aPtrAndSignBit );\n" + " int bIdx = abs(bPtrAndSignBit);\n" + " bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n" + " bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n" + "#if USE_SPATIAL_BATCHING \n" + " int idx = (aStatic)? bIdx: aIdx;\n" + " float4 p = gBodies[idx].m_pos;\n" + " int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);\n" + " int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);\n" + " int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);\n" + " int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);\n" + " \n" + "#else//USE_SPATIAL_BATCHING\n" + " #if USE_4x4_GRID\n" + " int aa = aIdx&3;\n" + " int bb = bIdx&3;\n" + " if (aStatic)\n" + " aa = bb;\n" + " if (bStatic)\n" + " bb = aa;\n" + " int gridIndex = aa + bb*4;\n" + " int newIndex = gridTable4x4[gridIndex];\n" + " #else//USE_4x4_GRID\n" + " int aa = aIdx&7;\n" + " int bb = bIdx&7;\n" + " if (aStatic)\n" + " aa = bb;\n" + " if (bStatic)\n" + " bb = aa;\n" + " int gridIndex = aa + bb*8;\n" + " int newIndex = gridTable8x8[gridIndex];\n" + " #endif//USE_4x4_GRID\n" + "#endif//USE_SPATIAL_BATCHING\n" + " gSortDataOut[gIdx].x = newIndex;\n" + " gSortDataOut[gIdx].y = gIdx;\n" + " }\n" + " else\n" + " {\n" + " gSortDataOut[gIdx].x = 0xffffffff;\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " if( gIdx < cb.x )\n" + " {\n" + " gOut[gIdx] = gIn[gIdx];\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.h index c0173ad9f4..f4d98d9941 100644 --- 
a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/solverUtils.h @@ -1,909 +1,908 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* solverUtilsCL= \ -"/*\n" -"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. 
This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Erwin Coumans\n" -"#ifndef B3_CONTACT4DATA_H\n" -"#define B3_CONTACT4DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = 
dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"typedef struct b3Contact4Data b3Contact4Data_t;\n" -"struct b3Contact4Data\n" -"{\n" -" b3Float4 m_worldPosB[4];\n" -"// b3Float4 m_localPosA[4];\n" -"// b3Float4 m_localPosB[4];\n" -" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" -" unsigned short m_restituitionCoeffCmp;\n" -" unsigned short m_frictionCoeffCmp;\n" -" int m_batchIdx;\n" -" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" -" int m_bodyBPtrAndSignBit;\n" -" int m_childIndexA;\n" -" int m_childIndexB;\n" -" int m_unused1;\n" -" int m_unused2;\n" -"};\n" -"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" -"{\n" -" return (int)contact->m_worldNormalOnB.w;\n" -"};\n" -"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" -"{\n" -" contact->m_worldNormalOnB.w = (float)numPoints;\n" -"};\n" -"#endif //B3_CONTACT4DATA_H\n" -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"#ifdef cl_ext_atomic_counters_32\n" -"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" -"#else\n" -"#define counter32_t volatile global int*\n" -"#endif\n" -"typedef unsigned int u32;\n" -"typedef unsigned short u16;\n" -"typedef unsigned char u8;\n" -"#define GET_GROUP_IDX get_group_id(0)\n" -"#define GET_LOCAL_IDX get_local_id(0)\n" -"#define GET_GLOBAL_IDX get_global_id(0)\n" -"#define GET_GROUP_SIZE get_local_size(0)\n" -"#define GET_NUM_GROUPS get_num_groups(0)\n" -"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" -"#define GROUP_MEM_FENCE 
mem_fence(CLK_LOCAL_MEM_FENCE)\n" -"#define AtomInc(x) atom_inc(&(x))\n" -"#define AtomInc1(x, out) out = atom_inc(&(x))\n" -"#define AppendInc(x, out) out = atomic_inc(x)\n" -"#define AtomAdd(x, value) atom_add(&(x), value)\n" -"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" -"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" -"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" -"#define make_float4 (float4)\n" -"#define make_float2 (float2)\n" -"#define make_uint4 (uint4)\n" -"#define make_int4 (int4)\n" -"#define make_uint2 (uint2)\n" -"#define make_int2 (int2)\n" -"#define max2 max\n" -"#define min2 min\n" -"///////////////////////////////////////\n" -"// Vector\n" -"///////////////////////////////////////\n" -"__inline\n" -"float fastDiv(float numerator, float denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"// return numerator/denominator; \n" -"}\n" -"__inline\n" -"float4 fastDiv4(float4 numerator, float4 denominator)\n" -"{\n" -" return native_divide(numerator, denominator); \n" -"}\n" -"__inline\n" -"float fastSqrtf(float f2)\n" -"{\n" -" return native_sqrt(f2);\n" -"// return sqrt(f2);\n" -"}\n" -"__inline\n" -"float fastRSqrt(float f2)\n" -"{\n" -" return native_rsqrt(f2);\n" -"}\n" -"__inline\n" -"float fastLength4(float4 v)\n" -"{\n" -" return fast_length(v);\n" -"}\n" -"__inline\n" -"float4 fastNormalize4(float4 v)\n" -"{\n" -" return fast_normalize(v);\n" -"}\n" -"__inline\n" -"float sqrtf(float a)\n" -"{\n" -"// return sqrt(a);\n" -" return native_sqrt(a);\n" -"}\n" -"__inline\n" -"float4 cross3(float4 a1, float4 b1)\n" -"{\n" -" float4 a=make_float4(a1.xyz,0.f);\n" -" float4 b=make_float4(b1.xyz,0.f);\n" -" //float4 a=a1;\n" -" //float4 b=b1;\n" -" return cross(a,b);\n" -"}\n" -"__inline\n" -"float dot3F4(float4 a, float4 b)\n" -"{\n" -" float4 a1 = make_float4(a.xyz,0.f);\n" -" float4 b1 = make_float4(b.xyz,0.f);\n" -" return dot(a1, b1);\n" -"}\n" -"__inline\n" -"float 
length3(const float4 a)\n" -"{\n" -" return sqrtf(dot3F4(a,a));\n" -"}\n" -"__inline\n" -"float dot4(const float4 a, const float4 b)\n" -"{\n" -" return dot( a, b );\n" -"}\n" -"// for height\n" -"__inline\n" -"float dot3w1(const float4 point, const float4 eqn)\n" -"{\n" -" return dot3F4(point,eqn) + eqn.w;\n" -"}\n" -"__inline\n" -"float4 normalize3(const float4 a)\n" -"{\n" -" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" -" return fastNormalize4( n );\n" -"// float length = sqrtf(dot3F4(a, a));\n" -"// return 1.f/length * a;\n" -"}\n" -"__inline\n" -"float4 normalize4(const float4 a)\n" -"{\n" -" float length = sqrtf(dot4(a, a));\n" -" return 1.f/length * a;\n" -"}\n" -"__inline\n" -"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" -"{\n" -" float4 eqn;\n" -" float4 ab = b-a;\n" -" float4 ac = c-a;\n" -" eqn = normalize3( cross3(ab, ac) );\n" -" eqn.w = -dot3F4(eqn,a);\n" -" return eqn;\n" -"}\n" -"///////////////////////////////////////\n" -"// Matrix3x3\n" -"///////////////////////////////////////\n" -"typedef struct\n" -"{\n" -" float4 m_row[3];\n" -"}Matrix3x3;\n" -"__inline\n" -"Matrix3x3 mtZero();\n" -"__inline\n" -"Matrix3x3 mtIdentity();\n" -"__inline\n" -"Matrix3x3 mtTranspose(Matrix3x3 m);\n" -"__inline\n" -"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b);\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b);\n" -"__inline\n" -"Matrix3x3 mtZero()\n" -"{\n" -" Matrix3x3 m;\n" -" m.m_row[0] = (float4)(0.f);\n" -" m.m_row[1] = (float4)(0.f);\n" -" m.m_row[2] = (float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"Matrix3x3 mtIdentity()\n" -"{\n" -" Matrix3x3 m;\n" -" m.m_row[0] = (float4)(1,0,0,0);\n" -" m.m_row[1] = (float4)(0,1,0,0);\n" -" m.m_row[2] = (float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"Matrix3x3 mtTranspose(Matrix3x3 m)\n" -"{\n" -" Matrix3x3 out;\n" -" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" out.m_row[1] = 
(float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" -"{\n" -" Matrix3x3 transB;\n" -" transB = mtTranspose( b );\n" -" Matrix3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"float4 mtMul1(Matrix3x3 a, float4 b)\n" -"{\n" -" float4 ans;\n" -" ans.x = dot3F4( a.m_row[0], b );\n" -" ans.y = dot3F4( a.m_row[1], b );\n" -" ans.z = dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"float4 mtMul3(float4 a, Matrix3x3 b)\n" -"{\n" -" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" float4 ans;\n" -" ans.x = dot3F4( a, colx );\n" -" ans.y = dot3F4( a, coly );\n" -" ans.z = dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"///////////////////////////////////////\n" -"// Quaternion\n" -"///////////////////////////////////////\n" -"typedef float4 Quaternion;\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b);\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in);\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec);\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q);\n" -"__inline\n" -"Quaternion qtMul(Quaternion a, Quaternion b)\n" -"{\n" -" Quaternion ans;\n" -" ans = cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - 
(a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"__inline\n" -"Quaternion qtNormalize(Quaternion in)\n" -"{\n" -" return fastNormalize4(in);\n" -"// in /= length( in );\n" -"// return in;\n" -"}\n" -"__inline\n" -"float4 qtRotate(Quaternion q, float4 vec)\n" -"{\n" -" Quaternion qInv = qtInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"__inline\n" -"Quaternion qtInvert(Quaternion q)\n" -"{\n" -" return (Quaternion)(-q.xyz, q.w);\n" -"}\n" -"__inline\n" -"float4 qtInvRotate(const Quaternion q, float4 vec)\n" -"{\n" -" return qtRotate( qtInvert( q ), vec );\n" -"}\n" -"#define WG_SIZE 64\n" -"typedef struct\n" -"{\n" -" float4 m_pos;\n" -" Quaternion m_quat;\n" -" float4 m_linVel;\n" -" float4 m_angVel;\n" -" u32 m_shapeIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"} Body;\n" -"typedef struct\n" -"{\n" -" Matrix3x3 m_invInertia;\n" -" Matrix3x3 m_initInvInertia;\n" -"} Shape;\n" -"typedef struct\n" -"{\n" -" float4 m_linear;\n" -" float4 m_worldPos[4];\n" -" float4 m_center; \n" -" float m_jacCoeffInv[4];\n" -" float m_b[4];\n" -" float m_appliedRambdaDt[4];\n" -" float m_fJacCoeffInv[2]; \n" -" float m_fAppliedRambdaDt[2]; \n" -" u32 m_bodyA;\n" -" u32 m_bodyB;\n" -" int m_batchIdx;\n" -" u32 m_paddings;\n" -"} Constraint4;\n" -"__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n" -"{\n" -" int i = GET_GLOBAL_IDX;\n" -" \n" -" if( i < numContactManifolds)\n" -" {\n" -" int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;\n" -" bool isFixedA = (pa <0) || (pa == fixedBodyIndex);\n" -" int bodyIndexA = abs(pa);\n" -" if (!isFixedA)\n" -" {\n" -" AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x);\n" -" }\n" -" barrier(CLK_GLOBAL_MEM_FENCE);\n" 
-" int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;\n" -" bool isFixedB = (pb <0) || (pb == fixedBodyIndex);\n" -" int bodyIndexB = abs(pb);\n" -" if (!isFixedB)\n" -" {\n" -" AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y);\n" -" } \n" -" }\n" -"}\n" -"__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n" -"{\n" -" int i = GET_GLOBAL_IDX;\n" -" \n" -" if( i < numSplitBodies)\n" -" {\n" -" linearVelocities[i] = make_float4(0);\n" -" angularVelocities[i] = make_float4(0);\n" -" }\n" -"}\n" -"__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" -"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" -"{\n" -" int i = GET_GLOBAL_IDX;\n" -" if (i<numBodies)\n" -" {\n" -" if (gBodies[i].m_invMass)\n" -" {\n" -" int bodyOffset = offsetSplitBodies[i];\n" -" int count = bodyCount[i];\n" -" float factor = 1.f/((float)count);\n" -" float4 averageLinVel = make_float4(0.f);\n" -" float4 averageAngVel = make_float4(0.f);\n" -" \n" -" for (int j=0;j<count;j++)\n" -" {\n" -" averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;\n" -" averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;\n" -" }\n" -" \n" -" for (int j=0;j<count;j++)\n" -" {\n" -" deltaLinearVelocities[bodyOffset+j] = averageLinVel;\n" -" deltaAngularVelocities[bodyOffset+j] = averageAngVel;\n" -" }\n" -" \n" -" }//bodies[i].m_invMass\n" -" }//i<numBodies\n" -"}\n" -"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" -"{\n" -" *linear = make_float4(n.xyz,0.f);\n" -" *angular0 = cross3(r0, n);\n" -" *angular1 = -cross3(r1, n);\n" -"}\n" -"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" -"{\n" -" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + 
dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" -"}\n" -"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" -" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n" -"{\n" -" // linear0,1 are normlized\n" -" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" -" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" -" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" -" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" -" return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n" -"}\n" -"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n" -" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n" -"{\n" -" if (fabs(n.z) > 0.70710678f) {\n" -" // choose p in y-z plane\n" -" float a = n.y*n.y + n.z*n.z;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = 0;\n" -" p[0].y = -n.z*k;\n" -" p[0].z = n.y*k;\n" -" // set q = n x p\n" -" q[0].x = a*k;\n" -" q[0].y = -n.x*p[0].z;\n" -" q[0].z = n.x*p[0].y;\n" -" }\n" -" else {\n" -" // choose p in x-y plane\n" -" float a = n.x*n.x + n.y*n.y;\n" -" float k = 1.f/sqrt(a);\n" -" p[0].x = -n.y*k;\n" -" p[0].y = n.x*k;\n" -" p[0].z = 0;\n" -" // set q = n x p\n" -" q[0].x = -n.z*p[0].y;\n" -" q[0].y = n.z*p[0].x;\n" -" q[0].z = a*k;\n" -" }\n" -"}\n" -"void solveContact(__global Constraint4* cs,\n" -" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" -" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n" -" float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)\n" -"{\n" -" float minRambdaDt = 0;\n" -" float maxRambdaDt = FLT_MAX;\n" -" for(int ic=0; ic<4; ic++)\n" -" {\n" -" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" -" float4 angular0, angular1, linear;\n" -" float4 r0 = cs->m_worldPos[ic] - posA;\n" -" float4 r1 = cs->m_worldPos[ic] - posB;\n" -" 
setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" -" \n" -" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" -" *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n" -" rambdaDt *= cs->m_jacCoeffInv[ic];\n" -" \n" -" {\n" -" float prevSum = cs->m_appliedRambdaDt[ic];\n" -" float updated = prevSum;\n" -" updated += rambdaDt;\n" -" updated = max2( updated, minRambdaDt );\n" -" updated = min2( updated, maxRambdaDt );\n" -" rambdaDt = updated - prevSum;\n" -" cs->m_appliedRambdaDt[ic] = updated;\n" -" }\n" -" \n" -" float4 linImp0 = invMassA*linear*rambdaDt;\n" -" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" -" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" -" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" -" \n" -" if (invMassA)\n" -" {\n" -" *dLinVelA += linImp0;\n" -" *dAngVelA += angImp0;\n" -" }\n" -" if (invMassB)\n" -" {\n" -" *dLinVelB += linImp1;\n" -" *dAngVelB += angImp1;\n" -" }\n" -" }\n" -"}\n" -"// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" -"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n" -"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" -"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" -"{\n" -" //float frictionCoeff = ldsCs[0].m_linear.w;\n" -" int aIdx = ldsCs[0].m_bodyA;\n" -" int bIdx = ldsCs[0].m_bodyB;\n" -" float4 posA = gBodies[aIdx].m_pos;\n" -" float4 linVelA = gBodies[aIdx].m_linVel;\n" -" float4 angVelA = gBodies[aIdx].m_angVel;\n" -" float invMassA = gBodies[aIdx].m_invMass;\n" -" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" -" float4 posB = gBodies[bIdx].m_pos;\n" -" float4 linVelB = gBodies[bIdx].m_linVel;\n" -" float4 angVelB = 
gBodies[bIdx].m_angVel;\n" -" float invMassB = gBodies[bIdx].m_invMass;\n" -" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" -" \n" -" float4 dLinVelA = make_float4(0,0,0,0);\n" -" float4 dAngVelA = make_float4(0,0,0,0);\n" -" float4 dLinVelB = make_float4(0,0,0,0);\n" -" float4 dAngVelB = make_float4(0,0,0,0);\n" -" \n" -" int bodyOffsetA = offsetSplitBodies[aIdx];\n" -" int constraintOffsetA = contactConstraintOffsets[0].x;\n" -" int splitIndexA = bodyOffsetA+constraintOffsetA;\n" -" \n" -" if (invMassA)\n" -" {\n" -" dLinVelA = deltaLinearVelocities[splitIndexA];\n" -" dAngVelA = deltaAngularVelocities[splitIndexA];\n" -" }\n" -" int bodyOffsetB = offsetSplitBodies[bIdx];\n" -" int constraintOffsetB = contactConstraintOffsets[0].y;\n" -" int splitIndexB= bodyOffsetB+constraintOffsetB;\n" -" if (invMassB)\n" -" {\n" -" dLinVelB = deltaLinearVelocities[splitIndexB];\n" -" dAngVelB = deltaAngularVelocities[splitIndexB];\n" -" }\n" -" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" -" posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n" -" if (invMassA)\n" -" {\n" -" deltaLinearVelocities[splitIndexA] = dLinVelA;\n" -" deltaAngularVelocities[splitIndexA] = dAngVelA;\n" -" } \n" -" if (invMassB)\n" -" {\n" -" deltaLinearVelocities[splitIndexB] = dLinVelB;\n" -" deltaAngularVelocities[splitIndexB] = dAngVelB;\n" -" }\n" -"}\n" -"__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" -"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" -"float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" -")\n" -"{\n" -" int i = GET_GLOBAL_IDX;\n" -" if (i<numManifolds)\n" -" {\n" -" solveContactConstraint( gBodies, gShapes, &gConstraints[i] 
,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" -" }\n" -"}\n" -"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n" -" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" -" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" -"{\n" -" float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n" -" int aIdx = ldsCs[0].m_bodyA;\n" -" int bIdx = ldsCs[0].m_bodyB;\n" -" float4 posA = gBodies[aIdx].m_pos;\n" -" float4 linVelA = gBodies[aIdx].m_linVel;\n" -" float4 angVelA = gBodies[aIdx].m_angVel;\n" -" float invMassA = gBodies[aIdx].m_invMass;\n" -" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" -" float4 posB = gBodies[bIdx].m_pos;\n" -" float4 linVelB = gBodies[bIdx].m_linVel;\n" -" float4 angVelB = gBodies[bIdx].m_angVel;\n" -" float invMassB = gBodies[bIdx].m_invMass;\n" -" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" -" \n" -" float4 dLinVelA = make_float4(0,0,0,0);\n" -" float4 dAngVelA = make_float4(0,0,0,0);\n" -" float4 dLinVelB = make_float4(0,0,0,0);\n" -" float4 dAngVelB = make_float4(0,0,0,0);\n" -" \n" -" int bodyOffsetA = offsetSplitBodies[aIdx];\n" -" int constraintOffsetA = contactConstraintOffsets[0].x;\n" -" int splitIndexA = bodyOffsetA+constraintOffsetA;\n" -" \n" -" if (invMassA)\n" -" {\n" -" dLinVelA = deltaLinearVelocities[splitIndexA];\n" -" dAngVelA = deltaAngularVelocities[splitIndexA];\n" -" }\n" -" int bodyOffsetB = offsetSplitBodies[bIdx];\n" -" int constraintOffsetB = contactConstraintOffsets[0].y;\n" -" int splitIndexB= bodyOffsetB+constraintOffsetB;\n" -" if (invMassB)\n" -" {\n" -" dLinVelB = deltaLinearVelocities[splitIndexB];\n" -" dAngVelB = deltaAngularVelocities[splitIndexB];\n" -" }\n" -" {\n" -" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" -" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" -" float sum = 0;\n" -" for(int j=0; j<4; 
j++)\n" -" {\n" -" sum +=ldsCs[0].m_appliedRambdaDt[j];\n" -" }\n" -" frictionCoeff = 0.7f;\n" -" for(int j=0; j<4; j++)\n" -" {\n" -" maxRambdaDt[j] = frictionCoeff*sum;\n" -" minRambdaDt[j] = -maxRambdaDt[j];\n" -" }\n" -" \n" -"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" -"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" -" \n" -" \n" -" {\n" -" \n" -" __global Constraint4* cs = ldsCs;\n" -" \n" -" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" -" const float4 center = cs->m_center;\n" -" \n" -" float4 n = -cs->m_linear;\n" -" \n" -" float4 tangent[2];\n" -" btPlaneSpace1(n,&tangent[0],&tangent[1]);\n" -" float4 angular0, angular1, linear;\n" -" float4 r0 = center - posA;\n" -" float4 r1 = center - posB;\n" -" for(int i=0; i<2; i++)\n" -" {\n" -" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" -" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" -" linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );\n" -" rambdaDt *= cs->m_fJacCoeffInv[i];\n" -" \n" -" {\n" -" float prevSum = cs->m_fAppliedRambdaDt[i];\n" -" float updated = prevSum;\n" -" updated += rambdaDt;\n" -" updated = max2( updated, minRambdaDt[i] );\n" -" updated = min2( updated, maxRambdaDt[i] );\n" -" rambdaDt = updated - prevSum;\n" -" cs->m_fAppliedRambdaDt[i] = updated;\n" -" }\n" -" \n" -" float4 linImp0 = invMassA*linear*rambdaDt;\n" -" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" -" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" -" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" -" \n" -" dLinVelA += linImp0;\n" -" dAngVelA += angImp0;\n" -" dLinVelB += linImp1;\n" -" dAngVelB += angImp1;\n" -" }\n" -" { // angular damping for point constraint\n" -" float4 ab = normalize3( posB - posA );\n" -" float4 ac = normalize3( center - posA );\n" -" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 
0.f))\n" -" {\n" -" float angNA = dot3F4( n, angVelA );\n" -" float angNB = dot3F4( n, angVelB );\n" -" \n" -" dAngVelA -= (angNA*0.1f)*n;\n" -" dAngVelB -= (angNB*0.1f)*n;\n" -" }\n" -" }\n" -" }\n" -" \n" -" \n" -" }\n" -" if (invMassA)\n" -" {\n" -" deltaLinearVelocities[splitIndexA] = dLinVelA;\n" -" deltaAngularVelocities[splitIndexA] = dAngVelA;\n" -" } \n" -" if (invMassB)\n" -" {\n" -" deltaLinearVelocities[splitIndexB] = dLinVelB;\n" -" deltaAngularVelocities[splitIndexB] = dAngVelB;\n" -" }\n" -" \n" -"}\n" -"__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" -" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" -" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" -" float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" -")\n" -"{\n" -" int i = GET_GLOBAL_IDX;\n" -" if (i<numManifolds)\n" -" {\n" -" solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" -" }\n" -"}\n" -"__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" -" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" -"{\n" -" int i = GET_GLOBAL_IDX;\n" -" if (i<numBodies)\n" -" {\n" -" if (gBodies[i].m_invMass)\n" -" {\n" -" int bodyOffset = offsetSplitBodies[i];\n" -" int count = bodyCount[i];\n" -" if (count)\n" -" {\n" -" gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset];\n" -" gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset];\n" -" }\n" -" }\n" -" }\n" -"}\n" -"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n" -" const float4 posB, const float4 linVelB, const 
float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n" -" __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n" -" Constraint4* dstC )\n" -"{\n" -" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" -" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" -" float dtInv = 1.f/dt;\n" -" for(int ic=0; ic<4; ic++)\n" -" {\n" -" dstC->m_appliedRambdaDt[ic] = 0.f;\n" -" }\n" -" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" -" dstC->m_linear = src->m_worldNormalOnB;\n" -" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" -" for(int ic=0; ic<4; ic++)\n" -" {\n" -" float4 r0 = src->m_worldPosB[ic] - posA;\n" -" float4 r1 = src->m_worldPosB[ic] - posB;\n" -" if( ic >= src->m_worldNormalOnB.w )//npoints\n" -" {\n" -" dstC->m_jacCoeffInv[ic] = 0.f;\n" -" continue;\n" -" }\n" -" float relVelN;\n" -" {\n" -" float4 linear, angular0, angular1;\n" -" setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" -" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" -" invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n" -" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" -" linVelA, angVelA, linVelB, angVelB);\n" -" float e = 0.f;//src->getRestituitionCoeff();\n" -" if( relVelN*relVelN < 0.004f ) e = 0.f;\n" -" dstC->m_b[ic] = e*relVelN;\n" -" //float penetration = src->m_worldPosB[ic].w;\n" -" dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" -" dstC->m_appliedRambdaDt[ic] = 0.f;\n" -" }\n" -" }\n" -" if( src->m_worldNormalOnB.w > 0 )//npoints\n" -" { // prepare friction\n" -" float4 center = make_float4(0.f);\n" -" for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" -" center += src->m_worldPosB[i];\n" -" center /= (float)src->m_worldNormalOnB.w;\n" -" float4 tangent[2];\n" -" btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" -" \n" -" float4 
r[2];\n" -" r[0] = center - posA;\n" -" r[1] = center - posB;\n" -" for(int i=0; i<2; i++)\n" -" {\n" -" float4 linear, angular0, angular1;\n" -" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" -" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" -" invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n" -" dstC->m_fAppliedRambdaDt[i] = 0.f;\n" -" }\n" -" dstC->m_center = center;\n" -" }\n" -" for(int i=0; i<4; i++)\n" -" {\n" -" if( i<src->m_worldNormalOnB.w )\n" -" {\n" -" dstC->m_worldPos[i] = src->m_worldPosB[i];\n" -" }\n" -" else\n" -" {\n" -" dstC->m_worldPos[i] = make_float4(0.f);\n" -" }\n" -" }\n" -"}\n" -"__kernel\n" -"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" -"void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n" -"__global const unsigned int* bodyCount,\n" -"int nContacts,\n" -"float dt,\n" -"float positionDrift,\n" -"float positionConstraintCoeff\n" -")\n" -"{\n" -" int gIdx = GET_GLOBAL_IDX;\n" -" \n" -" if( gIdx < nContacts )\n" -" {\n" -" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" -" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" -" float4 posA = gBodies[aIdx].m_pos;\n" -" float4 linVelA = gBodies[aIdx].m_linVel;\n" -" float4 angVelA = gBodies[aIdx].m_angVel;\n" -" float invMassA = gBodies[aIdx].m_invMass;\n" -" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" -" float4 posB = gBodies[bIdx].m_pos;\n" -" float4 linVelB = gBodies[bIdx].m_linVel;\n" -" float4 angVelB = gBodies[bIdx].m_angVel;\n" -" float invMassB = gBodies[bIdx].m_invMass;\n" -" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" -" Constraint4 cs;\n" -" float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1;\n" -" float countB = invMassB != 0.f ? 
(float)bodyCount[bIdx] : 1;\n" -" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" -" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n" -" &cs );\n" -" \n" -" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" -" gConstraintOut[gIdx] = cs;\n" -" }\n" -"}\n" -; +static const char* solverUtilsCL = + "/*\n" + "Copyright (c) 2013 Advanced Micro Devices, Inc. \n" + "This software is provided 'as-is', without any express or implied warranty.\n" + "In no event will the authors be held liable for any damages arising from the use of this software.\n" + "Permission is granted to anyone to use this software for any purpose, \n" + "including commercial applications, and to alter it and redistribute it freely, \n" + "subject to the following restrictions:\n" + "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" + "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" + "3. 
This notice may not be removed or altered from any source distribution.\n" + "*/\n" + "//Originally written by Erwin Coumans\n" + "#ifndef B3_CONTACT4DATA_H\n" + "#define B3_CONTACT4DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = 
b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "typedef struct b3Contact4Data b3Contact4Data_t;\n" + "struct b3Contact4Data\n" + "{\n" + " b3Float4 m_worldPosB[4];\n" + "// b3Float4 m_localPosA[4];\n" + "// b3Float4 m_localPosB[4];\n" + " b3Float4 m_worldNormalOnB; // w: m_nPoints\n" + " unsigned short m_restituitionCoeffCmp;\n" + " unsigned short m_frictionCoeffCmp;\n" + " int m_batchIdx;\n" + " int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" + " int m_bodyBPtrAndSignBit;\n" + " int m_childIndexA;\n" + " int m_childIndexB;\n" + " int m_unused1;\n" + " int m_unused2;\n" + "};\n" + "inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" + "{\n" + " return (int)contact->m_worldNormalOnB.w;\n" + "};\n" + "inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" + "{\n" + " contact->m_worldNormalOnB.w = (float)numPoints;\n" + "};\n" + "#endif //B3_CONTACT4DATA_H\n" + "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "#ifdef cl_ext_atomic_counters_32\n" + "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" + "#else\n" + "#define counter32_t volatile global int*\n" + "#endif\n" + "typedef unsigned int u32;\n" + "typedef unsigned short u16;\n" + "typedef unsigned char u8;\n" + "#define GET_GROUP_IDX get_group_id(0)\n" + "#define GET_LOCAL_IDX get_local_id(0)\n" + "#define GET_GLOBAL_IDX get_global_id(0)\n" + "#define GET_GROUP_SIZE 
get_local_size(0)\n" + "#define GET_NUM_GROUPS get_num_groups(0)\n" + "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" + "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" + "#define AtomInc(x) atom_inc(&(x))\n" + "#define AtomInc1(x, out) out = atom_inc(&(x))\n" + "#define AppendInc(x, out) out = atomic_inc(x)\n" + "#define AtomAdd(x, value) atom_add(&(x), value)\n" + "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" + "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" + "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" + "#define make_float4 (float4)\n" + "#define make_float2 (float2)\n" + "#define make_uint4 (uint4)\n" + "#define make_int4 (int4)\n" + "#define make_uint2 (uint2)\n" + "#define make_int2 (int2)\n" + "#define max2 max\n" + "#define min2 min\n" + "///////////////////////////////////////\n" + "// Vector\n" + "///////////////////////////////////////\n" + "__inline\n" + "float fastDiv(float numerator, float denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "// return numerator/denominator; \n" + "}\n" + "__inline\n" + "float4 fastDiv4(float4 numerator, float4 denominator)\n" + "{\n" + " return native_divide(numerator, denominator); \n" + "}\n" + "__inline\n" + "float fastSqrtf(float f2)\n" + "{\n" + " return native_sqrt(f2);\n" + "// return sqrt(f2);\n" + "}\n" + "__inline\n" + "float fastRSqrt(float f2)\n" + "{\n" + " return native_rsqrt(f2);\n" + "}\n" + "__inline\n" + "float fastLength4(float4 v)\n" + "{\n" + " return fast_length(v);\n" + "}\n" + "__inline\n" + "float4 fastNormalize4(float4 v)\n" + "{\n" + " return fast_normalize(v);\n" + "}\n" + "__inline\n" + "float sqrtf(float a)\n" + "{\n" + "// return sqrt(a);\n" + " return native_sqrt(a);\n" + "}\n" + "__inline\n" + "float4 cross3(float4 a1, float4 b1)\n" + "{\n" + " float4 a=make_float4(a1.xyz,0.f);\n" + " float4 b=make_float4(b1.xyz,0.f);\n" + " //float4 a=a1;\n" + " //float4 b=b1;\n" + " return 
cross(a,b);\n" + "}\n" + "__inline\n" + "float dot3F4(float4 a, float4 b)\n" + "{\n" + " float4 a1 = make_float4(a.xyz,0.f);\n" + " float4 b1 = make_float4(b.xyz,0.f);\n" + " return dot(a1, b1);\n" + "}\n" + "__inline\n" + "float length3(const float4 a)\n" + "{\n" + " return sqrtf(dot3F4(a,a));\n" + "}\n" + "__inline\n" + "float dot4(const float4 a, const float4 b)\n" + "{\n" + " return dot( a, b );\n" + "}\n" + "// for height\n" + "__inline\n" + "float dot3w1(const float4 point, const float4 eqn)\n" + "{\n" + " return dot3F4(point,eqn) + eqn.w;\n" + "}\n" + "__inline\n" + "float4 normalize3(const float4 a)\n" + "{\n" + " float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" + " return fastNormalize4( n );\n" + "// float length = sqrtf(dot3F4(a, a));\n" + "// return 1.f/length * a;\n" + "}\n" + "__inline\n" + "float4 normalize4(const float4 a)\n" + "{\n" + " float length = sqrtf(dot4(a, a));\n" + " return 1.f/length * a;\n" + "}\n" + "__inline\n" + "float4 createEquation(const float4 a, const float4 b, const float4 c)\n" + "{\n" + " float4 eqn;\n" + " float4 ab = b-a;\n" + " float4 ac = c-a;\n" + " eqn = normalize3( cross3(ab, ac) );\n" + " eqn.w = -dot3F4(eqn,a);\n" + " return eqn;\n" + "}\n" + "///////////////////////////////////////\n" + "// Matrix3x3\n" + "///////////////////////////////////////\n" + "typedef struct\n" + "{\n" + " float4 m_row[3];\n" + "}Matrix3x3;\n" + "__inline\n" + "Matrix3x3 mtZero();\n" + "__inline\n" + "Matrix3x3 mtIdentity();\n" + "__inline\n" + "Matrix3x3 mtTranspose(Matrix3x3 m);\n" + "__inline\n" + "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b);\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b);\n" + "__inline\n" + "Matrix3x3 mtZero()\n" + "{\n" + " Matrix3x3 m;\n" + " m.m_row[0] = (float4)(0.f);\n" + " m.m_row[1] = (float4)(0.f);\n" + " m.m_row[2] = (float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + "Matrix3x3 mtIdentity()\n" + "{\n" + " Matrix3x3 m;\n" + " m.m_row[0] = 
(float4)(1,0,0,0);\n" + " m.m_row[1] = (float4)(0,1,0,0);\n" + " m.m_row[2] = (float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "Matrix3x3 mtTranspose(Matrix3x3 m)\n" + "{\n" + " Matrix3x3 out;\n" + " out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" + "{\n" + " Matrix3x3 transB;\n" + " transB = mtTranspose( b );\n" + " Matrix3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "float4 mtMul1(Matrix3x3 a, float4 b)\n" + "{\n" + " float4 ans;\n" + " ans.x = dot3F4( a.m_row[0], b );\n" + " ans.y = dot3F4( a.m_row[1], b );\n" + " ans.z = dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "float4 mtMul3(float4 a, Matrix3x3 b)\n" + "{\n" + " float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " float4 ans;\n" + " ans.x = dot3F4( a, colx );\n" + " ans.y = dot3F4( a, coly );\n" + " ans.z = dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "///////////////////////////////////////\n" + "// Quaternion\n" + "///////////////////////////////////////\n" + "typedef float4 Quaternion;\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b);\n" + 
"__inline\n" + "Quaternion qtNormalize(Quaternion in);\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec);\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q);\n" + "__inline\n" + "Quaternion qtMul(Quaternion a, Quaternion b)\n" + "{\n" + " Quaternion ans;\n" + " ans = cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + " ans.w = a.w*b.w - dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "__inline\n" + "Quaternion qtNormalize(Quaternion in)\n" + "{\n" + " return fastNormalize4(in);\n" + "// in /= length( in );\n" + "// return in;\n" + "}\n" + "__inline\n" + "float4 qtRotate(Quaternion q, float4 vec)\n" + "{\n" + " Quaternion qInv = qtInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = qtMul(qtMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "__inline\n" + "Quaternion qtInvert(Quaternion q)\n" + "{\n" + " return (Quaternion)(-q.xyz, q.w);\n" + "}\n" + "__inline\n" + "float4 qtInvRotate(const Quaternion q, float4 vec)\n" + "{\n" + " return qtRotate( qtInvert( q ), vec );\n" + "}\n" + "#define WG_SIZE 64\n" + "typedef struct\n" + "{\n" + " float4 m_pos;\n" + " Quaternion m_quat;\n" + " float4 m_linVel;\n" + " float4 m_angVel;\n" + " u32 m_shapeIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "} Body;\n" + "typedef struct\n" + "{\n" + " Matrix3x3 m_invInertia;\n" + " Matrix3x3 m_initInvInertia;\n" + "} Shape;\n" + "typedef struct\n" + "{\n" + " float4 m_linear;\n" + " float4 m_worldPos[4];\n" + " float4 m_center; \n" + " float m_jacCoeffInv[4];\n" + " float m_b[4];\n" + " float m_appliedRambdaDt[4];\n" + " float m_fJacCoeffInv[2]; \n" + " float m_fAppliedRambdaDt[2]; \n" + " u32 m_bodyA;\n" + " u32 m_bodyB;\n" + " int m_batchIdx;\n" + " u32 m_paddings;\n" + "} Constraint4;\n" + "__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, 
int numContactManifolds, int fixedBodyIndex)\n" + "{\n" + " int i = GET_GLOBAL_IDX;\n" + " \n" + " if( i < numContactManifolds)\n" + " {\n" + " int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;\n" + " bool isFixedA = (pa <0) || (pa == fixedBodyIndex);\n" + " int bodyIndexA = abs(pa);\n" + " if (!isFixedA)\n" + " {\n" + " AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x);\n" + " }\n" + " barrier(CLK_GLOBAL_MEM_FENCE);\n" + " int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;\n" + " bool isFixedB = (pb <0) || (pb == fixedBodyIndex);\n" + " int bodyIndexB = abs(pb);\n" + " if (!isFixedB)\n" + " {\n" + " AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y);\n" + " } \n" + " }\n" + "}\n" + "__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n" + "{\n" + " int i = GET_GLOBAL_IDX;\n" + " \n" + " if( i < numSplitBodies)\n" + " {\n" + " linearVelocities[i] = make_float4(0);\n" + " angularVelocities[i] = make_float4(0);\n" + " }\n" + "}\n" + "__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" + "__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" + "{\n" + " int i = GET_GLOBAL_IDX;\n" + " if (i<numBodies)\n" + " {\n" + " if (gBodies[i].m_invMass)\n" + " {\n" + " int bodyOffset = offsetSplitBodies[i];\n" + " int count = bodyCount[i];\n" + " float factor = 1.f/((float)count);\n" + " float4 averageLinVel = make_float4(0.f);\n" + " float4 averageAngVel = make_float4(0.f);\n" + " \n" + " for (int j=0;j<count;j++)\n" + " {\n" + " averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;\n" + " averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;\n" + " }\n" + " \n" + " for (int j=0;j<count;j++)\n" + " {\n" + " deltaLinearVelocities[bodyOffset+j] = averageLinVel;\n" + " deltaAngularVelocities[bodyOffset+j] = averageAngVel;\n" + " }\n" + " \n" + " 
}//bodies[i].m_invMass\n" + " }//i<numBodies\n" + "}\n" + "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" + "{\n" + " *linear = make_float4(n.xyz,0.f);\n" + " *angular0 = cross3(r0, n);\n" + " *angular1 = -cross3(r1, n);\n" + "}\n" + "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" + "{\n" + " return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" + "}\n" + "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" + " float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n" + "{\n" + " // linear0,1 are normlized\n" + " float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" + " float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" + " float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" + " float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" + " return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n" + "}\n" + "void btPlaneSpace1 (float4 n, float4* p, float4* q);\n" + " void btPlaneSpace1 (float4 n, float4* p, float4* q)\n" + "{\n" + " if (fabs(n.z) > 0.70710678f) {\n" + " // choose p in y-z plane\n" + " float a = n.y*n.y + n.z*n.z;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = 0;\n" + " p[0].y = -n.z*k;\n" + " p[0].z = n.y*k;\n" + " // set q = n x p\n" + " q[0].x = a*k;\n" + " q[0].y = -n.x*p[0].z;\n" + " q[0].z = n.x*p[0].y;\n" + " }\n" + " else {\n" + " // choose p in x-y plane\n" + " float a = n.x*n.x + n.y*n.y;\n" + " float k = 1.f/sqrt(a);\n" + " p[0].x = -n.y*k;\n" + " p[0].y = n.x*k;\n" + " p[0].z = 0;\n" + " // set q = n x p\n" + " q[0].x = -n.z*p[0].y;\n" + " q[0].y = n.z*p[0].x;\n" + " q[0].z = a*k;\n" + " }\n" + "}\n" + "void solveContact(__global Constraint4* cs,\n" + " float4 posA, float4* linVelA, float4* 
angVelA, float invMassA, Matrix3x3 invInertiaA,\n" + " float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n" + " float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)\n" + "{\n" + " float minRambdaDt = 0;\n" + " float maxRambdaDt = FLT_MAX;\n" + " for(int ic=0; ic<4; ic++)\n" + " {\n" + " if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" + " float4 angular0, angular1, linear;\n" + " float4 r0 = cs->m_worldPos[ic] - posA;\n" + " float4 r1 = cs->m_worldPos[ic] - posB;\n" + " setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" + " \n" + " float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" + " *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n" + " rambdaDt *= cs->m_jacCoeffInv[ic];\n" + " \n" + " {\n" + " float prevSum = cs->m_appliedRambdaDt[ic];\n" + " float updated = prevSum;\n" + " updated += rambdaDt;\n" + " updated = max2( updated, minRambdaDt );\n" + " updated = min2( updated, maxRambdaDt );\n" + " rambdaDt = updated - prevSum;\n" + " cs->m_appliedRambdaDt[ic] = updated;\n" + " }\n" + " \n" + " float4 linImp0 = invMassA*linear*rambdaDt;\n" + " float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" + " float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" + " float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" + " \n" + " if (invMassA)\n" + " {\n" + " *dLinVelA += linImp0;\n" + " *dAngVelA += angImp0;\n" + " }\n" + " if (invMassB)\n" + " {\n" + " *dLinVelB += linImp1;\n" + " *dAngVelB += angImp1;\n" + " }\n" + " }\n" + "}\n" + "// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" + "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n" + "__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" + "__global float4* 
deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" + "{\n" + " //float frictionCoeff = ldsCs[0].m_linear.w;\n" + " int aIdx = ldsCs[0].m_bodyA;\n" + " int bIdx = ldsCs[0].m_bodyB;\n" + " float4 posA = gBodies[aIdx].m_pos;\n" + " float4 linVelA = gBodies[aIdx].m_linVel;\n" + " float4 angVelA = gBodies[aIdx].m_angVel;\n" + " float invMassA = gBodies[aIdx].m_invMass;\n" + " Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" + " float4 posB = gBodies[bIdx].m_pos;\n" + " float4 linVelB = gBodies[bIdx].m_linVel;\n" + " float4 angVelB = gBodies[bIdx].m_angVel;\n" + " float invMassB = gBodies[bIdx].m_invMass;\n" + " Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" + " \n" + " float4 dLinVelA = make_float4(0,0,0,0);\n" + " float4 dAngVelA = make_float4(0,0,0,0);\n" + " float4 dLinVelB = make_float4(0,0,0,0);\n" + " float4 dAngVelB = make_float4(0,0,0,0);\n" + " \n" + " int bodyOffsetA = offsetSplitBodies[aIdx];\n" + " int constraintOffsetA = contactConstraintOffsets[0].x;\n" + " int splitIndexA = bodyOffsetA+constraintOffsetA;\n" + " \n" + " if (invMassA)\n" + " {\n" + " dLinVelA = deltaLinearVelocities[splitIndexA];\n" + " dAngVelA = deltaAngularVelocities[splitIndexA];\n" + " }\n" + " int bodyOffsetB = offsetSplitBodies[bIdx];\n" + " int constraintOffsetB = contactConstraintOffsets[0].y;\n" + " int splitIndexB= bodyOffsetB+constraintOffsetB;\n" + " if (invMassB)\n" + " {\n" + " dLinVelB = deltaLinearVelocities[splitIndexB];\n" + " dAngVelB = deltaAngularVelocities[splitIndexB];\n" + " }\n" + " solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" + " posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n" + " if (invMassA)\n" + " {\n" + " deltaLinearVelocities[splitIndexA] = dLinVelA;\n" + " deltaAngularVelocities[splitIndexA] = dAngVelA;\n" + " } \n" + " if (invMassB)\n" + " {\n" + " deltaLinearVelocities[splitIndexB] = dLinVelB;\n" + " deltaAngularVelocities[splitIndexB] = dAngVelB;\n" 
+ " }\n" + "}\n" + "__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" + "__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" + "float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" + ")\n" + "{\n" + " int i = GET_GLOBAL_IDX;\n" + " if (i<numManifolds)\n" + " {\n" + " solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" + " }\n" + "}\n" + "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n" + " __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" + " __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" + "{\n" + " float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n" + " int aIdx = ldsCs[0].m_bodyA;\n" + " int bIdx = ldsCs[0].m_bodyB;\n" + " float4 posA = gBodies[aIdx].m_pos;\n" + " float4 linVelA = gBodies[aIdx].m_linVel;\n" + " float4 angVelA = gBodies[aIdx].m_angVel;\n" + " float invMassA = gBodies[aIdx].m_invMass;\n" + " Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" + " float4 posB = gBodies[bIdx].m_pos;\n" + " float4 linVelB = gBodies[bIdx].m_linVel;\n" + " float4 angVelB = gBodies[bIdx].m_angVel;\n" + " float invMassB = gBodies[bIdx].m_invMass;\n" + " Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" + " \n" + " float4 dLinVelA = make_float4(0,0,0,0);\n" + " float4 dAngVelA = make_float4(0,0,0,0);\n" + " float4 dLinVelB = make_float4(0,0,0,0);\n" + " float4 dAngVelB = make_float4(0,0,0,0);\n" + " \n" + " int bodyOffsetA = offsetSplitBodies[aIdx];\n" + " int constraintOffsetA = contactConstraintOffsets[0].x;\n" + " int splitIndexA = bodyOffsetA+constraintOffsetA;\n" + " \n" + " if 
(invMassA)\n" + " {\n" + " dLinVelA = deltaLinearVelocities[splitIndexA];\n" + " dAngVelA = deltaAngularVelocities[splitIndexA];\n" + " }\n" + " int bodyOffsetB = offsetSplitBodies[bIdx];\n" + " int constraintOffsetB = contactConstraintOffsets[0].y;\n" + " int splitIndexB= bodyOffsetB+constraintOffsetB;\n" + " if (invMassB)\n" + " {\n" + " dLinVelB = deltaLinearVelocities[splitIndexB];\n" + " dAngVelB = deltaAngularVelocities[splitIndexB];\n" + " }\n" + " {\n" + " float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" + " float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" + " float sum = 0;\n" + " for(int j=0; j<4; j++)\n" + " {\n" + " sum +=ldsCs[0].m_appliedRambdaDt[j];\n" + " }\n" + " frictionCoeff = 0.7f;\n" + " for(int j=0; j<4; j++)\n" + " {\n" + " maxRambdaDt[j] = frictionCoeff*sum;\n" + " minRambdaDt[j] = -maxRambdaDt[j];\n" + " }\n" + " \n" + "// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" + "// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" + " \n" + " \n" + " {\n" + " \n" + " __global Constraint4* cs = ldsCs;\n" + " \n" + " if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" + " const float4 center = cs->m_center;\n" + " \n" + " float4 n = -cs->m_linear;\n" + " \n" + " float4 tangent[2];\n" + " btPlaneSpace1(n,&tangent[0],&tangent[1]);\n" + " float4 angular0, angular1, linear;\n" + " float4 r0 = center - posA;\n" + " float4 r1 = center - posB;\n" + " for(int i=0; i<2; i++)\n" + " {\n" + " setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" + " float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" + " linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );\n" + " rambdaDt *= cs->m_fJacCoeffInv[i];\n" + " \n" + " {\n" + " float prevSum = cs->m_fAppliedRambdaDt[i];\n" + " float updated = prevSum;\n" + " updated += rambdaDt;\n" + " updated = max2( updated, minRambdaDt[i] );\n" + " updated = min2( updated, 
maxRambdaDt[i] );\n" + " rambdaDt = updated - prevSum;\n" + " cs->m_fAppliedRambdaDt[i] = updated;\n" + " }\n" + " \n" + " float4 linImp0 = invMassA*linear*rambdaDt;\n" + " float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" + " float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" + " float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" + " \n" + " dLinVelA += linImp0;\n" + " dAngVelA += angImp0;\n" + " dLinVelB += linImp1;\n" + " dAngVelB += angImp1;\n" + " }\n" + " { // angular damping for point constraint\n" + " float4 ab = normalize3( posB - posA );\n" + " float4 ac = normalize3( center - posA );\n" + " if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" + " {\n" + " float angNA = dot3F4( n, angVelA );\n" + " float angNB = dot3F4( n, angVelB );\n" + " \n" + " dAngVelA -= (angNA*0.1f)*n;\n" + " dAngVelB -= (angNB*0.1f)*n;\n" + " }\n" + " }\n" + " }\n" + " \n" + " \n" + " }\n" + " if (invMassA)\n" + " {\n" + " deltaLinearVelocities[splitIndexA] = dLinVelA;\n" + " deltaAngularVelocities[splitIndexA] = dAngVelA;\n" + " } \n" + " if (invMassB)\n" + " {\n" + " deltaLinearVelocities[splitIndexB] = dLinVelB;\n" + " deltaAngularVelocities[splitIndexB] = dAngVelB;\n" + " }\n" + " \n" + "}\n" + "__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" + " __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" + " __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" + " float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" + ")\n" + "{\n" + " int i = GET_GLOBAL_IDX;\n" + " if (i<numManifolds)\n" + " {\n" + " solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" + " }\n" + "}\n" + "__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global 
int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" + " __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" + "{\n" + " int i = GET_GLOBAL_IDX;\n" + " if (i<numBodies)\n" + " {\n" + " if (gBodies[i].m_invMass)\n" + " {\n" + " int bodyOffset = offsetSplitBodies[i];\n" + " int count = bodyCount[i];\n" + " if (count)\n" + " {\n" + " gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset];\n" + " gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset];\n" + " }\n" + " }\n" + " }\n" + "}\n" + "void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n" + " const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n" + " __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n" + " Constraint4* dstC )\n" + "{\n" + " dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" + " dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" + " float dtInv = 1.f/dt;\n" + " for(int ic=0; ic<4; ic++)\n" + " {\n" + " dstC->m_appliedRambdaDt[ic] = 0.f;\n" + " }\n" + " dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" + " dstC->m_linear = src->m_worldNormalOnB;\n" + " dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" + " for(int ic=0; ic<4; ic++)\n" + " {\n" + " float4 r0 = src->m_worldPosB[ic] - posA;\n" + " float4 r1 = src->m_worldPosB[ic] - posB;\n" + " if( ic >= src->m_worldNormalOnB.w )//npoints\n" + " {\n" + " dstC->m_jacCoeffInv[ic] = 0.f;\n" + " continue;\n" + " }\n" + " float relVelN;\n" + " {\n" + " float4 linear, angular0, angular1;\n" + " setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" + " dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" + " invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n" + " relVelN = calcRelVel(linear, 
-linear, angular0, angular1,\n" + " linVelA, angVelA, linVelB, angVelB);\n" + " float e = 0.f;//src->getRestituitionCoeff();\n" + " if( relVelN*relVelN < 0.004f ) e = 0.f;\n" + " dstC->m_b[ic] = e*relVelN;\n" + " //float penetration = src->m_worldPosB[ic].w;\n" + " dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" + " dstC->m_appliedRambdaDt[ic] = 0.f;\n" + " }\n" + " }\n" + " if( src->m_worldNormalOnB.w > 0 )//npoints\n" + " { // prepare friction\n" + " float4 center = make_float4(0.f);\n" + " for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" + " center += src->m_worldPosB[i];\n" + " center /= (float)src->m_worldNormalOnB.w;\n" + " float4 tangent[2];\n" + " btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" + " \n" + " float4 r[2];\n" + " r[0] = center - posA;\n" + " r[1] = center - posB;\n" + " for(int i=0; i<2; i++)\n" + " {\n" + " float4 linear, angular0, angular1;\n" + " setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" + " dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" + " invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n" + " dstC->m_fAppliedRambdaDt[i] = 0.f;\n" + " }\n" + " dstC->m_center = center;\n" + " }\n" + " for(int i=0; i<4; i++)\n" + " {\n" + " if( i<src->m_worldNormalOnB.w )\n" + " {\n" + " dstC->m_worldPos[i] = src->m_worldPosB[i];\n" + " }\n" + " else\n" + " {\n" + " dstC->m_worldPos[i] = make_float4(0.f);\n" + " }\n" + " }\n" + "}\n" + "__kernel\n" + "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" + "void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n" + "__global const unsigned int* bodyCount,\n" + "int nContacts,\n" + "float dt,\n" + "float positionDrift,\n" + "float positionConstraintCoeff\n" + ")\n" + "{\n" + " int gIdx = GET_GLOBAL_IDX;\n" + " \n" + " if( gIdx < 
nContacts )\n" + " {\n" + " int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" + " int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" + " float4 posA = gBodies[aIdx].m_pos;\n" + " float4 linVelA = gBodies[aIdx].m_linVel;\n" + " float4 angVelA = gBodies[aIdx].m_angVel;\n" + " float invMassA = gBodies[aIdx].m_invMass;\n" + " Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" + " float4 posB = gBodies[bIdx].m_pos;\n" + " float4 linVelB = gBodies[bIdx].m_linVel;\n" + " float4 angVelB = gBodies[bIdx].m_angVel;\n" + " float invMassB = gBodies[bIdx].m_invMass;\n" + " Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" + " Constraint4 cs;\n" + " float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1;\n" + " float countB = invMassB != 0.f ? (float)bodyCount[bIdx] : 1;\n" + " setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" + " &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n" + " &cs );\n" + " \n" + " cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" + " gConstraintOut[gIdx] = cs;\n" + " }\n" + "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h index d70e74017a..bb949b2027 100644 --- a/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h +++ b/thirdparty/bullet/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h @@ -1,483 +1,482 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* updateAabbsKernelCL= \ -"#ifndef B3_UPDATE_AABBS_H\n" -"#define B3_UPDATE_AABBS_H\n" -"#ifndef B3_AABB_H\n" -"#define B3_AABB_H\n" -"#ifndef B3_FLOAT4_H\n" -"#define B3_FLOAT4_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#define B3_PLATFORM_DEFINITIONS_H\n" -"struct MyTest\n" -"{\n" -" int bla;\n" -"};\n" -"#ifdef __cplusplus\n" -"#else\n" -"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < 
FLT_MAX\n" -"#define B3_LARGE_FLOAT 1e18f\n" -"#define B3_INFINITY 1e18f\n" -"#define b3Assert(a)\n" -"#define b3ConstArray(a) __global const a*\n" -"#define b3AtomicInc atomic_inc\n" -"#define b3AtomicAdd atomic_add\n" -"#define b3Fabs fabs\n" -"#define b3Sqrt native_sqrt\n" -"#define b3Sin native_sin\n" -"#define b3Cos native_cos\n" -"#define B3_STATIC\n" -"#endif\n" -"#endif\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Float4;\n" -" #define b3Float4ConstArg const b3Float4\n" -" #define b3MakeFloat4 (float4)\n" -" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return dot(a1, b1);\n" -" }\n" -" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" -" {\n" -" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" -" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" -" return cross(a1, b1);\n" -" }\n" -" #define b3MinFloat4 min\n" -" #define b3MaxFloat4 max\n" -" #define b3Normalized(a) normalize(a)\n" -"#endif \n" -" \n" -"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" -"{\n" -" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" -" return false;\n" -" return true;\n" -"}\n" -"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" -"{\n" -" float maxDot = -B3_INFINITY;\n" -" int i = 0;\n" -" int ptIndex = -1;\n" -" for( i = 0; i < vecLen; i++ )\n" -" {\n" -" float dot = b3Dot3F4(vecArray[i],vec);\n" -" \n" -" if( dot > maxDot )\n" -" {\n" -" maxDot = dot;\n" -" ptIndex = i;\n" -" }\n" -" }\n" -" b3Assert(ptIndex>=0);\n" -" if (ptIndex<0)\n" -" {\n" -" ptIndex = 0;\n" -" }\n" -" *dotOut = maxDot;\n" -" return ptIndex;\n" -"}\n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_MAT3x3_H\n" -"#define B3_MAT3x3_H\n" -"#ifndef B3_QUAT_H\n" -"#define B3_QUAT_H\n" -"#ifndef B3_PLATFORM_DEFINITIONS_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef 
__cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -" typedef float4 b3Quat;\n" -" #define b3QuatConstArg const b3Quat\n" -" \n" -" \n" -"inline float4 b3FastNormalize4(float4 v)\n" -"{\n" -" v = (float4)(v.xyz,0.f);\n" -" return fast_normalize(v);\n" -"}\n" -" \n" -"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" -"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" -"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" -"{\n" -" b3Quat ans;\n" -" ans = b3Cross3( a, b );\n" -" ans += a.w*b+b.w*a;\n" -"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" -" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" -" return ans;\n" -"}\n" -"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" -"{\n" -" b3Quat q;\n" -" q=in;\n" -" //return b3FastNormalize4(in);\n" -" float len = native_sqrt(dot(q, q));\n" -" if(len > 0.f)\n" -" {\n" -" q *= 1.f / len;\n" -" }\n" -" else\n" -" {\n" -" q.x = q.y = q.z = 0.f;\n" -" q.w = 1.f;\n" -" }\n" -" return q;\n" -"}\n" -"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" b3Quat qInv = b3QuatInvert( q );\n" -" float4 vcpy = vec;\n" -" vcpy.w = 0.f;\n" -" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" -" return out;\n" -"}\n" -"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" -"{\n" -" return (b3Quat)(-q.xyz, q.w);\n" -"}\n" -"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" -"{\n" -" return b3QuatRotate( b3QuatInvert( q ), vec );\n" -"}\n" -"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" -"{\n" -" return b3QuatRotate( orientation, point ) + (translation);\n" -"}\n" -" \n" -"#endif \n" -"#endif 
//B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"typedef struct\n" -"{\n" -" b3Float4 m_row[3];\n" -"}b3Mat3x3;\n" -"#define b3Mat3x3ConstArg const b3Mat3x3\n" -"#define b3GetRow(m,row) (m.m_row[row])\n" -"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" -"{\n" -" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" -" b3Mat3x3 out;\n" -" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" -" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" -" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" -" out.m_row[0].w = 0.f;\n" -" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" -" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" -" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" -" out.m_row[1].w = 0.f;\n" -" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" -" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" -" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" -" out.m_row[2].w = 0.f;\n" -" return out;\n" -"}\n" -"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = fabs(matIn.m_row[0]);\n" -" out.m_row[1] = fabs(matIn.m_row[1]);\n" -" out.m_row[2] = fabs(matIn.m_row[2]);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtZero();\n" -"__inline\n" -"b3Mat3x3 mtIdentity();\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" -"__inline\n" -"b3Mat3x3 mtZero()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(0.f);\n" -" m.m_row[1] = (b3Float4)(0.f);\n" -" m.m_row[2] = (b3Float4)(0.f);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtIdentity()\n" -"{\n" -" b3Mat3x3 m;\n" -" m.m_row[0] = (b3Float4)(1,0,0,0);\n" -" m.m_row[1] = (b3Float4)(0,1,0,0);\n" -" m.m_row[2] = (b3Float4)(0,0,1,0);\n" -" return m;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" -"{\n" -" b3Mat3x3 out;\n" -" out.m_row[0] = 
(b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" -" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" -" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" -" return out;\n" -"}\n" -"__inline\n" -"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" -"{\n" -" b3Mat3x3 transB;\n" -" transB = mtTranspose( b );\n" -" b3Mat3x3 ans;\n" -" // why this doesn't run when 0ing in the for{}\n" -" a.m_row[0].w = 0.f;\n" -" a.m_row[1].w = 0.f;\n" -" a.m_row[2].w = 0.f;\n" -" for(int i=0; i<3; i++)\n" -" {\n" -"// a.m_row[i].w = 0.f;\n" -" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" -" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" -" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" -" ans.m_row[i].w = 0.f;\n" -" }\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" -"{\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a.m_row[0], b );\n" -" ans.y = b3Dot3F4( a.m_row[1], b );\n" -" ans.z = b3Dot3F4( a.m_row[2], b );\n" -" ans.w = 0.f;\n" -" return ans;\n" -"}\n" -"__inline\n" -"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" -"{\n" -" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" -" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" -" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" -" b3Float4 ans;\n" -" ans.x = b3Dot3F4( a, colx );\n" -" ans.y = b3Dot3F4( a, coly );\n" -" ans.z = b3Dot3F4( a, colz );\n" -" return ans;\n" -"}\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3Aabb b3Aabb_t;\n" -"struct b3Aabb\n" -"{\n" -" union\n" -" {\n" -" float m_min[4];\n" -" b3Float4 m_minVec;\n" -" int m_minIndices[4];\n" -" };\n" -" union\n" -" {\n" -" float m_max[4];\n" -" b3Float4 m_maxVec;\n" -" int m_signedMaxIndices[4];\n" -" };\n" -"};\n" -"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" -" b3Float4ConstArg pos,\n" 
-" b3QuatConstArg orn,\n" -" b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" -"{\n" -" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" -" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" -" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" -" b3Mat3x3 m;\n" -" m = b3QuatGetRotationMatrix(orn);\n" -" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" -" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" -" \n" -" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" -" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" -" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" -" 0.f);\n" -" *aabbMinOut = center-extent;\n" -" *aabbMaxOut = center+extent;\n" -"}\n" -"/// conservative test for overlap between two aabbs\n" -"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" -" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" -"{\n" -" bool overlap = true;\n" -" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" -" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" -" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? 
false : overlap;\n" -" return overlap;\n" -"}\n" -"#endif //B3_AABB_H\n" -"#ifndef B3_COLLIDABLE_H\n" -"#define B3_COLLIDABLE_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_QUAT_H\n" -"enum b3ShapeTypes\n" -"{\n" -" SHAPE_HEIGHT_FIELD=1,\n" -" SHAPE_CONVEX_HULL=3,\n" -" SHAPE_PLANE=4,\n" -" SHAPE_CONCAVE_TRIMESH=5,\n" -" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" -" SHAPE_SPHERE=7,\n" -" MAX_NUM_SHAPE_TYPES,\n" -"};\n" -"typedef struct b3Collidable b3Collidable_t;\n" -"struct b3Collidable\n" -"{\n" -" union {\n" -" int m_numChildShapes;\n" -" int m_bvhIndex;\n" -" };\n" -" union\n" -" {\n" -" float m_radius;\n" -" int m_compoundBvhIndex;\n" -" };\n" -" int m_shapeType;\n" -" union\n" -" {\n" -" int m_shapeIndex;\n" -" float m_height;\n" -" };\n" -"};\n" -"typedef struct b3GpuChildShape b3GpuChildShape_t;\n" -"struct b3GpuChildShape\n" -"{\n" -" b3Float4 m_childPosition;\n" -" b3Quat m_childOrientation;\n" -" union\n" -" {\n" -" int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n" -" int m_capsuleAxis;\n" -" };\n" -" union \n" -" {\n" -" float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n" -" int m_numChildShapes;//used for compound shape\n" -" };\n" -" union \n" -" {\n" -" float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n" -" int m_collidableShapeIndex;\n" -" };\n" -" int m_shapeType;\n" -"};\n" -"struct b3CompoundOverlappingPair\n" -"{\n" -" int m_bodyIndexA;\n" -" int m_bodyIndexB;\n" -"// int m_pairType;\n" -" int m_childShapeIndexA;\n" -" int m_childShapeIndexB;\n" -"};\n" -"#endif //B3_COLLIDABLE_H\n" -"#ifndef B3_RIGIDBODY_DATA_H\n" -"#define B3_RIGIDBODY_DATA_H\n" -"#ifndef B3_FLOAT4_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif //B3_FLOAT4_H\n" -"#ifndef B3_QUAT_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif \n" -"#endif 
//B3_QUAT_H\n" -"#ifndef B3_MAT3x3_H\n" -"#ifdef __cplusplus\n" -"#else\n" -"#endif\n" -"#endif //B3_MAT3x3_H\n" -"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" -"struct b3RigidBodyData\n" -"{\n" -" b3Float4 m_pos;\n" -" b3Quat m_quat;\n" -" b3Float4 m_linVel;\n" -" b3Float4 m_angVel;\n" -" int m_collidableIdx;\n" -" float m_invMass;\n" -" float m_restituitionCoeff;\n" -" float m_frictionCoeff;\n" -"};\n" -"typedef struct b3InertiaData b3InertiaData_t;\n" -"struct b3InertiaData\n" -"{\n" -" b3Mat3x3 m_invInertiaWorld;\n" -" b3Mat3x3 m_initInvInertia;\n" -"};\n" -"#endif //B3_RIGIDBODY_DATA_H\n" -" \n" -"void b3ComputeWorldAabb( int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n" -"{\n" -" __global const b3RigidBodyData_t* body = &bodies[bodyId];\n" -" b3Float4 position = body->m_pos;\n" -" b3Quat orientation = body->m_quat;\n" -" \n" -" int collidableIndex = body->m_collidableIdx;\n" -" int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n" -" \n" -" if (shapeIndex>=0)\n" -" {\n" -" \n" -" b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n" -" b3Aabb_t worldAabb;\n" -" \n" -" b3Float4 aabbAMinOut,aabbAMaxOut; \n" -" float margin = 0.f;\n" -" b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n" -" \n" -" worldAabb.m_minVec =aabbAMinOut;\n" -" worldAabb.m_minIndices[3] = bodyId;\n" -" worldAabb.m_maxVec = aabbAMaxOut;\n" -" worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 
0 : 1;\n" -" worldAabbs[bodyId] = worldAabb;\n" -" }\n" -"}\n" -"#endif //B3_UPDATE_AABBS_H\n" -"__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n" -"{\n" -" int nodeID = get_global_id(0);\n" -" if( nodeID < numNodes )\n" -" {\n" -" b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n" -" }\n" -"}\n" -"__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs)\n" -"{\n" -" int pairId = get_global_id(0);\n" -" if( pairId< numPairs )\n" -" {\n" -" pairs[pairId].z = 0xffffffff;\n" -" }\n" -"}\n" -; +static const char* updateAabbsKernelCL = + "#ifndef B3_UPDATE_AABBS_H\n" + "#define B3_UPDATE_AABBS_H\n" + "#ifndef B3_AABB_H\n" + "#define B3_AABB_H\n" + "#ifndef B3_FLOAT4_H\n" + "#define B3_FLOAT4_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#define B3_PLATFORM_DEFINITIONS_H\n" + "struct MyTest\n" + "{\n" + " int bla;\n" + "};\n" + "#ifdef __cplusplus\n" + "#else\n" + "//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" + "#define B3_LARGE_FLOAT 1e18f\n" + "#define B3_INFINITY 1e18f\n" + "#define b3Assert(a)\n" + "#define b3ConstArray(a) __global const a*\n" + "#define b3AtomicInc atomic_inc\n" + "#define b3AtomicAdd atomic_add\n" + "#define b3Fabs fabs\n" + "#define b3Sqrt native_sqrt\n" + "#define b3Sin native_sin\n" + "#define b3Cos native_cos\n" + "#define B3_STATIC\n" + "#endif\n" + "#endif\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Float4;\n" + " #define b3Float4ConstArg const b3Float4\n" + " #define b3MakeFloat4 (float4)\n" + " float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" + " return dot(a1, b1);\n" + " }\n" + " b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" + " {\n" + " float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" + " float4 b1 = 
b3MakeFloat4(v1.xyz,0.f);\n" + " return cross(a1, b1);\n" + " }\n" + " #define b3MinFloat4 min\n" + " #define b3MaxFloat4 max\n" + " #define b3Normalized(a) normalize(a)\n" + "#endif \n" + " \n" + "inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" + "{\n" + " if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" + " return false;\n" + " return true;\n" + "}\n" + "inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" + "{\n" + " float maxDot = -B3_INFINITY;\n" + " int i = 0;\n" + " int ptIndex = -1;\n" + " for( i = 0; i < vecLen; i++ )\n" + " {\n" + " float dot = b3Dot3F4(vecArray[i],vec);\n" + " \n" + " if( dot > maxDot )\n" + " {\n" + " maxDot = dot;\n" + " ptIndex = i;\n" + " }\n" + " }\n" + " b3Assert(ptIndex>=0);\n" + " if (ptIndex<0)\n" + " {\n" + " ptIndex = 0;\n" + " }\n" + " *dotOut = maxDot;\n" + " return ptIndex;\n" + "}\n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_MAT3x3_H\n" + "#define B3_MAT3x3_H\n" + "#ifndef B3_QUAT_H\n" + "#define B3_QUAT_H\n" + "#ifndef B3_PLATFORM_DEFINITIONS_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + " typedef float4 b3Quat;\n" + " #define b3QuatConstArg const b3Quat\n" + " \n" + " \n" + "inline float4 b3FastNormalize4(float4 v)\n" + "{\n" + " v = (float4)(v.xyz,0.f);\n" + " return fast_normalize(v);\n" + "}\n" + " \n" + "inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" + "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" + "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" + "{\n" + " b3Quat ans;\n" + " ans = b3Cross3( a, b );\n" + " ans += a.w*b+b.w*a;\n" + "// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" + 
" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" + " return ans;\n" + "}\n" + "inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" + "{\n" + " b3Quat q;\n" + " q=in;\n" + " //return b3FastNormalize4(in);\n" + " float len = native_sqrt(dot(q, q));\n" + " if(len > 0.f)\n" + " {\n" + " q *= 1.f / len;\n" + " }\n" + " else\n" + " {\n" + " q.x = q.y = q.z = 0.f;\n" + " q.w = 1.f;\n" + " }\n" + " return q;\n" + "}\n" + "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " b3Quat qInv = b3QuatInvert( q );\n" + " float4 vcpy = vec;\n" + " vcpy.w = 0.f;\n" + " float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" + " return out;\n" + "}\n" + "inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" + "{\n" + " return (b3Quat)(-q.xyz, q.w);\n" + "}\n" + "inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" + "{\n" + " return b3QuatRotate( b3QuatInvert( q ), vec );\n" + "}\n" + "inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" + "{\n" + " return b3QuatRotate( orientation, point ) + (translation);\n" + "}\n" + " \n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "typedef struct\n" + "{\n" + " b3Float4 m_row[3];\n" + "}b3Mat3x3;\n" + "#define b3Mat3x3ConstArg const b3Mat3x3\n" + "#define b3GetRow(m,row) (m.m_row[row])\n" + "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" + "{\n" + " b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" + " b3Mat3x3 out;\n" + " out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" + " out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" + " out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" + " out.m_row[0].w = 0.f;\n" + " out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" + " out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" + " out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" + " out.m_row[1].w = 0.f;\n" 
+ " out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" + " out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" + " out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" + " out.m_row[2].w = 0.f;\n" + " return out;\n" + "}\n" + "inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = fabs(matIn.m_row[0]);\n" + " out.m_row[1] = fabs(matIn.m_row[1]);\n" + " out.m_row[2] = fabs(matIn.m_row[2]);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtZero();\n" + "__inline\n" + "b3Mat3x3 mtIdentity();\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" + "__inline\n" + "b3Mat3x3 mtZero()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(0.f);\n" + " m.m_row[1] = (b3Float4)(0.f);\n" + " m.m_row[2] = (b3Float4)(0.f);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtIdentity()\n" + "{\n" + " b3Mat3x3 m;\n" + " m.m_row[0] = (b3Float4)(1,0,0,0);\n" + " m.m_row[1] = (b3Float4)(0,1,0,0);\n" + " m.m_row[2] = (b3Float4)(0,0,1,0);\n" + " return m;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" + "{\n" + " b3Mat3x3 out;\n" + " out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" + " out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" + " out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" + " return out;\n" + "}\n" + "__inline\n" + "b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" + "{\n" + " b3Mat3x3 transB;\n" + " transB = mtTranspose( b );\n" + " b3Mat3x3 ans;\n" + " // why this doesn't run when 0ing in the for{}\n" + " a.m_row[0].w = 0.f;\n" + " a.m_row[1].w = 0.f;\n" + " a.m_row[2].w = 0.f;\n" + " for(int i=0; i<3; i++)\n" + " {\n" + "// a.m_row[i].w = 0.f;\n" + " ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" + " ans.m_row[i].y = 
b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" + " ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" + " ans.m_row[i].w = 0.f;\n" + " }\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" + "{\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a.m_row[0], b );\n" + " ans.y = b3Dot3F4( a.m_row[1], b );\n" + " ans.z = b3Dot3F4( a.m_row[2], b );\n" + " ans.w = 0.f;\n" + " return ans;\n" + "}\n" + "__inline\n" + "b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" + "{\n" + " b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" + " b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" + " b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" + " b3Float4 ans;\n" + " ans.x = b3Dot3F4( a, colx );\n" + " ans.y = b3Dot3F4( a, coly );\n" + " ans.z = b3Dot3F4( a, colz );\n" + " return ans;\n" + "}\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3Aabb b3Aabb_t;\n" + "struct b3Aabb\n" + "{\n" + " union\n" + " {\n" + " float m_min[4];\n" + " b3Float4 m_minVec;\n" + " int m_minIndices[4];\n" + " };\n" + " union\n" + " {\n" + " float m_max[4];\n" + " b3Float4 m_maxVec;\n" + " int m_signedMaxIndices[4];\n" + " };\n" + "};\n" + "inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" + " b3Float4ConstArg pos,\n" + " b3QuatConstArg orn,\n" + " b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" + "{\n" + " b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" + " localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" + " b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" + " b3Mat3x3 m;\n" + " m = b3QuatGetRotationMatrix(orn);\n" + " b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" + " b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" + " \n" + " b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" + " b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" + " 
b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" + " 0.f);\n" + " *aabbMinOut = center-extent;\n" + " *aabbMaxOut = center+extent;\n" + "}\n" + "/// conservative test for overlap between two aabbs\n" + "inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" + " b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" + "{\n" + " bool overlap = true;\n" + " overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" + " overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" + " overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n" + " return overlap;\n" + "}\n" + "#endif //B3_AABB_H\n" + "#ifndef B3_COLLIDABLE_H\n" + "#define B3_COLLIDABLE_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "enum b3ShapeTypes\n" + "{\n" + " SHAPE_HEIGHT_FIELD=1,\n" + " SHAPE_CONVEX_HULL=3,\n" + " SHAPE_PLANE=4,\n" + " SHAPE_CONCAVE_TRIMESH=5,\n" + " SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" + " SHAPE_SPHERE=7,\n" + " MAX_NUM_SHAPE_TYPES,\n" + "};\n" + "typedef struct b3Collidable b3Collidable_t;\n" + "struct b3Collidable\n" + "{\n" + " union {\n" + " int m_numChildShapes;\n" + " int m_bvhIndex;\n" + " };\n" + " union\n" + " {\n" + " float m_radius;\n" + " int m_compoundBvhIndex;\n" + " };\n" + " int m_shapeType;\n" + " union\n" + " {\n" + " int m_shapeIndex;\n" + " float m_height;\n" + " };\n" + "};\n" + "typedef struct b3GpuChildShape b3GpuChildShape_t;\n" + "struct b3GpuChildShape\n" + "{\n" + " b3Float4 m_childPosition;\n" + " b3Quat m_childOrientation;\n" + " union\n" + " {\n" + " int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n" + " int m_capsuleAxis;\n" + " };\n" + " union \n" + " {\n" + " float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n" + " int 
m_numChildShapes;//used for compound shape\n" + " };\n" + " union \n" + " {\n" + " float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n" + " int m_collidableShapeIndex;\n" + " };\n" + " int m_shapeType;\n" + "};\n" + "struct b3CompoundOverlappingPair\n" + "{\n" + " int m_bodyIndexA;\n" + " int m_bodyIndexB;\n" + "// int m_pairType;\n" + " int m_childShapeIndexA;\n" + " int m_childShapeIndexB;\n" + "};\n" + "#endif //B3_COLLIDABLE_H\n" + "#ifndef B3_RIGIDBODY_DATA_H\n" + "#define B3_RIGIDBODY_DATA_H\n" + "#ifndef B3_FLOAT4_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_FLOAT4_H\n" + "#ifndef B3_QUAT_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif \n" + "#endif //B3_QUAT_H\n" + "#ifndef B3_MAT3x3_H\n" + "#ifdef __cplusplus\n" + "#else\n" + "#endif\n" + "#endif //B3_MAT3x3_H\n" + "typedef struct b3RigidBodyData b3RigidBodyData_t;\n" + "struct b3RigidBodyData\n" + "{\n" + " b3Float4 m_pos;\n" + " b3Quat m_quat;\n" + " b3Float4 m_linVel;\n" + " b3Float4 m_angVel;\n" + " int m_collidableIdx;\n" + " float m_invMass;\n" + " float m_restituitionCoeff;\n" + " float m_frictionCoeff;\n" + "};\n" + "typedef struct b3InertiaData b3InertiaData_t;\n" + "struct b3InertiaData\n" + "{\n" + " b3Mat3x3 m_invInertiaWorld;\n" + " b3Mat3x3 m_initInvInertia;\n" + "};\n" + "#endif //B3_RIGIDBODY_DATA_H\n" + " \n" + "void b3ComputeWorldAabb( int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n" + "{\n" + " __global const b3RigidBodyData_t* body = &bodies[bodyId];\n" + " b3Float4 position = body->m_pos;\n" + " b3Quat orientation = body->m_quat;\n" + " \n" + " int collidableIndex = body->m_collidableIdx;\n" + " int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n" + " \n" + " if (shapeIndex>=0)\n" + " {\n" + " \n" + " b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n" + " b3Aabb_t worldAabb;\n" + " \n" + " b3Float4 
aabbAMinOut,aabbAMaxOut; \n" + " float margin = 0.f;\n" + " b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n" + " \n" + " worldAabb.m_minVec =aabbAMinOut;\n" + " worldAabb.m_minIndices[3] = bodyId;\n" + " worldAabb.m_maxVec = aabbAMaxOut;\n" + " worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 0 : 1;\n" + " worldAabbs[bodyId] = worldAabb;\n" + " }\n" + "}\n" + "#endif //B3_UPDATE_AABBS_H\n" + "__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n" + "{\n" + " int nodeID = get_global_id(0);\n" + " if( nodeID < numNodes )\n" + " {\n" + " b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n" + " }\n" + "}\n" + "__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs)\n" + "{\n" + " int pairId = get_global_id(0);\n" + " if( pairId< numPairs )\n" + " {\n" + " pairs[pairId].z = 0xffffffff;\n" + " }\n" + "}\n"; |