diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision')
13 files changed, 2234 insertions, 2395 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h index 0ed8aa8232..b296992525 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h @@ -12,33 +12,31 @@  class b3GpuBroadphaseInterface  {  public: - -	typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue  q); +	typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);  	virtual ~b3GpuBroadphaseInterface()  	{  	} -	virtual void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; -	virtual void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0; +	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0; +	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0; -	virtual void  calculateOverlappingPairs(int maxPairs)=0; -	virtual void  calculateOverlappingPairsHost(int maxPairs)=0; +	virtual void calculateOverlappingPairs(int maxPairs) = 0; +	virtual void calculateOverlappingPairsHost(int maxPairs) = 0;  	//call writeAabbsToGpu after done making all changes (createProxy etc) -	virtual void writeAabbsToGpu()=0; +	virtual void writeAabbsToGpu() = 0; + +	virtual cl_mem getAabbBufferWS() = 0; +	virtual int getNumOverlap() = 0; +	virtual cl_mem getOverlappingPairBuffer() = 0; -	virtual cl_mem	getAabbBufferWS()=0; -	virtual int	getNumOverlap()=0; -	virtual cl_mem	getOverlappingPairBuffer()=0; +	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0; +	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0; -	virtual b3OpenCLArray<b3SapAabb>&	getAllAabbsGPU()=0; -	virtual b3AlignedObjectArray<b3SapAabb>&	getAllAabbsCPU()=0; -	  	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;  	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;  	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0; -  }; -#endif //B3_GPU_BROADPHASE_INTERFACE_H +#endif  //B3_GPU_BROADPHASE_INTERFACE_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp index 74d0c8056c..e714fadac3 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -5,12 +5,9 @@  #include "kernels/sapKernels.h"  //#include "kernels/gridBroadphase.cl" -  #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"  #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" - -  #define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"  #define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" @@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs;  cl_kernel m_copyAabbsKernel;  cl_kernel m_sap2Kernel; - - - -  //int maxPairsPerBody = 64; -int maxBodiesPerCell = 256;//?? +int maxBodiesPerCell = 256;  //?? -b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q ) -:m_context(ctx), -m_device(device), -m_queue(q), -m_allAabbsGPU1(ctx,q), -m_smallAabbsMappingGPU(ctx,q), -m_largeAabbsMappingGPU(ctx,q), -m_gpuPairs(ctx,q), +b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q) +	: m_context(ctx), +	  m_device(device), +	  m_queue(q), +	  m_allAabbsGPU1(ctx, q), +	  m_smallAabbsMappingGPU(ctx, q), +	  m_largeAabbsMappingGPU(ctx, q), +	  m_gpuPairs(ctx, q), -m_hashGpu(ctx,q), +	  m_hashGpu(ctx, q), -m_cellStartGpu(ctx,q), -m_paramsGPU(ctx,q) +	  m_cellStartGpu(ctx, q), +	  m_paramsGPU(ctx, q)  { - -	 -	b3Vector3 gridSize = b3MakeVector3(3,3,3); -	b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]); +	b3Vector3 gridSize = b3MakeVector3(3, 3, 3); +	b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);  	m_paramsCPU.m_gridSize[0] = 128;  	m_paramsCPU.m_gridSize[1] = 128; @@ -58,92 +49,79 @@ m_paramsGPU(ctx,q)  	m_paramsCPU.m_invCellSize[3] = 0.f;  	m_paramsGPU.push_back(m_paramsCPU); -	cl_int errNum=0; +	cl_int errNum = 0;  	{  		const char* sapSrc = sapCL; -		cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); -		b3Assert(errNum==CL_SUCCESS); -		m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); -		m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); -		b3Assert(errNum==CL_SUCCESS); +		cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); +		b3Assert(errNum == CL_SUCCESS); +		m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); +		m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); +		b3Assert(errNum == CL_SUCCESS);  	}  	{ -		 -		cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH); -		b3Assert(errNum==CL_SUCCESS); - -		kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); -		b3Assert(errNum==CL_SUCCESS); -	 -		kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); -		b3Assert(errNum==CL_SUCCESS); - -		kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); -		b3Assert(errNum==CL_SUCCESS); - -	 -		kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); -		b3Assert(errNum==CL_SUCCESS); - -		 -		 -		 -	} +		cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH); +		b3Assert(errNum == CL_SUCCESS); -	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); +		kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg); +		b3Assert(errNum == CL_SUCCESS); +		kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg); +		b3Assert(errNum == CL_SUCCESS); + +		kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg); +		b3Assert(errNum == CL_SUCCESS); + +		kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg); +		b3Assert(errNum == CL_SUCCESS); +	} + +	m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);  }  b3GpuGridBroadphase::~b3GpuGridBroadphase()  { -	clReleaseKernel( kCalcHashAABB); -	clReleaseKernel( kClearCellStart); -	clReleaseKernel( kFindCellStart); -	clReleaseKernel( kFindOverlappingPairs); -	clReleaseKernel( m_sap2Kernel); -	clReleaseKernel( m_copyAabbsKernel); -	 -	 -	 +	clReleaseKernel(kCalcHashAABB); +	clReleaseKernel(kClearCellStart); +	clReleaseKernel(kFindCellStart); +	clReleaseKernel(kFindOverlappingPairs); +	clReleaseKernel(m_sap2Kernel); +	clReleaseKernel(m_copyAabbsKernel); +  	delete m_sorter;  } - - -void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)  {  	b3SapAabb aabb;  	aabb.m_minVec = aabbMin;  	aabb.m_maxVec = aabbMax;  	aabb.m_minIndices[3] = userPtr; -	aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; +	aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();  //NOT userPtr;  	m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());  	m_allAabbsCPU1.push_back(aabb); -  } -void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)  {  	b3SapAabb aabb;  	aabb.m_minVec = aabbMin;  	aabb.m_maxVec = aabbMax;  	aabb.m_minIndices[3] = userPtr; -	aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr; +	aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();  //NOT userPtr;  	m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());  	m_allAabbsCPU1.push_back(aabb);  } -void  b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) +void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)  {  	B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs"); -	  	if (0)  	{  		calculateOverlappingPairsHost(maxPairs); -	/* +		/*  		b3AlignedObjectArray<b3Int4> cpuPairs;  		m_gpuPairs.copyToHost(cpuPairs);  		printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size()); @@ -154,57 +132,50 @@ void  b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)  		*/  		return;  	} -	 -	 - -	  	int numSmallAabbs = m_smallAabbsMappingGPU.size(); -	b3OpenCLArray<int> pairCount(m_context,m_queue); +	b3OpenCLArray<int> pairCount(m_context, m_queue);  	pairCount.push_back(0); -	m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody); +	m_gpuPairs.resize(maxPairs);  //numSmallAabbs*maxPairsPerBody);  	{  		int numLargeAabbs = m_largeAabbsMappingGPU.size();  		if (numLargeAabbs && numSmallAabbs)  		{  			B3_PROFILE("sap2Kernel"); -			b3BufferInfoCL bInfo[] = {  -				b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ), -				b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), -				b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),  -				b3BufferInfoCL( m_gpuPairs.getBufferCL() ),  +			b3BufferInfoCL bInfo[] = { +				b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()), +				b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), +				b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), +				b3BufferInfoCL(m_gpuPairs.getBufferCL()),  				b3BufferInfoCL(pairCount.getBufferCL())}; -			b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); -			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); -			launcher.setConst(   numLargeAabbs  ); -			launcher.setConst( numSmallAabbs); -			launcher.setConst( 0  );//axis is not used -			launcher.setConst( maxPairs  ); -	//@todo: use actual maximum work item sizes of the device instead of hardcoded values -			launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); -                 +			b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); +			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); +			launcher.setConst(numLargeAabbs); +			launcher.setConst(numSmallAabbs); +			launcher.setConst(0);  //axis is not used +			launcher.setConst(maxPairs); +			//@todo: use actual maximum work item sizes of the device instead of hardcoded values +			launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); +  			int numPairs = pairCount.at(0); -			 -			if (numPairs >maxPairs) + +			if (numPairs > maxPairs)  			{  				b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); -				numPairs =maxPairs; +				numPairs = maxPairs;  			}  		}  	} - - -  	if (numSmallAabbs)  	{  		B3_PROFILE("gridKernel");  		m_hashGpu.resize(numSmallAabbs);  		{  			B3_PROFILE("kCalcHashAABB"); -			b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB"); +			b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");  			launch.setConst(numSmallAabbs);  			launch.setBuffer(m_allAabbsGPU1.getBufferCL());  			launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); @@ -214,117 +185,104 @@ void  b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)  		}  		m_sorter->execute(m_hashGpu); -		 -		int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2]; + +		int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];  		m_cellStartGpu.resize(numCells);  		//b3AlignedObjectArray<int >			cellStartCpu; -				 -		 +  		{  			B3_PROFILE("kClearCellStart"); -			b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart"); +			b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");  			launch.setConst(numCells);  			launch.setBuffer(m_cellStartGpu.getBufferCL());  			launch.launch1D(numCells);  			//m_cellStartGpu.copyToHost(cellStartCpu);  			//printf("??\n"); -  		} -  		{  			B3_PROFILE("kFindCellStart"); -			b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart"); +			b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");  			launch.setConst(numSmallAabbs);  			launch.setBuffer(m_hashGpu.getBufferCL());  			launch.setBuffer(m_cellStartGpu.getBufferCL());  			launch.launch1D(numSmallAabbs);  			//m_cellStartGpu.copyToHost(cellStartCpu);  			//printf("??\n"); -  		} -		 +  		{  			B3_PROFILE("kFindOverlappingPairs"); -			 -			 -			b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs"); + +			b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");  			launch.setConst(numSmallAabbs);  			launch.setBuffer(m_allAabbsGPU1.getBufferCL());  			launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());  			launch.setBuffer(m_hashGpu.getBufferCL());  			launch.setBuffer(m_cellStartGpu.getBufferCL()); -			 +  			launch.setBuffer(m_paramsGPU.getBufferCL());  			//launch.setBuffer(0);  			launch.setBuffer(pairCount.getBufferCL());  			launch.setBuffer(m_gpuPairs.getBufferCL()); -			 +  			launch.setConst(maxPairs);  			launch.launch1D(numSmallAabbs); -			  			int numPairs = pairCount.at(0); -			if (numPairs >maxPairs) +			if (numPairs > maxPairs)  			{  				b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); -				numPairs =maxPairs; +				numPairs = maxPairs;  			} -			 +  			m_gpuPairs.resize(numPairs); -	 +  			if (0)  			{  				b3AlignedObjectArray<b3Int4> pairsCpu;  				m_gpuPairs.copyToHost(pairsCpu);  				int sz = m_gpuPairs.size(); -				printf("m_gpuPairs.size()=%d\n",sz); -				for (int i=0;i<m_gpuPairs.size();i++) +				printf("m_gpuPairs.size()=%d\n", sz); +				for (int i = 0; i < m_gpuPairs.size(); i++)  				{ -					printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y); +					printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);  				}  				printf("?!?\n");  			} -			  		} -	 -  	} -	 - - -  	//calculateOverlappingPairsHost(maxPairs);  } -void  b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) +void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)  { -  	m_hostPairs.resize(0);  	m_allAabbsGPU1.copyToHost(m_allAabbsCPU1); -	for (int i=0;i<m_allAabbsCPU1.size();i++) +	for (int i = 0; i < m_allAabbsCPU1.size(); i++)  	{ -		for (int j=i+1;j<m_allAabbsCPU1.size();j++) +		for (int j = i + 1; j < m_allAabbsCPU1.size(); j++)  		{  			if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec, -				m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec)) +									   m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))  			{  				b3Int4 pair;  				int a = m_allAabbsCPU1[j].m_minIndices[3];  				int b = m_allAabbsCPU1[i].m_minIndices[3]; -				if (a<=b) +				if (a <= b)  				{ -					pair.x = a;  -					pair.y = b;//store the original index in the unsorted aabb array -				} else +					pair.x = a; +					pair.y = b;  //store the original index in the unsorted aabb array +				} +				else  				{  					pair.x = b; -					pair.y = a;//store the original index in the unsorted aabb array +					pair.y = a;  //store the original index in the unsorted aabb array  				} -					 -				if (m_hostPairs.size()<maxPairs) + +				if (m_hostPairs.size() < maxPairs)  				{  					m_hostPairs.push_back(pair);  				} @@ -332,40 +290,36 @@ void  b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)  		}  	} -  	m_gpuPairs.copyFromHost(m_hostPairs); - -  } -	//call writeAabbsToGpu after done making all changes (createProxy etc) +//call writeAabbsToGpu after done making all changes (createProxy etc)  void b3GpuGridBroadphase::writeAabbsToGpu()  {  	m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);  	m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);  	m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); -  } -cl_mem	b3GpuGridBroadphase::getAabbBufferWS() +cl_mem b3GpuGridBroadphase::getAabbBufferWS()  {  	return this->m_allAabbsGPU1.getBufferCL();  } -int	b3GpuGridBroadphase::getNumOverlap() +int b3GpuGridBroadphase::getNumOverlap()  {  	return m_gpuPairs.size();  } -cl_mem	b3GpuGridBroadphase::getOverlappingPairBuffer() +cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()  {  	return m_gpuPairs.getBufferCL();  } -b3OpenCLArray<b3SapAabb>&	b3GpuGridBroadphase::getAllAabbsGPU() +b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()  {  	return m_allAabbsGPU1;  } -b3AlignedObjectArray<b3SapAabb>&	b3GpuGridBroadphase::getAllAabbsCPU() +b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()  {  	return m_allAabbsCPU1;  } @@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()  {  	return m_largeAabbsMappingGPU;  } - diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h index ec18c9f716..b76cb43b68 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -6,83 +6,75 @@  struct b3ParamsGridBroadphaseCL  { -  	float m_invCellSize[4]; -	int   m_gridSize[4]; +	int m_gridSize[4]; -	int	getMaxBodiesPerCell() const +	int getMaxBodiesPerCell() const  	{  		return m_gridSize[3];  	} -	void setMaxBodiesPerCell(int maxOverlap)  +	void setMaxBodiesPerCell(int maxOverlap)  	{  		m_gridSize[3] = maxOverlap;  	}  }; -  class b3GpuGridBroadphase : public b3GpuBroadphaseInterface  {  protected: -	cl_context				m_context; -	cl_device_id			m_device; -	cl_command_queue		m_queue; +	cl_context m_context; +	cl_device_id m_device; +	cl_command_queue m_queue; -	b3OpenCLArray<b3SapAabb>	m_allAabbsGPU1; -	b3AlignedObjectArray<b3SapAabb>	m_allAabbsCPU1; +	b3OpenCLArray<b3SapAabb> m_allAabbsGPU1; +	b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1; -	b3OpenCLArray<int>	m_smallAabbsMappingGPU; +	b3OpenCLArray<int> m_smallAabbsMappingGPU;  	b3AlignedObjectArray<int> m_smallAabbsMappingCPU; -	b3OpenCLArray<int>	m_largeAabbsMappingGPU; +	b3OpenCLArray<int> m_largeAabbsMappingGPU;  	b3AlignedObjectArray<int> m_largeAabbsMappingCPU;  	b3AlignedObjectArray<b3Int4> m_hostPairs; -	b3OpenCLArray<b3Int4>			m_gpuPairs; +	b3OpenCLArray<b3Int4> m_gpuPairs; -	b3OpenCLArray<b3SortData>			m_hashGpu; -	b3OpenCLArray<int>			m_cellStartGpu; -	 +	b3OpenCLArray<b3SortData> m_hashGpu; +	b3OpenCLArray<int> m_cellStartGpu; -	b3ParamsGridBroadphaseCL		m_paramsCPU; -	b3OpenCLArray<b3ParamsGridBroadphaseCL>		m_paramsGPU; +	b3ParamsGridBroadphaseCL m_paramsCPU; +	b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU; -	class b3RadixSort32CL*			m_sorter; +	class b3RadixSort32CL* m_sorter;  public: - -	b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q ); +	b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);  	virtual ~b3GpuGridBroadphase(); -	static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue  q) +	static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)  	{ -		return new b3GpuGridBroadphase(ctx,device,q); +		return new b3GpuGridBroadphase(ctx, device, q);  	} -	 - +	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); +	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); -	virtual void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); -	virtual void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); - -	virtual void  calculateOverlappingPairs(int maxPairs); -	virtual void  calculateOverlappingPairsHost(int maxPairs); +	virtual void calculateOverlappingPairs(int maxPairs); +	virtual void calculateOverlappingPairsHost(int maxPairs);  	//call writeAabbsToGpu after done making all changes (createProxy etc)  	virtual void writeAabbsToGpu(); -	virtual cl_mem	getAabbBufferWS(); -	virtual int	getNumOverlap(); -	virtual cl_mem	getOverlappingPairBuffer(); +	virtual cl_mem getAabbBufferWS(); +	virtual int getNumOverlap(); +	virtual cl_mem getOverlappingPairBuffer(); + +	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU(); +	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU(); -	virtual b3OpenCLArray<b3SapAabb>&	getAllAabbsGPU(); -	virtual b3AlignedObjectArray<b3SapAabb>&	getAllAabbsCPU(); -	  	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();  	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();  	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU(); -  }; -#endif //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file +#endif  //B3_GPU_GRID_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp index 641df9eb12..0721928684 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -16,177 +16,174 @@ subject to the following restrictions:  #include "b3GpuParallelLinearBvh.h" -b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : -	m_queue(queue), -	m_radixSorter(context, device, queue), -	 -	m_rootNodeIndex(context, queue), -	m_maxDistanceFromRoot(context, queue), -	m_temp(context, queue), -	 -	m_internalNodeAabbs(context, queue), -	m_internalNodeLeafIndexRanges(context, queue), -	m_internalNodeChildNodes(context, queue), -	m_internalNodeParentNodes(context, queue), -	 -	m_commonPrefixes(context, queue), -	m_commonPrefixLengths(context, queue), -	m_distanceFromRoot(context, queue), -	 -	m_leafNodeParentNodes(context, queue), -	m_mortonCodesAndAabbIndicies(context, queue), -	m_mergedAabb(context, queue), -	m_leafNodeAabbs(context, queue), -	 -	m_largeAabbs(context, queue) +b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue), +																												  m_radixSorter(context, device, queue), + +																												  m_rootNodeIndex(context, queue), +																												  m_maxDistanceFromRoot(context, queue), +																												  m_temp(context, queue), + +																												  m_internalNodeAabbs(context, queue), +																												  m_internalNodeLeafIndexRanges(context, queue), +																												  m_internalNodeChildNodes(context, queue), +																												  m_internalNodeParentNodes(context, queue), + +																												  m_commonPrefixes(context, queue), +																												  m_commonPrefixLengths(context, queue), +																												  m_distanceFromRoot(context, queue), + +																												  m_leafNodeParentNodes(context, queue), +																												  m_mortonCodesAndAabbIndicies(context, queue), +																												  m_mergedAabb(context, queue), +																												  m_leafNodeAabbs(context, queue), + +																												  m_largeAabbs(context, queue)  {  	m_rootNodeIndex.resize(1);  	m_maxDistanceFromRoot.resize(1);  	m_temp.resize(1); -	 +  	//  	const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; -	 -	const char* kernelSource = parallelLinearBvhCL;	//parallelLinearBvhCL.h + +	const char* kernelSource = parallelLinearBvhCL;  //parallelLinearBvhCL.h  	cl_int error;  	char* additionalMacros = 0;  	m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);  	b3Assert(m_parallelLinearBvhProgram); -	 -	m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + +	m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_separateAabbsKernel); -	m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_findAllNodesMergedAabbKernel); -	m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); -	 -	m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros ); + +	m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_computeAdjacentPairCommonPrefixKernel); -	m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_buildBinaryRadixTreeLeafNodesKernel); -	m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_buildBinaryRadixTreeInternalNodesKernel); -	m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_findDistanceFromRootKernel); -	m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel); -	 -	m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros ); + +	m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_findLeafIndexRangesKernel); -	 -	m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + +	m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_plbvhCalculateOverlappingPairsKernel); -	m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_plbvhRayTraverseKernel); -	m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_plbvhLargeAabbAabbTestKernel); -	m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros ); +	m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);  	b3Assert(m_plbvhLargeAabbRayTestKernel);  } -b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()  +b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()  {  	clReleaseKernel(m_separateAabbsKernel);  	clReleaseKernel(m_findAllNodesMergedAabbKernel);  	clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); -	 +  	clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);  	clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);  	clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);  	clReleaseKernel(m_findDistanceFromRootKernel);  	clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel); -	 +  	clReleaseKernel(m_findLeafIndexRangesKernel); -	 +  	clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);  	clReleaseKernel(m_plbvhRayTraverseKernel);  	clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);  	clReleaseKernel(m_plbvhLargeAabbRayTestKernel); -	 +  	clReleaseProgram(m_parallelLinearBvhProgram);  } -void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,  -									const b3OpenCLArray<int>& largeAabbIndices) +void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, +								   const b3OpenCLArray<int>& largeAabbIndices)  {  	B3_PROFILE("b3ParallelLinearBvh::build()"); -	 +  	int numLargeAabbs = largeAabbIndices.size();  	int numSmallAabbs = smallAabbIndices.size(); -	 -	//Since all AABBs(both large and small) are input as a contiguous array,  + +	//Since all AABBs(both large and small) are input as a contiguous array,  	//with 2 additional arrays used to indicate the indices of large and small AABBs,  	//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.  	{  		B3_PROFILE("Separate large and small AABBs"); -		 +  		m_largeAabbs.resize(numLargeAabbs);  		m_leafNodeAabbs.resize(numSmallAabbs); -		 +  		//Write large AABBs into m_largeAabbs  		{ -			b3BufferInfoCL bufferInfo[] =  -			{ -				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), -				b3BufferInfoCL( largeAabbIndices.getBufferCL() ), -				 -				b3BufferInfoCL( m_largeAabbs.getBufferCL() ) -			}; -			 +			b3BufferInfoCL bufferInfo[] = +				{ +					b3BufferInfoCL(worldSpaceAabbs.getBufferCL()), +					b3BufferInfoCL(largeAabbIndices.getBufferCL()), + +					b3BufferInfoCL(m_largeAabbs.getBufferCL())}; +  			b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); -			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  			launcher.setConst(numLargeAabbs); -			 +  			launcher.launch1D(numLargeAabbs);  		} -		 +  		//Write small AABBs into m_leafNodeAabbs  		{ -			b3BufferInfoCL bufferInfo[] =  -			{ -				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), -				b3BufferInfoCL( smallAabbIndices.getBufferCL() ), -				 -				b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ) -			}; -			 +			b3BufferInfoCL bufferInfo[] = +				{ +					b3BufferInfoCL(worldSpaceAabbs.getBufferCL()), +					b3BufferInfoCL(smallAabbIndices.getBufferCL()), + +					b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())}; +  			b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); -			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  			launcher.setConst(numSmallAabbs); -			 +  			launcher.launch1D(numSmallAabbs);  		} -		 +  		clFinish(m_queue);  	} -	 +  	// -	int numLeaves = numSmallAabbs;	//Number of leaves in the BVH == Number of rigid bodies with small AABBs +	int numLeaves = numSmallAabbs;  //Number of leaves in the BVH == Number of rigid bodies with small AABBs  	int numInternalNodes = numLeaves - 1; -	 -	if(numLeaves < 2) + +	if (numLeaves < 2)  	{  		//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),  		//so it does not matter if numLeaves == 0 and rootNodeIndex == -1  		int rootNodeIndex = numLeaves - 1;  		m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); -		 +  		//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,  		//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index  		//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.  		//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs ) -		if(numLeaves == 1) +		if (numLeaves == 1)  		{  			b3SortData leaf; -			leaf.m_value = 0;		//1 leaf so index is always 0; leaf.m_key does not need to be set -			 +			leaf.m_value = 0;  //1 leaf so index is always 0; leaf.m_key does not need to be set +  			m_mortonCodesAndAabbIndicies.resize(1);  			m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);  		} -		 +  		return;  	} -	 +  	//  	{  		m_internalNodeAabbs.resize(numInternalNodes); @@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab  		m_commonPrefixes.resize(numInternalNodes);  		m_commonPrefixLengths.resize(numInternalNodes);  		m_distanceFromRoot.resize(numInternalNodes); -	 +  		m_leafNodeParentNodes.resize(numLeaves);  		m_mortonCodesAndAabbIndicies.resize(numLeaves);  		m_mergedAabb.resize(numLeaves);  	} -	 -	//Find the merged AABB of all small AABBs; this is used to define the size of  + +	//Find the merged AABB of all small AABBs; this is used to define the size of  	//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).  	{  		B3_PROFILE("Find AABB of merged nodes"); -	 -		m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs);	//Need to make a copy since the kernel modifies the array -			 -		for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;  -				numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) + +		m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs);  //Need to make a copy since the kernel modifies the array + +		for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; +			 numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)  		{ -			b3BufferInfoCL bufferInfo[] =  -			{ -				b3BufferInfoCL( m_mergedAabb.getBufferCL() )		//Resulting AABB is stored in m_mergedAabb[0] -			}; -			 +			b3BufferInfoCL bufferInfo[] = +				{ +					b3BufferInfoCL(m_mergedAabb.getBufferCL())  //Resulting AABB is stored in m_mergedAabb[0] +				}; +  			b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); -			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  			launcher.setConst(numAabbsNeedingMerge); -			 +  			launcher.launch1D(numAabbsNeedingMerge);  		} -		 +  		clFinish(m_queue);  	} -	 +  	//Insert the center of the AABBs into a virtual grid,  	//then convert the discrete grid coordinates into a morton code  	//For each element in m_mortonCodesAndAabbIndicies, set @@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab  	//	m_value == small AABB index  	{  		B3_PROFILE("Assign morton codes"); -	 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), -			b3BufferInfoCL( m_mergedAabb.getBufferCL() ), -			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), +				b3BufferInfoCL(m_mergedAabb.getBufferCL()), +				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(numLeaves); -		 +  		launcher.launch1D(numLeaves);  		clFinish(m_queue);  	} -	 +  	//  	{  		B3_PROFILE("Sort leaves by morton codes"); -	 +  		m_radixSorter.execute(m_mortonCodesAndAabbIndicies);  		clFinish(m_queue);  	} -	 +  	//  	constructBinaryRadixTree(); -	 -	 +  	//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.  	//The root node contains leaf node indices in the range [0, numLeafNodes - 1].  	//The child nodes of each node split their parent's index range into 2 contiguous halves. @@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab  	//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice  	{  		B3_PROFILE("m_findLeafIndexRangesKernel"); -	 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), +				b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(numInternalNodes); -		 +  		launcher.launch1D(numInternalNodes);  		clFinish(m_queue);  	} @@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou  {  	int maxPairs = out_overlappingPairs.size();  	b3OpenCLArray<int>& numPairsGpu = m_temp; -	 +  	int reset = 0;  	numPairsGpu.copyFromHostPointer(&reset, 1); -	 +  	// -	if( m_leafNodeAabbs.size() > 1 ) +	if (m_leafNodeAabbs.size() > 1)  	{  		B3_PROFILE("PLBVH small-small AABB test"); -	 +  		int numQueryAabbs = m_leafNodeAabbs.size(); -		 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), -			 -			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), -			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), -			 -			b3BufferInfoCL( numPairsGpu.getBufferCL() ), -			b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + +				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), +				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), +				b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()), +				b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()), +				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + +				b3BufferInfoCL(numPairsGpu.getBufferCL()), +				b3BufferInfoCL(out_overlappingPairs.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(maxPairs);  		launcher.setConst(numQueryAabbs); -		 +  		launcher.launch1D(numQueryAabbs);  		clFinish(m_queue);  	} -	 +  	int numLargeAabbRigids = m_largeAabbs.size(); -	if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 ) +	if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)  	{  		B3_PROFILE("PLBVH large-small AABB test"); -	 +  		int numQueryAabbs = m_leafNodeAabbs.size(); -		 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), -			b3BufferInfoCL( m_largeAabbs.getBufferCL() ), -			 -			b3BufferInfoCL( numPairsGpu.getBufferCL() ), -			b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), +				b3BufferInfoCL(m_largeAabbs.getBufferCL()), + +				b3BufferInfoCL(numPairsGpu.getBufferCL()), +				b3BufferInfoCL(out_overlappingPairs.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(maxPairs);  		launcher.setConst(numLargeAabbRigids);  		launcher.setConst(numQueryAabbs); -		 +  		launcher.launch1D(numQueryAabbs);  		clFinish(m_queue);  	} -	 -	 +  	//  	int numPairs = -1;  	numPairsGpu.copyToHostPointer(&numPairs, 1); -	if(numPairs > maxPairs) +	if (numPairs > maxPairs)  	{  		b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);  		numPairs = maxPairs;  		numPairsGpu.copyFromHostPointer(&maxPairs, 1);  	} -	 +  	out_overlappingPairs.resize(numPairs);  } - -void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,  -							b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs) +void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, +													 b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)  {  	B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); -	 +  	int numRays = rays.size();  	int maxRayRigidPairs = out_rayRigidPairs.size(); -	 +  	int reset = 0;  	out_numRayRigidPairs.copyFromHostPointer(&reset, 1); -	 +  	// -	if( m_leafNodeAabbs.size() > 0 ) +	if (m_leafNodeAabbs.size() > 0)  	{  		B3_PROFILE("PLBVH ray test small AABB"); -	 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), -			 -			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), -			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), -			 -			b3BufferInfoCL( rays.getBufferCL() ), -			 -			b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), -			b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), + +				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), +				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), +				b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()), +				b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()), +				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), + +				b3BufferInfoCL(rays.getBufferCL()), + +				b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()), +				b3BufferInfoCL(out_rayRigidPairs.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(maxRayRigidPairs);  		launcher.setConst(numRays); -		 +  		launcher.launch1D(numRays);  		clFinish(m_queue);  	} -	 +  	int numLargeAabbRigids = m_largeAabbs.size(); -	if(numLargeAabbRigids > 0) +	if (numLargeAabbRigids > 0)  	{  		B3_PROFILE("PLBVH ray test large AABB"); -		 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_largeAabbs.getBufferCL() ), -			b3BufferInfoCL( rays.getBufferCL() ), -			 -			b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), -			b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_largeAabbs.getBufferCL()), +				b3BufferInfoCL(rays.getBufferCL()), + +				b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()), +				b3BufferInfoCL(out_rayRigidPairs.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(numLargeAabbRigids);  		launcher.setConst(maxRayRigidPairs);  		launcher.setConst(numRays); -		 +  		launcher.launch1D(numRays);  		clFinish(m_queue);  	} -	 +  	//  	int numRayRigidPairs = -1;  	out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); -	 -	if(numRayRigidPairs > maxRayRigidPairs) + +	if (numRayRigidPairs > maxRayRigidPairs)  		b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); -	  }  void b3GpuParallelLinearBvh::constructBinaryRadixTree()  {  	B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()"); -	 +  	int numLeaves = m_leafNodeAabbs.size();  	int numInternalNodes = numLeaves - 1; -	 +  	//Each internal node is placed in between 2 leaf nodes.  	//By using this arrangement and computing the common prefix between  	//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.  	{  		B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel"); -		 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), -			b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), -			b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), +				b3BufferInfoCL(m_commonPrefixes.getBufferCL()), +				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(numInternalNodes); -		 +  		launcher.launch1D(numInternalNodes);  		clFinish(m_queue);  	} -	 -	//For each leaf node, select its parent node by  + +	//For each leaf node, select its parent node by  	//comparing the 2 nearest internal nodes and assign child node indices  	{  		B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); -		 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), -			b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()), +				b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()), +				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(numLeaves); -		 +  		launcher.launch1D(numLeaves);  		clFinish(m_queue);  	} -	 +  	//For each internal node, perform 2 binary searches among the other internal nodes  	//to its left and right to find its potential parent nodes and assign child node indices  	{  		B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); -		 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), -			b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), -			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_commonPrefixes.getBufferCL()), +				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()), +				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), +				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()), +				b3BufferInfoCL(m_rootNodeIndex.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(numInternalNodes); -		 +  		launcher.launch1D(numInternalNodes);  		clFinish(m_queue);  	} -	 +  	//Find the number of nodes seperating each internal node and the root node  	//so that the AABBs can be set using the next kernel.  	//Also determine the maximum number of nodes separating an internal node and the root node.  	{  		B3_PROFILE("m_findDistanceFromRootKernel"); -	 -		b3BufferInfoCL bufferInfo[] =  -		{ -			b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), -			b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), -			b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ), -			b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ) -		}; -		 + +		b3BufferInfoCL bufferInfo[] = +			{ +				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()), +				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()), +				b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()), +				b3BufferInfoCL(m_distanceFromRoot.getBufferCL())}; +  		b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel"); -		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  		launcher.setConst(numInternalNodes); -		 +  		launcher.launch1D(numInternalNodes);  		clFinish(m_queue);  	} -	 +  	//Starting from the internal nodes nearest to the leaf nodes, recursively move up  	//the tree towards the root to set the AABBs of each internal node; each internal node  	//checks its children and merges their AABBs  	{  		B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel"); -		 +  		int maxDistanceFromRoot = -1;  		{  			B3_PROFILE("copy maxDistanceFromRoot to CPU");  			m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);  			clFinish(m_queue);  		} -		 -		for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) + +		for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)  		{ -			b3BufferInfoCL bufferInfo[] =  -			{ -				b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ), -				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), -				b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), -				b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), -				b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) -			}; -			 +			b3BufferInfoCL bufferInfo[] = +				{ +					b3BufferInfoCL(m_distanceFromRoot.getBufferCL()), +					b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()), +					b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()), +					b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()), +					b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())}; +  			b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel"); -			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); +			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));  			launcher.setConst(maxDistanceFromRoot);  			launcher.setConst(distanceFromRoot);  			launcher.setConst(numInternalNodes); -			 +  			//It may seem inefficent to launch a thread for each internal node when a  			//much smaller number of nodes is actually processed, but this is actually -			//faster than determining the exact nodes that are ready to merge their child AABBs.  +			//faster than determining the exact nodes that are ready to merge their child AABBs.  			launcher.launch1D(numInternalNodes);  		} -		 +  		clFinish(m_queue);  	}  } - -	
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h index effe617b7b..b390775129 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -37,10 +37,10 @@ subject to the following restrictions:  ///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n  ///@par  ///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: -/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)  +/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)  /// - [fully parallel] Sort morton codes -/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)  -/// - [somewhat parallel] Set internal node AABBs  +/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Set internal node AABBs  ///@par  ///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.  ///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree. @@ -49,75 +49,75 @@ subject to the following restrictions:  class b3GpuParallelLinearBvh  {  	cl_command_queue m_queue; -	 +  	cl_program m_parallelLinearBvhProgram; -	 +  	cl_kernel m_separateAabbsKernel;  	cl_kernel m_findAllNodesMergedAabbKernel;  	cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; -	 +  	//Binary radix tree construction kernels  	cl_kernel m_computeAdjacentPairCommonPrefixKernel;  	cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;  	cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;  	cl_kernel m_findDistanceFromRootKernel;  	cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel; -	 +  	cl_kernel m_findLeafIndexRangesKernel; -	 +  	//Traversal kernels  	cl_kernel m_plbvhCalculateOverlappingPairsKernel;  	cl_kernel m_plbvhRayTraverseKernel;  	cl_kernel m_plbvhLargeAabbAabbTestKernel;  	cl_kernel m_plbvhLargeAabbRayTestKernel; -	 +  	b3RadixSort32CL m_radixSorter; -	 +  	//1 element -	b3OpenCLArray<int> m_rootNodeIndex;							//Most significant bit(0x80000000) is set to indicate internal node -	b3OpenCLArray<int> m_maxDistanceFromRoot;					//Max number of internal nodes between an internal node and the root node -	b3OpenCLArray<int> m_temp;									//Used to hold the number of pairs in calculateOverlappingPairs() -	 +	b3OpenCLArray<int> m_rootNodeIndex;        //Most significant bit(0x80000000) is set to indicate internal node +	b3OpenCLArray<int> m_maxDistanceFromRoot;  //Max number of internal nodes between an internal node and the root node +	b3OpenCLArray<int> m_temp;                 //Used to hold the number of pairs in calculateOverlappingPairs() +  	//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)  	b3OpenCLArray<b3SapAabb> m_internalNodeAabbs; -	b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges;		//x == min leaf index, y == max leaf index -	b3OpenCLArray<b3Int2> m_internalNodeChildNodes;				//x == left child, y == right child; msb(0x80000000) is set to indicate internal node -	b3OpenCLArray<int> m_internalNodeParentNodes;				//For parent node index, msb(0x80000000) is not set since it is always internal -	 +	b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges;  //x == min leaf index, y == max leaf index +	b3OpenCLArray<b3Int2> m_internalNodeChildNodes;       //x == left child, y == right child; msb(0x80000000) is set to indicate internal node +	b3OpenCLArray<int> m_internalNodeParentNodes;         //For parent node index, msb(0x80000000) is not set since it is always internal +  	//1 element per internal node; for binary radix tree construction  	b3OpenCLArray<b3Int64> m_commonPrefixes;  	b3OpenCLArray<int> m_commonPrefixLengths; -	b3OpenCLArray<int> m_distanceFromRoot;						//Number of internal nodes between this node and the root -	 +	b3OpenCLArray<int> m_distanceFromRoot;  //Number of internal nodes between this node and the root +  	//1 element per leaf node (leaf nodes only include small AABBs) -	b3OpenCLArray<int> m_leafNodeParentNodes;					//For parent node index, msb(0x80000000) is not set since it is always internal -	b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies;		//m_key == morton code, m_value == aabb index in m_leafNodeAabbs -	b3OpenCLArray<b3SapAabb> m_mergedAabb;						//m_mergedAabb[0] contains the merged AABB of all leaf nodes -	b3OpenCLArray<b3SapAabb> m_leafNodeAabbs;					//Contains only small AABBs -	 +	b3OpenCLArray<int> m_leafNodeParentNodes;                //For parent node index, msb(0x80000000) is not set since it is always internal +	b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies;  //m_key == morton code, m_value == aabb index in m_leafNodeAabbs +	b3OpenCLArray<b3SapAabb> m_mergedAabb;                   //m_mergedAabb[0] contains the merged AABB of all leaf nodes +	b3OpenCLArray<b3SapAabb> m_leafNodeAabbs;                //Contains only small AABBs +  	//1 element per large AABB, which is not stored in the BVH  	b3OpenCLArray<b3SapAabb> m_largeAabbs; -	 +  public:  	b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);  	virtual ~b3GpuParallelLinearBvh(); -	 +  	///Must be called before any other function -	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,  -				const b3OpenCLArray<int>& largeAabbIndices); -	 +	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices, +			   const b3OpenCLArray<int>& largeAabbIndices); +  	///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.  	///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.  	///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.  	void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs); -	 +  	///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;  	///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.  	///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.  	///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded. -	void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,  -								b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); -								 +	void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays, +								 b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs); +  private:  	void constructBinaryRadixTree();  }; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp index d2618024ac..62ea7a32df 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp @@ -13,45 +13,44 @@ subject to the following restrictions:  #include "b3GpuParallelLinearBvhBroadphase.h" -b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :  -	m_plbvh(context, device, queue), -	 -	m_overlappingPairsGpu(context, queue), -	 -	m_aabbsGpu(context, queue), -	m_smallAabbsMappingGpu(context, queue), -	m_largeAabbsMappingGpu(context, queue) +b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue), + +																																	  m_overlappingPairsGpu(context, queue), + +																																	  m_aabbsGpu(context, queue), +																																	  m_smallAabbsMappingGpu(context, queue), +																																	  m_largeAabbsMappingGpu(context, queue)  {  } -void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr,  int collisionFilterGroup,  int collisionFilterMask) +void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)  {  	int newAabbIndex = m_aabbsCpu.size();  	b3SapAabb aabb;  	aabb.m_minVec = aabbMin;  	aabb.m_maxVec = aabbMax; -	 +  	aabb.m_minIndices[3] = userPtr;  	aabb.m_signedMaxIndices[3] = newAabbIndex; -	 +  	m_smallAabbsMappingCpu.push_back(newAabbIndex); -	 +  	m_aabbsCpu.push_back(aabb);  } -void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr,  int collisionFilterGroup,  int collisionFilterMask) +void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)  {  	int newAabbIndex = m_aabbsCpu.size();  	b3SapAabb aabb;  	aabb.m_minVec = aabbMin;  	aabb.m_maxVec = aabbMax; -	 +  	aabb.m_minIndices[3] = userPtr;  	aabb.m_signedMaxIndices[3] = newAabbIndex; -	 +  	m_largeAabbsMappingCpu.push_back(newAabbIndex); -	 +  	m_aabbsCpu.push_back(aabb);  } @@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)  {  	//Reconstruct BVH  	m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu); -	 +  	//  	m_overlappingPairsGpu.resize(maxPairs);  	m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);  }  void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)  { -	b3Assert(0);	//CPU version not implemented +	b3Assert(0);  //CPU version not implemented  } -void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()  -{  -	m_aabbsGpu.copyFromHost(m_aabbsCpu);  +void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() +{ +	m_aabbsGpu.copyFromHost(m_aabbsCpu);  	m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);  	m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);  } -	 -	 -	 diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h index e518500637..dda0eea7be 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -21,42 +21,42 @@ subject to the following restrictions:  class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface  {  	b3GpuParallelLinearBvh m_plbvh; -	 +  	b3OpenCLArray<b3Int4> m_overlappingPairsGpu; -	 +  	b3OpenCLArray<b3SapAabb> m_aabbsGpu;  	b3OpenCLArray<int> m_smallAabbsMappingGpu;  	b3OpenCLArray<int> m_largeAabbsMappingGpu; -	 +  	b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;  	b3AlignedObjectArray<int> m_smallAabbsMappingCpu;  	b3AlignedObjectArray<int> m_largeAabbsMappingCpu; -	 +  public:  	b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);  	virtual ~b3GpuParallelLinearBvhBroadphase() {} -	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr,  int collisionFilterGroup,  int collisionFilterMask); -	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr,  int collisionFilterGroup,  int collisionFilterMask); -	 +	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); +	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); +  	virtual void calculateOverlappingPairs(int maxPairs);  	virtual void calculateOverlappingPairsHost(int maxPairs);  	//call writeAabbsToGpu after done making all changes (createProxy etc)  	virtual void writeAabbsToGpu(); -	 -	virtual int	getNumOverlap() { return m_overlappingPairsGpu.size(); } + +	virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }  	virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }  	virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }  	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; } -	 +  	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }  	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }  	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; } -	 +  	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; } -	 +  	static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)  	{  		return new b3GpuParallelLinearBvhBroadphase(context, device, queue); diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp index c45fbbdcaa..4126d03ed0 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -6,7 +6,6 @@ bool searchIncremental3dSapOnGpu = true;  #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"  #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" -  #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"  #include "kernels/sapKernels.h" @@ -56,110 +55,105 @@ bool searchIncremental3dSapOnGpu = true;  	class b3PrefixScanFloat4CL*		m_prefixScanFloat4;   */ -b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q , b3GpuSapKernelType kernelType) -:m_context(ctx), -m_device(device), -m_queue(q), - -m_objectMinMaxIndexGPUaxis0(ctx,q), -m_objectMinMaxIndexGPUaxis1(ctx,q), -m_objectMinMaxIndexGPUaxis2(ctx,q), -m_objectMinMaxIndexGPUaxis0prev(ctx,q), -m_objectMinMaxIndexGPUaxis1prev(ctx,q), -m_objectMinMaxIndexGPUaxis2prev(ctx,q), -m_sortedAxisGPU0(ctx,q), -m_sortedAxisGPU1(ctx,q), -m_sortedAxisGPU2(ctx,q), -m_sortedAxisGPU0prev(ctx,q), -m_sortedAxisGPU1prev(ctx,q), -m_sortedAxisGPU2prev(ctx,q), -m_addedHostPairsGPU(ctx,q), -m_removedHostPairsGPU(ctx,q), -m_addedCountGPU(ctx,q), -m_removedCountGPU(ctx,q), -m_currentBuffer(-1), -m_pairCount(ctx,q), -m_allAabbsGPU(ctx,q), -m_sum(ctx,q), -m_sum2(ctx,q), -m_dst(ctx,q), -m_smallAabbsMappingGPU(ctx,q), -m_largeAabbsMappingGPU(ctx,q), -m_overlappingPairs(ctx,q), -m_gpuSmallSortData(ctx,q), -m_gpuSmallSortedAabbs(ctx,q) +b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType) +	: m_context(ctx), +	  m_device(device), +	  m_queue(q), + +	  m_objectMinMaxIndexGPUaxis0(ctx, q), +	  m_objectMinMaxIndexGPUaxis1(ctx, q), +	  m_objectMinMaxIndexGPUaxis2(ctx, q), +	  m_objectMinMaxIndexGPUaxis0prev(ctx, q), +	  m_objectMinMaxIndexGPUaxis1prev(ctx, q), +	  m_objectMinMaxIndexGPUaxis2prev(ctx, q), +	  m_sortedAxisGPU0(ctx, q), +	  m_sortedAxisGPU1(ctx, q), +	  m_sortedAxisGPU2(ctx, q), +	  m_sortedAxisGPU0prev(ctx, q), +	  m_sortedAxisGPU1prev(ctx, q), +	  m_sortedAxisGPU2prev(ctx, q), +	  m_addedHostPairsGPU(ctx, q), +	  m_removedHostPairsGPU(ctx, q), +	  m_addedCountGPU(ctx, q), +	  m_removedCountGPU(ctx, q), +	  m_currentBuffer(-1), +	  m_pairCount(ctx, q), +	  m_allAabbsGPU(ctx, q), +	  m_sum(ctx, q), +	  m_sum2(ctx, q), +	  m_dst(ctx, q), +	  m_smallAabbsMappingGPU(ctx, q), +	  m_largeAabbsMappingGPU(ctx, q), +	  m_overlappingPairs(ctx, q), +	  m_gpuSmallSortData(ctx, q), +	  m_gpuSmallSortedAabbs(ctx, q)  {  	const char* sapSrc = sapCL; -     -     -	cl_int errNum=0; + +	cl_int errNum = 0;  	b3Assert(m_context);  	b3Assert(m_device); -	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); -	b3Assert(errNum==CL_SUCCESS); - +	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); +	b3Assert(errNum == CL_SUCCESS); -	b3Assert(errNum==CL_SUCCESS); +	b3Assert(errNum == CL_SUCCESS);  #ifndef __APPLE__ -	m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue); +	m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue);  #else  	m_prefixScanFloat4 = 0;  #endif  	m_sapKernel = 0; -	 +  	switch (kernelType)  	{  		case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU:  		{ -			m_sapKernel=0; +			m_sapKernel = 0;  			break;  		} -		case 		B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: +		case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU:  		{ -			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg ); +			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg);  			break;  		}  		case B3_GPU_SAP_KERNEL_ORIGINAL:  		{ -			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); +			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg);  			break;  		}  		case B3_GPU_SAP_KERNEL_BARRIER:  		{ -			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); +			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg);  			break;  		}  		case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY:  		{ -			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); +			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);  			break;  		}  		default:  		{ -			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); +			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);  			b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory");  		}  	}; -	 -		 -	 -	m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); -	b3Assert(errNum==CL_SUCCESS); -	m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg ); -	b3Assert(errNum==CL_SUCCESS); +	m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); +	b3Assert(errNum == CL_SUCCESS); -		 -	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); +	m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg); +	b3Assert(errNum == CL_SUCCESS); -	m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); +	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "flipFloatKernel", &errNum, sapProg); -	m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); +	m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); -	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); +	m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg); + +	m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);  }  b3GpuSapBroadphase::~b3GpuSapBroadphase() @@ -173,13 +167,11 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase()  	clReleaseKernel(m_sapKernel);  	clReleaseKernel(m_sap2Kernel);  	clReleaseKernel(m_prepareSumVarianceKernel); -	 -  }  /// conservative test for overlap between two aabbs -static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1, -								const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2) +static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1, +								 const b3Vector3& aabbMin2, const b3Vector3& aabbMax2)  {  	bool overlap = true;  	overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; @@ -188,8 +180,6 @@ static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aab  	return overlap;  } - -  //http://stereopsis.com/radix.html  static unsigned int FloatFlip(float fl)  { @@ -198,79 +188,77 @@ static unsigned int FloatFlip(float fl)  	return f ^ mask;  }; -void  b3GpuSapBroadphase::init3dSap() +void b3GpuSapBroadphase::init3dSap()  { -	if (m_currentBuffer<0) +	if (m_currentBuffer < 0)  	{  		m_allAabbsGPU.copyToHost(m_allAabbsCPU);  		m_currentBuffer = 0; -		for (int axis=0;axis<3;axis++) +		for (int axis = 0; axis < 3; axis++)  		{ -			for (int buf=0;buf<2;buf++) +			for (int buf = 0; buf < 2; buf++)  			{  				int totalNumAabbs = m_allAabbsCPU.size(); -				int numEndPoints = 2*totalNumAabbs; +				int numEndPoints = 2 * totalNumAabbs;  				m_sortedAxisCPU[axis][buf].resize(numEndPoints); -				if (buf==m_currentBuffer) +				if (buf == m_currentBuffer)  				{ -					for (int i=0;i<totalNumAabbs;i++) +					for (int i = 0; i < totalNumAabbs; i++)  					{ -						m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1; -						m_sortedAxisCPU[axis][buf][i*2].m_value = i*2; -						m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1; -						m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1; +						m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1; +						m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2; +						m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1; +						m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1;  					}  				}  			}  		} -		for (int axis=0;axis<3;axis++) +		for (int axis = 0; axis < 3; axis++)  		{  			m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);  		} -		for (int axis=0;axis<3;axis++) +		for (int axis = 0; axis < 3; axis++)  		{  			//int totalNumAabbs = m_allAabbsCPU.size();  			int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();  			m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints); -			for (int i=0;i<numEndPoints;i++) +			for (int i = 0; i < numEndPoints; i++)  			{  				int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; -				int newDest = destIndex/2; -				if (destIndex&1) +				int newDest = destIndex / 2; +				if (destIndex & 1)  				{ -					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; -				} else +					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; +				} +				else  				{ -					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; +					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;  				}  			}  		} -  	}  } -  static bool b3PairCmp(const b3Int4& p, const b3Int4& q)  { -	return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y))); +	return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y)));  } - -static bool operator==(const b3Int4& a,const b3Int4& b) +static bool operator==(const b3Int4& a, const b3Int4& b)  {  	return a.x == b.x && a.y == b.y;  }; -static bool operator<(const b3Int4& a,const b3Int4& b) +static bool operator<(const b3Int4& a, const b3Int4& b)  {  	return a.x < b.x || (a.x == b.x && a.y < b.y);  }; -static bool operator>(const b3Int4& a,const b3Int4& b) +static bool operator>(const b3Int4& a, const b3Int4& b)  {  	return a.x > b.x || (a.x == b.x && a.y > b.y);  }; @@ -278,31 +266,29 @@ static bool operator>(const b3Int4& a,const b3Int4& b)  b3AlignedObjectArray<b3Int4> addedHostPairs;  b3AlignedObjectArray<b3Int4> removedHostPairs; -b3AlignedObjectArray<b3SapAabb>	preAabbs; +b3AlignedObjectArray<b3SapAabb> preAabbs; -void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() +void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  {  	//static int framepje = 0;  	//printf("framepje=%d\n",framepje++); -	  	B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap");  	addedHostPairs.resize(0);  	removedHostPairs.resize(0); -	b3Assert(m_currentBuffer>=0); -	 +	b3Assert(m_currentBuffer >= 0); +  	{  		preAabbs.resize(m_allAabbsCPU.size()); -		for (int i=0;i<preAabbs.size();i++) +		for (int i = 0; i < preAabbs.size(); i++)  		{ -			preAabbs[i]=m_allAabbsCPU[i]; +			preAabbs[i] = m_allAabbsCPU[i];  		}  	} -	 -	if (m_currentBuffer<0) +	if (m_currentBuffer < 0)  		return;  	{  		B3_PROFILE("m_allAabbsGPU.copyToHost"); @@ -316,100 +302,87 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  	}  	if (0)  	{ -	{ -		printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", -		m_allAabbsCPU[40].m_min[0],	m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2], -		m_allAabbsCPU[40].m_max[0],	m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]); -	} - -	{ -		printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", -		m_allAabbsCPU[53].m_min[0],	m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2], -		m_allAabbsCPU[53].m_max[0],	m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]); -	} -	 - -	{ -	b3Int4 newPair; -	newPair.x = 40; -	newPair.y = 53; -		int index = allPairs.findBinarySearch(newPair); -		printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size()); -  		{ -			int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max); -			printf("overlap=%d\n",overlap); +			printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", +				   m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2], +				   m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]);  		} -		if (preAabbs.size()) -		{ -			int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max); -			printf("prevoverlap=%d\n",prevOverlap); -		} else  		{ -			printf("unknown prevoverlap\n"); +			printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", +				   m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2], +				   m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]);  		} -	} -	} +		{ +			b3Int4 newPair; +			newPair.x = 40; +			newPair.y = 53; +			int index = allPairs.findBinarySearch(newPair); +			printf("hasPair(40,53)=%d out of %d\n", index, allPairs.size()); +			{ +				int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max); +				printf("overlap=%d\n", overlap); +			} + +			if (preAabbs.size()) +			{ +				int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max); +				printf("prevoverlap=%d\n", prevOverlap); +			} +			else +			{ +				printf("unknown prevoverlap\n"); +			} +		} +	}  	if (0)  	{ -		for (int i=0;i<m_allAabbsCPU.size();i++) +		for (int i = 0; i < m_allAabbsCPU.size(); i++)  		{  			//printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2],			m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]); - -  		} -		for (int axis=0;axis<3;axis++) +		for (int axis = 0; axis < 3; axis++)  		{ -			for (int buf=0;buf<2;buf++) +			for (int buf = 0; buf < 2; buf++)  			{ -				b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2); +				b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2);  			}  		}  	} - - -	m_currentBuffer = 1-m_currentBuffer; -	 - +	m_currentBuffer = 1 - m_currentBuffer;  	int totalNumAabbs = m_allAabbsCPU.size();  	{  		B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)"); -		for (int i=0;i<totalNumAabbs;i++) +		for (int i = 0; i < totalNumAabbs; i++)  		{ -		 -  			unsigned int keyMin[3];  			unsigned int keyMax[3]; -			for (int axis=0;axis<3;axis++) +			for (int axis = 0; axis < 3; axis++)  			{ -				float vmin=m_allAabbsCPU[i].m_min[axis]; +				float vmin = m_allAabbsCPU[i].m_min[axis];  				float vmax = m_allAabbsCPU[i].m_max[axis];  				keyMin[axis] = FloatFlip(vmin);  				keyMax[axis] = FloatFlip(vmax); -			 -				m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1; -				m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2; -				m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1; -				m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1; + +				m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1; +				m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2; +				m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1; +				m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1;  			}  			//printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]); -  		}  	} -	 -	  	{  		B3_PROFILE("sort m_sortedAxisCPU"); -		for (int axis=0;axis<3;axis++) +		for (int axis = 0; axis < 3; axis++)  			m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);  	} @@ -432,21 +405,22 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  	{  		B3_PROFILE("assign m_objectMinMaxIndexCPU"); -		for (int axis=0;axis<3;axis++) +		for (int axis = 0; axis < 3; axis++)  		{  			int totalNumAabbs = m_allAabbsCPU.size();  			int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();  			m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs); -			for (int i=0;i<numEndPoints;i++) +			for (int i = 0; i < numEndPoints; i++)  			{  				int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; -				int newDest = destIndex/2; -				if (destIndex&1) +				int newDest = destIndex / 2; +				if (destIndex & 1)  				{ -					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i; -				} else +					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; +				} +				else  				{ -					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i; +					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;  				}  			}  		} @@ -485,12 +459,11 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  	}  #endif -  	int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();  	int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size();  	int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size(); -	b3Assert(a==b); -	b3Assert(b==c); +	b3Assert(a == b); +	b3Assert(b == c);  	/*  	if (searchIncremental3dSapOnGpu)  	{ @@ -574,175 +547,170 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  		int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();  		B3_PROFILE("actual search"); -		for (int i=0;i<numObjects;i++) +		for (int i = 0; i < numObjects; i++)  		{  			//int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size();  			//int checkObjects[]={40,53};  			//int numCheckObjects = sizeof(checkObjects)/sizeof(int); -			 +  			//for (int a=0;a<numCheckObjects ;a++) -			 -			for (int axis=0;axis<3;axis++) + +			for (int axis = 0; axis < 3; axis++)  			{  				//int i = checkObjects[a];  				unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x;  				unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y; -				unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x; +				unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].x;  				int dmin = curMinIndex - prevMinIndex; -				 -				unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y; -				 +				unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].y;  				int dmax = curMaxIndex - prevMaxIndex; -				if (dmin!=0) +				if (dmin != 0)  				{  					//printf("for object %d, dmin=%d\n",i,dmin);  				} -				if (dmax!=0) +				if (dmax != 0)  				{  					//printf("for object %d, dmax=%d\n",i,dmax);  				} -				for (int otherbuffer = 0;otherbuffer<2;otherbuffer++) +				for (int otherbuffer = 0; otherbuffer < 2; otherbuffer++)  				{ -					if (dmin!=0) +					if (dmin != 0)  					{ -						int stepMin = dmin<0 ? -1 : 1; -						for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin) +						int stepMin = dmin < 0 ? -1 : 1; +						for (int j = prevMinIndex; j != curMinIndex; j += stepMin)  						{  							int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; -							int otherIndex = otherIndex2/2; -							if (otherIndex!=i) +							int otherIndex = otherIndex2 / 2; +							if (otherIndex != i)  							{ -								bool otherIsMax = ((otherIndex2&1)!=0); +								bool otherIsMax = ((otherIndex2 & 1) != 0);  								if (otherIsMax)  								{  									//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);  									//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); -									 +  									bool overlap = true; -									for (int ax=0;ax<3;ax++) +									for (int ax = 0; ax < 3; ax++)  									{  										if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||  											(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) -											overlap=false; +											overlap = false;  									} -								//	b3Assert(overlap2==overlap); +									//	b3Assert(overlap2==overlap);  									bool prevOverlap = true; -									for (int ax=0;ax<3;ax++) +									for (int ax = 0; ax < 3; ax++)  									{ -										if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || -											(m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) -											prevOverlap=false; +										if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || +											(m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) +											prevOverlap = false;  									} -									  									//b3Assert(overlap==overlap2); -								 - -									if (dmin<0) +									if (dmin < 0)  									{  										if (overlap && !prevOverlap)  										{  											//add a pair  											b3Int4 newPair; -											if (i<=otherIndex) +											if (i <= otherIndex)  											{  												newPair.x = i;  												newPair.y = otherIndex; -											} else +											} +											else  											{  												newPair.x = otherIndex;  												newPair.y = i;  											}  											addedHostPairs.push_back(newPair);  										} -									}  +									}  									else  									{  										if (!overlap && prevOverlap)  										{ -									  											//remove a pair  											b3Int4 removedPair; -											if (i<=otherIndex) +											if (i <= otherIndex)  											{  												removedPair.x = i;  												removedPair.y = otherIndex; -											} else +											} +											else  											{  												removedPair.x = otherIndex;  												removedPair.y = i;  											}  											removedHostPairs.push_back(removedPair);  										} -									}//otherisMax -								}//if (dmin<0) -							}//if (otherIndex!=i) -						}//for (int j= +									}  //otherisMax +								}      //if (dmin<0) +							}          //if (otherIndex!=i) +						}              //for (int j=  					} -				 -					if (dmax!=0) + +					if (dmax != 0)  					{ -						int stepMax = dmax<0 ? -1 : 1; -						for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax) +						int stepMax = dmax < 0 ? -1 : 1; +						for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax)  						{  							int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; -							int otherIndex = otherIndex2/2; -							if (otherIndex!=i) +							int otherIndex = otherIndex2 / 2; +							if (otherIndex != i)  							{  								//bool otherIsMin = ((otherIndex2&1)==0);  								//if (otherIsMin)  								{  									//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);  									//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); -									 +  									bool overlap = true; -									for (int ax=0;ax<3;ax++) +									for (int ax = 0; ax < 3; ax++)  									{  										if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||  											(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) -											overlap=false; +											overlap = false;  									}  									//b3Assert(overlap2==overlap);  									bool prevOverlap = true; -									for (int ax=0;ax<3;ax++) +									for (int ax = 0; ax < 3; ax++)  									{ -										if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) || -											(m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x)) -											prevOverlap=false; +										if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || +											(m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) +											prevOverlap = false;  									} -									 -									if (dmax>0) +									if (dmax > 0)  									{  										if (overlap && !prevOverlap)  										{  											//add a pair  											b3Int4 newPair; -											if (i<=otherIndex) +											if (i <= otherIndex)  											{  												newPair.x = i;  												newPair.y = otherIndex; -											} else +											} +											else  											{  												newPair.x = otherIndex;  												newPair.y = i;  											}  											addedHostPairs.push_back(newPair); -							  										} -									}  +									}  									else  									{  										if (!overlap && prevOverlap) @@ -750,33 +718,31 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  											//if (otherIndex2&1==0) -> min?  											//remove a pair  											b3Int4 removedPair; -											if (i<=otherIndex) +											if (i <= otherIndex)  											{  												removedPair.x = i;  												removedPair.y = otherIndex; -											} else +											} +											else  											{  												removedPair.x = otherIndex;  												removedPair.y = i;  											}  											removedHostPairs.push_back(removedPair); -								  										}  									} -							 -								}//if (dmin<0) -							}//if (otherIndex!=i) -						}//for (int j= + +								}  //if (dmin<0) +							}      //if (otherIndex!=i) +						}          //for (int j=  					} -				}//for (int otherbuffer -			}//for (int axis=0; -		}//for (int i=0;i<numObjects +				}  //for (int otherbuffer +			}      //for (int axis=0; +		}          //for (int i=0;i<numObjects  	}  	//remove duplicates and add/remove then to existing m_overlappingPairs -	 -	 -	 +  	{  		{  			B3_PROFILE("sort allPairs"); @@ -795,31 +761,28 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  	b3Int4 prevPair;  	prevPair.x = -1;  	prevPair.y = -1; -	 +  	int uniqueRemovedPairs = 0;  	b3AlignedObjectArray<int> removedPositions;  	{  		B3_PROFILE("actual removing"); -		for (int i=0;i<removedHostPairs.size();i++) +		for (int i = 0; i < removedHostPairs.size(); i++)  		{  			b3Int4 removedPair = removedHostPairs[i];  			if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y))  			{ +				int index1 = allPairs.findBinarySearch(removedPair); -			int index1 = allPairs.findBinarySearch(removedPair); +				//#ifdef _DEBUG -	//#ifdef _DEBUG -				 -				 -	  				int index2 = allPairs.findLinearSearch(removedPair); -				b3Assert(index1==index2); -	 +				b3Assert(index1 == index2); +  				//b3Assert(index1!=allPairs.size()); -				if (index1<allPairs.size()) -	//#endif//_DEBUG +				if (index1 < allPairs.size()) +				//#endif//_DEBUG  				{  					uniqueRemovedPairs++;  					removedPositions.push_back(index1); @@ -833,13 +796,13 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  		if (uniqueRemovedPairs)  		{ -			for (int i=0;i<removedPositions.size();i++) +			for (int i = 0; i < removedPositions.size(); i++)  			{ -				allPairs[removedPositions[i]].x = INT_MAX ; -				allPairs[removedPositions[i]].y = INT_MAX ; +				allPairs[removedPositions[i]].x = INT_MAX; +				allPairs[removedPositions[i]].y = INT_MAX;  			}  			allPairs.quickSort(b3PairCmp); -			allPairs.resize(allPairs.size()-uniqueRemovedPairs); +			allPairs.resize(allPairs.size() - uniqueRemovedPairs);  		}  	}  	//if (uniqueRemovedPairs) @@ -848,33 +811,31 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  	prevPair.x = -1;  	prevPair.y = -1; -	 -	int uniqueAddedPairs=0; + +	int uniqueAddedPairs = 0;  	b3AlignedObjectArray<b3Int4> actualAddedPairs;  	{  		B3_PROFILE("actual adding"); -		for (int i=0;i<addedHostPairs.size();i++) +		for (int i = 0; i < addedHostPairs.size(); i++)  		{  			b3Int4 newPair = addedHostPairs[i];  			if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y))  			{ -//#ifdef _DEBUG		 +				//#ifdef _DEBUG  				int index1 = allPairs.findBinarySearch(newPair); -				 -	 +  				int index2 = allPairs.findLinearSearch(newPair); -				b3Assert(index1==index2); -	 +				b3Assert(index1 == index2); -				b3Assert(index1==allPairs.size()); -				if (index1!=allPairs.size()) +				b3Assert(index1 == allPairs.size()); +				if (index1 != allPairs.size())  				{  					printf("??\n");  				} -				if (index1==allPairs.size()) -//#endif //_DEBUG +				if (index1 == allPairs.size()) +				//#endif //_DEBUG  				{  					uniqueAddedPairs++;  					actualAddedPairs.push_back(newPair); @@ -882,94 +843,83 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()  			}  			prevPair = newPair;  		} -		for (int i=0;i<actualAddedPairs.size();i++) +		for (int i = 0; i < actualAddedPairs.size(); i++)  		{  			//printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y);  			allPairs.push_back(actualAddedPairs[i]);  		}  	} -	 +  	//if (uniqueAddedPairs)  	//	printf("uniqueAddedPairs=%d\n", uniqueAddedPairs); -  	{  		B3_PROFILE("m_overlappingPairs.copyFromHost");  		m_overlappingPairs.copyFromHost(allPairs);  	} - -	  } - - - -void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)  {  	//test -//	if (m_currentBuffer>=0) +	//	if (m_currentBuffer>=0)  	//	return calculateOverlappingPairsHostIncremental3Sap();  	b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());  	m_allAabbsGPU.copyToHost(m_allAabbsCPU); - -	 -	int axis=0; +	int axis = 0;  	{  		B3_PROFILE("CPU compute best variance axis"); -		b3Vector3 s=b3MakeVector3(0,0,0),s2=b3MakeVector3(0,0,0); +		b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0);  		int numRigidBodies = m_smallAabbsMappingCPU.size(); -		for(int i=0;i<numRigidBodies;i++)  +		for (int i = 0; i < numRigidBodies; i++)  		{  			b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; -			b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]); -			b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]); -			b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f; -						 +			b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]); +			b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]); +			b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f; +  			s += centerAabb; -			s2 += centerAabb*centerAabb; +			s2 += centerAabb * centerAabb;  		} -		b3Vector3 v = s2 - (s*s) / (float)numRigidBodies; -		 -		if(v[1] > v[0])  +		b3Vector3 v = s2 - (s * s) / (float)numRigidBodies; + +		if (v[1] > v[0])  			axis = 1; -		if(v[2] > v[axis])  +		if (v[2] > v[axis])  			axis = 2;  	} - -	 -  	b3AlignedObjectArray<b3Int4> hostPairs;  	{  		int numSmallAabbs = m_smallAabbsMappingCPU.size(); -		for (int i=0;i<numSmallAabbs;i++) +		for (int i = 0; i < numSmallAabbs; i++)  		{  			b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];  			//float reference = smallAabbi.m_max[axis]; -			for (int j=i+1;j<numSmallAabbs;j++) +			for (int j = i + 1; j < numSmallAabbs; j++)  			{ -  				b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]];  				if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, -					(b3Vector3&)smallAabbj.m_min,(b3Vector3&)smallAabbj.m_max)) +										 (b3Vector3&)smallAabbj.m_min, (b3Vector3&)smallAabbj.m_max))  				{  					b3Int4 pair;  					int a = smallAabbi.m_minIndices[3];  					int b = smallAabbj.m_minIndices[3]; -					if (a<=b) +					if (a <= b)  					{ -						pair.x = a;//store the original index in the unsorted aabb array +						pair.x = a;  //store the original index in the unsorted aabb array  						pair.y = b; -					} else +					} +					else  					{ -						pair.x = b;//store the original index in the unsorted aabb array +						pair.x = b;  //store the original index in the unsorted aabb array  						pair.y = a;  					}  					hostPairs.push_back(pair); @@ -978,35 +928,35 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)  		}  	} -	  	{  		int numSmallAabbs = m_smallAabbsMappingCPU.size(); -		for (int i=0;i<numSmallAabbs;i++) +		for (int i = 0; i < numSmallAabbs; i++)  		{  			b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];  			//float reference = smallAabbi.m_max[axis];  			int numLargeAabbs = m_largeAabbsMappingCPU.size(); -			for (int j=0;j<numLargeAabbs;j++) +			for (int j = 0; j < numLargeAabbs; j++)  			{  				b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]];  				if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, -					(b3Vector3&)largeAabbj.m_min,(b3Vector3&)largeAabbj.m_max)) +										 (b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max))  				{  					b3Int4 pair;  					int a = largeAabbj.m_minIndices[3];  					int b = smallAabbi.m_minIndices[3]; -					if (a<=b) +					if (a <= b)  					{ -						pair.x = a;  -						pair.y = b;//store the original index in the unsorted aabb array -					} else +						pair.x = a; +						pair.y = b;  //store the original index in the unsorted aabb array +					} +					else  					{  						pair.x = b; -						pair.y = a;//store the original index in the unsorted aabb array +						pair.y = a;  //store the original index in the unsorted aabb array  					} -					 +  					hostPairs.push_back(pair);  				}  			} @@ -1021,21 +971,20 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)  	if (hostPairs.size())  	{  		m_overlappingPairs.copyFromHost(hostPairs); -	} else +	} +	else  	{  		m_overlappingPairs.resize(0);  	}  	//init3dSap(); -  } -void  b3GpuSapBroadphase::reset() +void b3GpuSapBroadphase::reset()  {  	m_allAabbsGPU.resize(0);  	m_allAabbsCPU.resize(0); -  	m_smallAabbsMappingGPU.resize(0);  	m_smallAabbsMappingCPU.resize(0); @@ -1043,13 +992,11 @@ void  b3GpuSapBroadphase::reset()  	m_largeAabbsMappingGPU.resize(0);  	m_largeAabbsMappingCPU.resize(0); -  } - -void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) +void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)  { -	if (m_sapKernel==0) +	if (m_sapKernel == 0)  	{  		calculateOverlappingPairsHost(maxPairs);  		return; @@ -1065,68 +1012,62 @@ void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)  	int axis = 0;  	{ +		//bool syncOnHost = false; -	//bool syncOnHost = false; - -	int numSmallAabbs = m_smallAabbsMappingCPU.size(); -	if (m_prefixScanFloat4 && numSmallAabbs) -	{ -		B3_PROFILE("GPU compute best variance axis"); -		 -		if (m_dst.size()!=(numSmallAabbs+1)) +		int numSmallAabbs = m_smallAabbsMappingCPU.size(); +		if (m_prefixScanFloat4 && numSmallAabbs)  		{ -			m_dst.resize(numSmallAabbs+128); -			m_sum.resize(numSmallAabbs+128); -			m_sum2.resize(numSmallAabbs+128); -			m_sum.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? -			m_sum2.at(numSmallAabbs)=b3MakeVector3(0,0,0); //slow? -		} +			B3_PROFILE("GPU compute best variance axis"); -		b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel ,"m_prepareSumVarianceKernel"); -		launcher.setBuffer(m_allAabbsGPU.getBufferCL()); -		 -		launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); -		launcher.setBuffer(m_sum.getBufferCL()); -		launcher.setBuffer(m_sum2.getBufferCL()); -		launcher.setConst( numSmallAabbs  ); -		int num = numSmallAabbs; -		launcher.launch1D( num); -		 +			if (m_dst.size() != (numSmallAabbs + 1)) +			{ +				m_dst.resize(numSmallAabbs + 128); +				m_sum.resize(numSmallAabbs + 128); +				m_sum2.resize(numSmallAabbs + 128); +				m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0);   //slow? +				m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0);  //slow? +			} -		b3Vector3 s; -		b3Vector3 s2; -		m_prefixScanFloat4->execute(m_sum,m_dst,numSmallAabbs+1,&s); -		m_prefixScanFloat4->execute(m_sum2,m_dst,numSmallAabbs+1,&s2); +			b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel"); +			launcher.setBuffer(m_allAabbsGPU.getBufferCL()); -		b3Vector3 v = s2 - (s*s) / (float)numSmallAabbs; -		 -		if(v[1] > v[0])  -			axis = 1; -		if(v[2] > v[axis])  -			axis = 2; -	} +			launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); +			launcher.setBuffer(m_sum.getBufferCL()); +			launcher.setBuffer(m_sum2.getBufferCL()); +			launcher.setConst(numSmallAabbs); +			int num = numSmallAabbs; +			launcher.launch1D(num); +			b3Vector3 s; +			b3Vector3 s2; +			m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s); +			m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2); + +			b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs; + +			if (v[1] > v[0]) +				axis = 1; +			if (v[2] > v[axis]) +				axis = 2; +		} -		  		m_gpuSmallSortData.resize(numSmallAabbs); -		  #if 1  		if (m_smallAabbsMappingGPU.size())  		{ -			  			B3_PROFILE("flipFloatKernel"); -			b3BufferInfoCL bInfo[] = {  -				b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),  -				b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), -				b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())}; -			b3LauncherCL launcher(m_queue, m_flipFloatKernel ,"m_flipFloatKernel"); -			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); -			launcher.setConst( numSmallAabbs  ); -			launcher.setConst( axis  ); -			 +			b3BufferInfoCL bInfo[] = { +				b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), +				b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), +				b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())}; +			b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel"); +			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); +			launcher.setConst(numSmallAabbs); +			launcher.setConst(axis); +  			int num = numSmallAabbs; -			launcher.launch1D( num); +			launcher.launch1D(num);  			clFinish(m_queue);  		} @@ -1141,69 +1082,66 @@ void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)  		if (numSmallAabbs)  		{  			B3_PROFILE("scatterKernel"); -			 -			b3BufferInfoCL bInfo[] = {  -				b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), -				b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL(), true), -				b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true), + +			b3BufferInfoCL bInfo[] = { +				b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), +				b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), +				b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true),  				b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; -			b3LauncherCL launcher(m_queue, m_scatterKernel ,"m_scatterKernel "); -			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); -			launcher.setConst( numSmallAabbs); +			b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel "); +			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); +			launcher.setConst(numSmallAabbs);  			int num = numSmallAabbs; -			launcher.launch1D( num); +			launcher.launch1D(num);  			clFinish(m_queue); -			  		} -         -			m_overlappingPairs.resize(maxPairs); +		m_overlappingPairs.resize(maxPairs); -			m_pairCount.resize(0); -			m_pairCount.push_back(0); -            int numPairs=0; +		m_pairCount.resize(0); +		m_pairCount.push_back(0); +		int numPairs = 0; +		{ +			int numLargeAabbs = m_largeAabbsMappingGPU.size(); +			if (numLargeAabbs && numSmallAabbs)  			{ -				int numLargeAabbs = m_largeAabbsMappingGPU.size(); -				if (numLargeAabbs && numSmallAabbs) +				//@todo +				B3_PROFILE("sap2Kernel"); +				b3BufferInfoCL bInfo[] = { +					b3BufferInfoCL(m_allAabbsGPU.getBufferCL()), +					b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), +					b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), +					b3BufferInfoCL(m_overlappingPairs.getBufferCL()), +					b3BufferInfoCL(m_pairCount.getBufferCL())}; +				b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); +				launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); +				launcher.setConst(numLargeAabbs); +				launcher.setConst(numSmallAabbs); +				launcher.setConst(axis); +				launcher.setConst(maxPairs); +				//@todo: use actual maximum work item sizes of the device instead of hardcoded values +				launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); + +				numPairs = m_pairCount.at(0); +				if (numPairs > maxPairs)  				{ -					//@todo -					B3_PROFILE("sap2Kernel"); -					b3BufferInfoCL bInfo[] = {  -						b3BufferInfoCL( m_allAabbsGPU.getBufferCL() ), -						b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ), -						b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),  -						b3BufferInfoCL( m_overlappingPairs.getBufferCL() ),  -						b3BufferInfoCL(m_pairCount.getBufferCL())}; -					b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel"); -					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); -					launcher.setConst(   numLargeAabbs  ); -					launcher.setConst( numSmallAabbs); -					launcher.setConst( axis  ); -					launcher.setConst( maxPairs  ); -//@todo: use actual maximum work item sizes of the device instead of hardcoded values -					launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); -                 -					numPairs = m_pairCount.at(0); -					if (numPairs >maxPairs) -					{ -						b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); -						numPairs =maxPairs; -					} +					b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); +					numPairs = maxPairs;  				}  			} -			if (m_gpuSmallSortedAabbs.size()) -			{ -				B3_PROFILE("sapKernel"); -				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(m_pairCount.getBufferCL())}; -				b3LauncherCL launcher(m_queue, m_sapKernel,"m_sapKernel"); -				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); -				launcher.setConst( numSmallAabbs  ); -				launcher.setConst( axis  ); -				launcher.setConst( maxPairs  ); - -			 -				int num = numSmallAabbs; +		} +		if (m_gpuSmallSortedAabbs.size()) +		{ +			B3_PROFILE("sapKernel"); +			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())}; +			b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel"); +			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); +			launcher.setConst(numSmallAabbs); +			launcher.setConst(axis); +			launcher.setConst(maxPairs); + +			int num = numSmallAabbs;  #if 0                                  int buffSize = launcher.getSerializationBufferSize();                  unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; @@ -1225,73 +1163,71 @@ void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)                  FILE* f = fopen("m_sapKernelArgs.bin","wb");                  fwrite(buf,buffSize+sizeof(int),1,f);                  fclose(f); -#endif// +#endif  // -                launcher.launch1D( num); -				clFinish(m_queue); -                 -                numPairs = m_pairCount.at(0); -                if (numPairs>maxPairs) -				{ -					b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); -					numPairs = maxPairs; -					m_pairCount.resize(0); -					m_pairCount.push_back(maxPairs); -				} +			launcher.launch1D(num); +			clFinish(m_queue); + +			numPairs = m_pairCount.at(0); +			if (numPairs > maxPairs) +			{ +				b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); +				numPairs = maxPairs; +				m_pairCount.resize(0); +				m_pairCount.push_back(maxPairs);  			} -			 +		} +  #else -        int numPairs = 0; -         -         -        b3LauncherCL launcher(m_queue, m_sapKernel); - -        const char* fileName = "m_sapKernelArgs.bin"; -        FILE* f = fopen(fileName,"rb"); -        if (f) -        { -            int sizeInBytes=0; -            if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))  -            { -                printf("error, cannot get file size\n"); -                exit(0); -            } -             -            unsigned char* buf = (unsigned char*) malloc(sizeInBytes); -            fread(buf,sizeInBytes,1,f); -            int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); -            int num = *(int*)&buf[serializedBytes]; -            launcher.launch1D( num); -             -            b3OpenCLArray<int> pairCount(m_context, m_queue); -            int numElements = launcher.m_arrays[2]->size()/sizeof(int); -            pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); -            numPairs = pairCount.at(0); -            //printf("overlapping pairs = %d\n",numPairs); -            b3AlignedObjectArray<b3Int4>		hostOoverlappingPairs; -            b3OpenCLArray<b3Int4> tmpGpuPairs(m_context,m_queue); -            tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); -    -            tmpGpuPairs.copyToHost(hostOoverlappingPairs); -            m_overlappingPairs.copyFromHost(hostOoverlappingPairs); -            //printf("hello %d\n", m_overlappingPairs.size()); -            free(buf); -            fclose(f); -             -        } else { -            printf("error: cannot find file %s\n",fileName); -        } -         -        clFinish(m_queue); - -         +		int numPairs = 0; + +		b3LauncherCL launcher(m_queue, m_sapKernel); + +		const char* fileName = "m_sapKernelArgs.bin"; +		FILE* f = fopen(fileName, "rb"); +		if (f) +		{ +			int sizeInBytes = 0; +			if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) +			{ +				printf("error, cannot get file size\n"); +				exit(0); +			} + +			unsigned char* buf = (unsigned char*)malloc(sizeInBytes); +			fread(buf, sizeInBytes, 1, f); +			int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context); +			int num = *(int*)&buf[serializedBytes]; +			launcher.launch1D(num); + +			b3OpenCLArray<int> pairCount(m_context, m_queue); +			int numElements = launcher.m_arrays[2]->size() / sizeof(int); +			pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements); +			numPairs = pairCount.at(0); +			//printf("overlapping pairs = %d\n",numPairs); +			b3AlignedObjectArray<b3Int4> hostOoverlappingPairs; +			b3OpenCLArray<b3Int4> tmpGpuPairs(m_context, m_queue); +			tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs); + +			tmpGpuPairs.copyToHost(hostOoverlappingPairs); +			m_overlappingPairs.copyFromHost(hostOoverlappingPairs); +			//printf("hello %d\n", m_overlappingPairs.size()); +			free(buf); +			fclose(f); +		} +		else +		{ +			printf("error: cannot find file %s\n", fileName); +		} + +		clFinish(m_queue); +  #endif -			 -        m_overlappingPairs.resize(numPairs); -		 -	}//B3_PROFILE("GPU_RADIX SORT"); -	//init3dSap(); +		m_overlappingPairs.resize(numPairs); + +	}  //B3_PROFILE("GPU_RADIX SORT"); +	   //init3dSap();  }  void b3GpuSapBroadphase::writeAabbsToGpu() @@ -1299,17 +1235,14 @@ void b3GpuSapBroadphase::writeAabbsToGpu()  	m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);  	m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); -	m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this -	 -	 - +	m_allAabbsGPU.copyFromHost(m_allAabbsCPU);  //might not be necessary, the 'setupGpuAabbsFull' already takes care of this  } -void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)  {  	int index = userPtr;  	b3SapAabb aabb; -	for (int i=0;i<4;i++) +	for (int i = 0; i < 4; i++)  	{  		aabb.m_min[i] = aabbMin[i];  		aabb.m_max[i] = aabbMax[i]; @@ -1317,15 +1250,15 @@ void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin,  const b3Vec  	aabb.m_minIndices[3] = index;  	aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();  	m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); -	 +  	m_allAabbsCPU.push_back(aabb);  } -void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask) +void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)  {  	int index = userPtr;  	b3SapAabb aabb; -	for (int i=0;i<4;i++) +	for (int i = 0; i < 4; i++)  	{  		aabb.m_min[i] = aabbMin[i];  		aabb.m_max[i] = aabbMax[i]; @@ -1334,20 +1267,19 @@ void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin,  const b3Vector3&  	aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();  	m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); -  	m_allAabbsCPU.push_back(aabb);  } -cl_mem	b3GpuSapBroadphase::getAabbBufferWS() +cl_mem b3GpuSapBroadphase::getAabbBufferWS()  {  	return m_allAabbsGPU.getBufferCL();  } -int	b3GpuSapBroadphase::getNumOverlap() +int b3GpuSapBroadphase::getNumOverlap()  {  	return m_overlappingPairs.size();  } -cl_mem	b3GpuSapBroadphase::getOverlappingPairBuffer() +cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()  {  	return m_overlappingPairs.getBufferCL();  } diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h index 8d36ac78f2..d17590b14a 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h @@ -2,7 +2,7 @@  #define B3_GPU_SAP_BROADPHASE_H  #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" -#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2 +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"  //b3Int2  class b3Vector3;  #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" @@ -11,141 +11,133 @@ class b3Vector3;  #include "b3GpuBroadphaseInterface.h" -  class b3GpuSapBroadphase : public b3GpuBroadphaseInterface  { -	 -	cl_context				m_context; -	cl_device_id			m_device; -	cl_command_queue		m_queue; -	cl_kernel				m_flipFloatKernel; -	cl_kernel				m_scatterKernel ; -	cl_kernel				m_copyAabbsKernel; -	cl_kernel				m_sapKernel; -	cl_kernel				m_sap2Kernel; -	cl_kernel				m_prepareSumVarianceKernel; -	 +	cl_context m_context; +	cl_device_id m_device; +	cl_command_queue m_queue; +	cl_kernel m_flipFloatKernel; +	cl_kernel m_scatterKernel; +	cl_kernel m_copyAabbsKernel; +	cl_kernel m_sapKernel; +	cl_kernel m_sap2Kernel; +	cl_kernel m_prepareSumVarianceKernel;  	class b3RadixSort32CL* m_sorter;  	///test for 3d SAP -	b3AlignedObjectArray<b3SortData>		m_sortedAxisCPU[3][2]; -	b3AlignedObjectArray<b3UnsignedInt2>	m_objectMinMaxIndexCPU[3][2]; -	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis0; -	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis1; -	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis2; -	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis0prev; -	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis1prev; -	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis2prev; - -	b3OpenCLArray<b3SortData>				m_sortedAxisGPU0; -	b3OpenCLArray<b3SortData>				m_sortedAxisGPU1; -	b3OpenCLArray<b3SortData>				m_sortedAxisGPU2; -	b3OpenCLArray<b3SortData>				m_sortedAxisGPU0prev; -	b3OpenCLArray<b3SortData>				m_sortedAxisGPU1prev; -	b3OpenCLArray<b3SortData>				m_sortedAxisGPU2prev; - - -	b3OpenCLArray<b3Int4>					m_addedHostPairsGPU; -	b3OpenCLArray<b3Int4>					m_removedHostPairsGPU; -	b3OpenCLArray<int>						m_addedCountGPU; -	b3OpenCLArray<int>						m_removedCountGPU; -	 -	int	m_currentBuffer; +	b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2]; +	b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2]; +	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0; +	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1; +	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2; +	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev; +	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev; +	b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev; + +	b3OpenCLArray<b3SortData> m_sortedAxisGPU0; +	b3OpenCLArray<b3SortData> m_sortedAxisGPU1; +	b3OpenCLArray<b3SortData> m_sortedAxisGPU2; +	b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev; +	b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev; +	b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev; + +	b3OpenCLArray<b3Int4> m_addedHostPairsGPU; +	b3OpenCLArray<b3Int4> m_removedHostPairsGPU; +	b3OpenCLArray<int> m_addedCountGPU; +	b3OpenCLArray<int> m_removedCountGPU; + +	int m_currentBuffer;  public: -  	b3OpenCLArray<int> m_pairCount; +	b3OpenCLArray<b3SapAabb> m_allAabbsGPU; +	b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU; -	b3OpenCLArray<b3SapAabb>	m_allAabbsGPU; -	b3AlignedObjectArray<b3SapAabb>	m_allAabbsCPU; - -	virtual b3OpenCLArray<b3SapAabb>&	getAllAabbsGPU() +	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()  	{  		return m_allAabbsGPU;  	} -	virtual b3AlignedObjectArray<b3SapAabb>&	getAllAabbsCPU() +	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()  	{  		return m_allAabbsCPU;  	} -	b3OpenCLArray<b3Vector3>	m_sum; -	b3OpenCLArray<b3Vector3>	m_sum2; -	b3OpenCLArray<b3Vector3>	m_dst; +	b3OpenCLArray<b3Vector3> m_sum; +	b3OpenCLArray<b3Vector3> m_sum2; +	b3OpenCLArray<b3Vector3> m_dst; -	b3OpenCLArray<int>	m_smallAabbsMappingGPU; +	b3OpenCLArray<int> m_smallAabbsMappingGPU;  	b3AlignedObjectArray<int> m_smallAabbsMappingCPU; -	b3OpenCLArray<int>	m_largeAabbsMappingGPU; +	b3OpenCLArray<int> m_largeAabbsMappingGPU;  	b3AlignedObjectArray<int> m_largeAabbsMappingCPU; -	 -	b3OpenCLArray<b3Int4>		m_overlappingPairs; +	b3OpenCLArray<b3Int4> m_overlappingPairs;  	//temporary gpu work memory -	b3OpenCLArray<b3SortData>	m_gpuSmallSortData; -	b3OpenCLArray<b3SapAabb>	m_gpuSmallSortedAabbs; +	b3OpenCLArray<b3SortData> m_gpuSmallSortData; +	b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs; -	class b3PrefixScanFloat4CL*		m_prefixScanFloat4; +	class b3PrefixScanFloat4CL* m_prefixScanFloat4;  	enum b3GpuSapKernelType  	{ -		B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1, +		B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,  		B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,  		B3_GPU_SAP_KERNEL_ORIGINAL,  		B3_GPU_SAP_KERNEL_BARRIER,  		B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY  	}; -	b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); +	b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);  	virtual ~b3GpuSapBroadphase(); -	 -	static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue  q) + +	static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)  	{ -		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU); +		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);  	} -	static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue  q) +	static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)  	{ -		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU); +		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);  	} -	static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue  q) +	static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)  	{ -		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL); +		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);  	} -	static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue  q) +	static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)  	{ -		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER); +		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);  	} -	static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue  q) +	static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)  	{ -		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY); +		return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);  	} -	 -	virtual void  calculateOverlappingPairs(int maxPairs); -	virtual void  calculateOverlappingPairsHost(int maxPairs); -	 -	void  reset(); +	virtual void calculateOverlappingPairs(int maxPairs); +	virtual void calculateOverlappingPairsHost(int maxPairs); + +	void reset();  	void init3dSap();  	virtual void calculateOverlappingPairsHostIncremental3Sap(); -	virtual void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); -	virtual void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask); +	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask); +	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);  	//call writeAabbsToGpu after done making all changes (createProxy etc)  	virtual void writeAabbsToGpu(); -	virtual cl_mem	getAabbBufferWS(); -	virtual int	getNumOverlap(); -	virtual cl_mem	getOverlappingPairBuffer(); -	 +	virtual cl_mem getAabbBufferWS(); +	virtual int getNumOverlap(); +	virtual cl_mem getOverlappingPairBuffer(); +  	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();  	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();  	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();  }; -#endif //B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file +#endif  //B3_GPU_SAP_BROADPHASE_H
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h index ea6550fede..60570f2605 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h @@ -5,10 +5,9 @@  #include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"  ///just make sure that the b3Aabb is 16-byte aligned -B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb -{ +B3_ATTRIBUTE_ALIGNED16(struct) +b3SapAabb : public b3Aabb{ -}; +			}; - -#endif //B3_SAP_AABB_H +#endif  //B3_SAP_AABB_H diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h index dad42477c3..0185417786 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h @@ -1,199 +1,198 @@  //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* gridBroadphaseCL= \ -"int getPosHash(int4 gridPos, __global float4* pParams)\n" -"{\n" -"	int4 gridDim = *((__global int4*)(pParams + 1));\n" -"	gridPos.x &= gridDim.x - 1;\n" -"	gridPos.y &= gridDim.y - 1;\n" -"	gridPos.z &= gridDim.z - 1;\n" -"	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" -"	return hash;\n" -"} \n" -"int4 getGridPos(float4 worldPos, __global float4* pParams)\n" -"{\n" -"    int4 gridPos;\n" -"	int4 gridDim = *((__global int4*)(pParams + 1));\n" -"    gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" -"    gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" -"    gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" -"    return gridPos;\n" -"}\n" -"// calculate grid hash value for each body using its AABB\n" -"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" -"{\n" -"    int index = get_global_id(0);\n" -"    if(index >= numObjects)\n" -"	{\n" -"		return;\n" -"	}\n" -"	float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" -"	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" -"	float4 pos;\n" -"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" -"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" -"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" -"	pos.w = 0.f;\n" -"    // get address in grid\n" -"    int4 gridPos = getGridPos(pos, pParams);\n" -"    int gridHash = getPosHash(gridPos, pParams);\n" -"    // store grid hash and body index\n" -"    int2 hashVal;\n" -"    hashVal.x = gridHash;\n" -"    hashVal.y = index;\n" -"    pHash[index] = hashVal;\n" -"}\n" -"__kernel void kClearCellStart(	int numCells, \n" -"								__global int* pCellStart )\n" -"{\n" -"    int index = get_global_id(0);\n" -"    if(index >= numCells)\n" -"	{\n" -"		return;\n" -"	}\n" -"	pCellStart[index] = -1;\n" -"}\n" -"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" -"{\n" -"	__local int sharedHash[513];\n" -"    int index = get_global_id(0);\n" -"	int2 sortedData;\n" -"    if(index < numObjects)\n" -"	{\n" -"		sortedData = pHash[index];\n" -"		// Load hash data into shared memory so that we can look \n" -"		// at neighboring body's hash value without loading\n" -"		// two hash values per thread\n" -"		sharedHash[get_local_id(0) + 1] = sortedData.x;\n" -"		if((index > 0) && (get_local_id(0) == 0))\n" -"		{\n" -"			// first thread in block must load neighbor body hash\n" -"			sharedHash[0] = pHash[index-1].x;\n" -"		}\n" -"	}\n" -"    barrier(CLK_LOCAL_MEM_FENCE);\n" -"    if(index < numObjects)\n" -"	{\n" -"		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" -"		{\n" -"			cellStart[sortedData.x] = index;\n" -"		}\n" -"	}\n" -"}\n" -"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" -"{\n" -"	return	(min0.x <= max1.x)&& (min1.x <= max0.x) && \n" -"			(min0.y <= max1.y)&& (min1.y <= max0.y) && \n" -"			(min0.z <= max1.z)&& (min1.z <= max0.z); \n" -"}\n" -"//search for AABB 'index' against other AABBs' in this cell\n" -"void findPairsInCell(	int numObjects,\n" -"						int4	gridPos,\n" -"						int    index,\n" -"						__global int2*  pHash,\n" -"						__global int*   pCellStart,\n" -"						__global float4* allpAABB, \n" -"						__global const int* smallAabbMapping,\n" -"						__global float4* pParams,\n" -"							volatile  __global int* pairCount,\n" -"						__global int4*   pPairBuff2,\n" -"						int maxPairs\n" -"						)\n" -"{\n" -"	int4 pGridDim = *((__global int4*)(pParams + 1));\n" -"	int maxBodiesPerCell = pGridDim.w;\n" -"    int gridHash = getPosHash(gridPos, pParams);\n" -"    // get start of bucket for this cell\n" -"    int bucketStart = pCellStart[gridHash];\n" -"    if (bucketStart == -1)\n" -"	{\n" -"        return;   // cell empty\n" -"	}\n" -"	// iterate over bodies in this cell\n" -"    int2 sortedData = pHash[index];\n" -"	int unsorted_indx = sortedData.y;\n" -"    float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" -"	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" -"	int handleIndex =  as_int(min0.w);\n" -"	\n" -"	int bucketEnd = bucketStart + maxBodiesPerCell;\n" -"	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" -"	for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" -"	{\n" -"        int2 cellData = pHash[index2];\n" -"        if (cellData.x != gridHash)\n" -"        {\n" -"			break;   // no longer in same bucket\n" -"		}\n" -"		int unsorted_indx2 = cellData.y;\n" -"        //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" -"		if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" -"        {   \n" -"			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" -"			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" -"			if(testAABBOverlap(min0, max0, min1, max1))\n" -"			{\n" -"				if (pairCount)\n" -"				{\n" -"					int handleIndex2 = as_int(min1.w);\n" -"					if (handleIndex<handleIndex2)\n" -"					{\n" -"						int curPair = atomic_add(pairCount,1);\n" -"						if (curPair<maxPairs)\n" -"						{\n" -"							int4 newpair;\n" -"							newpair.x = handleIndex;\n" -"							newpair.y = handleIndex2;\n" -"							newpair.z = -1;\n" -"							newpair.w = -1;\n" -"							pPairBuff2[curPair] = newpair;\n" -"						}\n" -"					}\n" -"				\n" -"				}\n" -"			}\n" -"		}\n" -"	}\n" -"}\n" -"__kernel void kFindOverlappingPairs(	int numObjects,\n" -"										__global float4* allpAABB, \n" -"										__global const int* smallAabbMapping,\n" -"										__global int2* pHash, \n" -"										__global int* pCellStart, \n" -"										__global float4* pParams ,\n" -"										volatile  __global int* pairCount,\n" -"										__global int4*   pPairBuff2,\n" -"										int maxPairs\n" -"										)\n" -"{\n" -"    int index = get_global_id(0);\n" -"    if(index >= numObjects)\n" -"	{\n" -"		return;\n" -"	}\n" -"    int2 sortedData = pHash[index];\n" -"	int unsorted_indx = sortedData.y;\n" -"	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" -"	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" -"	float4 pos;\n" -"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" -"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" -"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" -"    // get address in grid\n" -"    int4 gridPosA = getGridPos(pos, pParams);\n" -"    int4 gridPosB; \n" -"    // examine only neighbouring cells\n" -"    for(int z=-1; z<=1; z++) \n" -"    {\n" -"		gridPosB.z = gridPosA.z + z;\n" -"        for(int y=-1; y<=1; y++) \n" -"        {\n" -"			gridPosB.y = gridPosA.y + y;\n" -"            for(int x=-1; x<=1; x++) \n" -"            {\n" -"				gridPosB.x = gridPosA.x + x;\n" -"                findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" -"            }\n" -"        }\n" -"    }\n" -"}\n" -; +static const char* gridBroadphaseCL = +	"int getPosHash(int4 gridPos, __global float4* pParams)\n" +	"{\n" +	"	int4 gridDim = *((__global int4*)(pParams + 1));\n" +	"	gridPos.x &= gridDim.x - 1;\n" +	"	gridPos.y &= gridDim.y - 1;\n" +	"	gridPos.z &= gridDim.z - 1;\n" +	"	int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" +	"	return hash;\n" +	"} \n" +	"int4 getGridPos(float4 worldPos, __global float4* pParams)\n" +	"{\n" +	"    int4 gridPos;\n" +	"	int4 gridDim = *((__global int4*)(pParams + 1));\n" +	"    gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" +	"    gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" +	"    gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" +	"    return gridPos;\n" +	"}\n" +	"// calculate grid hash value for each body using its AABB\n" +	"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n" +	"{\n" +	"    int index = get_global_id(0);\n" +	"    if(index >= numObjects)\n" +	"	{\n" +	"		return;\n" +	"	}\n" +	"	float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n" +	"	float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n" +	"	float4 pos;\n" +	"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +	"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +	"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +	"	pos.w = 0.f;\n" +	"    // get address in grid\n" +	"    int4 gridPos = getGridPos(pos, pParams);\n" +	"    int gridHash = getPosHash(gridPos, pParams);\n" +	"    // store grid hash and body index\n" +	"    int2 hashVal;\n" +	"    hashVal.x = gridHash;\n" +	"    hashVal.y = index;\n" +	"    pHash[index] = hashVal;\n" +	"}\n" +	"__kernel void kClearCellStart(	int numCells, \n" +	"								__global int* pCellStart )\n" +	"{\n" +	"    int index = get_global_id(0);\n" +	"    if(index >= numCells)\n" +	"	{\n" +	"		return;\n" +	"	}\n" +	"	pCellStart[index] = -1;\n" +	"}\n" +	"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" +	"{\n" +	"	__local int sharedHash[513];\n" +	"    int index = get_global_id(0);\n" +	"	int2 sortedData;\n" +	"    if(index < numObjects)\n" +	"	{\n" +	"		sortedData = pHash[index];\n" +	"		// Load hash data into shared memory so that we can look \n" +	"		// at neighboring body's hash value without loading\n" +	"		// two hash values per thread\n" +	"		sharedHash[get_local_id(0) + 1] = sortedData.x;\n" +	"		if((index > 0) && (get_local_id(0) == 0))\n" +	"		{\n" +	"			// first thread in block must load neighbor body hash\n" +	"			sharedHash[0] = pHash[index-1].x;\n" +	"		}\n" +	"	}\n" +	"    barrier(CLK_LOCAL_MEM_FENCE);\n" +	"    if(index < numObjects)\n" +	"	{\n" +	"		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" +	"		{\n" +	"			cellStart[sortedData.x] = index;\n" +	"		}\n" +	"	}\n" +	"}\n" +	"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" +	"{\n" +	"	return	(min0.x <= max1.x)&& (min1.x <= max0.x) && \n" +	"			(min0.y <= max1.y)&& (min1.y <= max0.y) && \n" +	"			(min0.z <= max1.z)&& (min1.z <= max0.z); \n" +	"}\n" +	"//search for AABB 'index' against other AABBs' in this cell\n" +	"void findPairsInCell(	int numObjects,\n" +	"						int4	gridPos,\n" +	"						int    index,\n" +	"						__global int2*  pHash,\n" +	"						__global int*   pCellStart,\n" +	"						__global float4* allpAABB, \n" +	"						__global const int* smallAabbMapping,\n" +	"						__global float4* pParams,\n" +	"							volatile  __global int* pairCount,\n" +	"						__global int4*   pPairBuff2,\n" +	"						int maxPairs\n" +	"						)\n" +	"{\n" +	"	int4 pGridDim = *((__global int4*)(pParams + 1));\n" +	"	int maxBodiesPerCell = pGridDim.w;\n" +	"    int gridHash = getPosHash(gridPos, pParams);\n" +	"    // get start of bucket for this cell\n" +	"    int bucketStart = pCellStart[gridHash];\n" +	"    if (bucketStart == -1)\n" +	"	{\n" +	"        return;   // cell empty\n" +	"	}\n" +	"	// iterate over bodies in this cell\n" +	"    int2 sortedData = pHash[index];\n" +	"	int unsorted_indx = sortedData.y;\n" +	"    float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n" +	"	float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" +	"	int handleIndex =  as_int(min0.w);\n" +	"	\n" +	"	int bucketEnd = bucketStart + maxBodiesPerCell;\n" +	"	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" +	"	for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" +	"	{\n" +	"        int2 cellData = pHash[index2];\n" +	"        if (cellData.x != gridHash)\n" +	"        {\n" +	"			break;   // no longer in same bucket\n" +	"		}\n" +	"		int unsorted_indx2 = cellData.y;\n" +	"        //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" +	"		if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" +	"        {   \n" +	"			float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n" +	"			float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n" +	"			if(testAABBOverlap(min0, max0, min1, max1))\n" +	"			{\n" +	"				if (pairCount)\n" +	"				{\n" +	"					int handleIndex2 = as_int(min1.w);\n" +	"					if (handleIndex<handleIndex2)\n" +	"					{\n" +	"						int curPair = atomic_add(pairCount,1);\n" +	"						if (curPair<maxPairs)\n" +	"						{\n" +	"							int4 newpair;\n" +	"							newpair.x = handleIndex;\n" +	"							newpair.y = handleIndex2;\n" +	"							newpair.z = -1;\n" +	"							newpair.w = -1;\n" +	"							pPairBuff2[curPair] = newpair;\n" +	"						}\n" +	"					}\n" +	"				\n" +	"				}\n" +	"			}\n" +	"		}\n" +	"	}\n" +	"}\n" +	"__kernel void kFindOverlappingPairs(	int numObjects,\n" +	"										__global float4* allpAABB, \n" +	"										__global const int* smallAabbMapping,\n" +	"										__global int2* pHash, \n" +	"										__global int* pCellStart, \n" +	"										__global float4* pParams ,\n" +	"										volatile  __global int* pairCount,\n" +	"										__global int4*   pPairBuff2,\n" +	"										int maxPairs\n" +	"										)\n" +	"{\n" +	"    int index = get_global_id(0);\n" +	"    if(index >= numObjects)\n" +	"	{\n" +	"		return;\n" +	"	}\n" +	"    int2 sortedData = pHash[index];\n" +	"	int unsorted_indx = sortedData.y;\n" +	"	float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n" +	"	float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n" +	"	float4 pos;\n" +	"	pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +	"	pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +	"	pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +	"    // get address in grid\n" +	"    int4 gridPosA = getGridPos(pos, pParams);\n" +	"    int4 gridPosB; \n" +	"    // examine only neighbouring cells\n" +	"    for(int z=-1; z<=1; z++) \n" +	"    {\n" +	"		gridPosB.z = gridPosA.z + z;\n" +	"        for(int y=-1; y<=1; y++) \n" +	"        {\n" +	"			gridPosB.y = gridPosA.y + y;\n" +	"            for(int x=-1; x<=1; x++) \n" +	"            {\n" +	"				gridPosB.x = gridPosA.x + x;\n" +	"                findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n" +	"            }\n" +	"        }\n" +	"    }\n" +	"}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h index 5eb8f45b16..c02877dde9 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -1,729 +1,728 @@  //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* parallelLinearBvhCL= \ -"/*\n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose,\n" -"including commercial applications, and to alter it and redistribute it freely,\n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Initial Author Jackson Lee, 2014\n" -"typedef float b3Scalar;\n" -"typedef float4 b3Vector3;\n" -"#define b3Max max\n" -"#define b3Min min\n" -"#define b3Sqrt sqrt\n" -"typedef struct\n" -"{\n" -"	unsigned int m_key;\n" -"	unsigned int m_value;\n" -"} SortDataCL;\n" -"typedef struct \n" -"{\n" -"	union\n" -"	{\n" -"		float4	m_min;\n" -"		float   m_minElems[4];\n" -"		int			m_minIndices[4];\n" -"	};\n" -"	union\n" -"	{\n" -"		float4	m_max;\n" -"		float   m_maxElems[4];\n" -"		int			m_maxIndices[4];\n" -"	};\n" -"} b3AabbCL;\n" -"unsigned int interleaveBits(unsigned int x)\n" -"{\n" -"	//........ ........ ......12 3456789A	//x\n" -"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n" -"	\n" -"	//......12 3456789A ......12 3456789A	//x ^ (x << 16)\n" -"	//11111111 ........ ........ 11111111	//0x FF 00 00 FF\n" -"	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;\n" -"	\n" -"	//......12 ........ 3456789A 3456789A	//x ^ (x <<  8)\n" -"	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F\n" -"	//......12 ........ 3456.... ....789A	//x = (x ^ (x <<  8)) & 0x0300F00F;\n" -"	\n" -"	//..12..12 ....3456 3456.... 789A789A	//x ^ (x <<  4)\n" -"	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3\n" -"	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x <<  4)) & 0x030C30C3;\n" -"	\n" -"	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x <<  2)\n" -"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49\n" -"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x <<  2)) & 0x09249249;\n" -"	\n" -"	//........ ........ ......11 11111111	//0x000003FF\n" -"	x &= 0x000003FF;		//Clear all bits above bit 10\n" -"	\n" -"	x = (x ^ (x << 16)) & 0xFF0000FF;\n" -"	x = (x ^ (x <<  8)) & 0x0300F00F;\n" -"	x = (x ^ (x <<  4)) & 0x030C30C3;\n" -"	x = (x ^ (x <<  2)) & 0x09249249;\n" -"	\n" -"	return x;\n" -"}\n" -"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" -"{\n" -"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" -"}\n" -"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" -"{\n" -"	int separatedAabbIndex = get_global_id(0);\n" -"	if(separatedAabbIndex >= numAabbsToSeparate) return;\n" -"	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" -"	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" -"}\n" -"//Should replace with an optimized parallel reduction\n" -"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" -"{\n" -"	//Each time this kernel is added to the command queue, \n" -"	//the number of AABBs needing to be merged is halved\n" -"	//\n" -"	//Example with 159 AABBs:\n" -"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" -"	//	numMergedAabbs == 159 - 80 == 79\n" -"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" -"	\n" -"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" -"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" -"	\n" -"	int aabbIndex = get_global_id(0);\n" -"	if(aabbIndex >= numMergedAabbs) return;\n" -"	\n" -"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" -"	\n" -"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" -"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" -"		\n" -"	b3AabbCL mergedAabb;\n" -"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" -"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" -"	out_mergedAabb[aabbIndex] = mergedAabb;\n" -"}\n" -"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" -"												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" -"{\n" -"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n" -"	if(leafNodeIndex >= numAabbs) return;\n" -"	\n" -"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" -"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" -"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" -"	\n" -"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" -"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" -"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" -"	\n" -"	//Quantize into integer coordinates\n" -"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" -"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" -"	\n" -"	int4 discretePosition;\n" -"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" -"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" -"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" -"	\n" -"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" -"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" -"	discretePosition += 512;\n" -"	\n" -"	//Interleave bits(assign a morton code, also known as a z-curve)\n" -"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" -"	\n" -"	//\n" -"	SortDataCL mortonCodeIndexPair;\n" -"	mortonCodeIndexPair.m_key = mortonCode;\n" -"	mortonCodeIndexPair.m_value = leafNodeIndex;\n" -"	\n" -"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" -"}\n" -"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" -"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" -"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" -"//In both cases, the bit should be cleared to access the actual node index.\n" -"int isLeafNode(int index) { return (index >> 31 == 0); }\n" -"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" -"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" -"//From sap.cl\n" -"#define NEW_PAIR_MARKER -1\n" -"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" -"{\n" -"	bool overlap = true;\n" -"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -"	return overlap;\n" -"}\n" -"//From sap.cl\n" -"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" -"											__global int* rootNodeIndex, \n" -"											__global int2* internalNodeChildIndices, \n" -"											__global b3AabbCL* internalNodeAabbs,\n" -"											__global int2* internalNodeLeafIndexRanges,\n" -"											\n" -"											__global SortDataCL* mortonCodesAndAabbIndices,\n" -"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n" -"											int maxPairs, int numQueryAabbs)\n" -"{\n" -"	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" -"	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" -"	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" -"	if(queryBvhNodeIndex >= numQueryAabbs) return;\n" -"	\n" -"	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" -"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" -"	\n" -"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" -"	\n" -"	int stackSize = 1;\n" -"	stack[0] = *rootNodeIndex;\n" -"	\n" -"	while(stackSize)\n" -"	{\n" -"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" -"		--stackSize;\n" -"		\n" -"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n" -"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" -"		\n" -"		//Optimization - if the BVH is structured as a binary radix tree, then\n" -"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" -"		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" -"		{\n" -"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" -"			if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" -"		}\n" -"		\n" -"		//bvhRigidIndex is not used if internal node\n" -"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" -"	\n" -"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" -"		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" -"		{\n" -"			if(isLeaf)\n" -"			{\n" -"				int4 pair;\n" -"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" -"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" -"				pair.z = NEW_PAIR_MARKER;\n" -"				pair.w = NEW_PAIR_MARKER;\n" -"				\n" -"				int pairIndex = atomic_inc(out_numPairs);\n" -"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" -"			}\n" -"			\n" -"			if(!isLeaf)	//Internal node\n" -"			{\n" -"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" -"				{\n" -"					//Error\n" -"				}\n" -"				else\n" -"				{\n" -"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" -"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" -"				}\n" -"			}\n" -"		}\n" -"		\n" -"	}\n" -"}\n" -"//From rayCastKernels.cl\n" -"typedef struct\n" -"{\n" -"	float4 m_from;\n" -"	float4 m_to;\n" -"} b3RayInfo;\n" -"//From rayCastKernels.cl\n" -"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" -"{\n" -"	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" -"	return normalize(normal);	//OpenCL normalize == vector4 normalize\n" -"}\n" -"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" -"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" -"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" -"{\n" -"	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" -"	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" -"	//\n" -"	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" -"	//and min.x will be the far plane; otherwise, it is reversed.\n" -"	//\n" -"	//In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" -"	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" -"	\n" -"	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)\n" -"	\n" -"	//When using vector types, the select() function checks the most signficant bit, \n" -"	//but isless() sets the least significant bit.\n" -"	isNegative <<= 31;\n" -"	//select(b, a, condition) == condition ? a : b\n" -"	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" -"	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" -"	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" -"	\n" -"	b3Scalar t_min_final = 0.0f;\n" -"	b3Scalar t_max_final = rayLength;\n" -"	\n" -"	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" -"	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" -"	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" -"	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" -"	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" -"	\n" -"	return (t_min_final <= t_max_final);\n" -"}\n" -"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" -"								__global int* rootNodeIndex, \n" -"								__global int2* internalNodeChildIndices, \n" -"								__global b3AabbCL* internalNodeAabbs,\n" -"								__global int2* internalNodeLeafIndexRanges,\n" -"								__global SortDataCL* mortonCodesAndAabbIndices,\n" -"								\n" -"								__global b3RayInfo* rays,\n" -"								\n" -"								__global int* out_numRayRigidPairs, \n" -"								__global int2* out_rayRigidPairs,\n" -"								int maxRayRigidPairs, int numRays)\n" -"{\n" -"	int rayIndex = get_global_id(0);\n" -"	if(rayIndex >= numRays) return;\n" -"	\n" -"	//\n" -"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n" -"	b3Vector3 rayTo = rays[rayIndex].m_to;\n" -"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" -"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" -"	\n" -"	//\n" -"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" -"	\n" -"	int stackSize = 1;\n" -"	stack[0] = *rootNodeIndex;\n" -"	\n" -"	while(stackSize)\n" -"	{\n" -"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" -"		--stackSize;\n" -"		\n" -"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n" -"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" -"		\n" -"		//bvhRigidIndex is not used if internal node\n" -"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" -"	\n" -"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" -"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )\n" -"		{\n" -"			if(isLeaf)\n" -"			{\n" -"				int2 rayRigidPair;\n" -"				rayRigidPair.x = rayIndex;\n" -"				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" -"				\n" -"				int pairIndex = atomic_inc(out_numRayRigidPairs);\n" -"				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" -"			}\n" -"			\n" -"			if(!isLeaf)	//Internal node\n" -"			{\n" -"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" -"				{\n" -"					//Error\n" -"				}\n" -"				else\n" -"				{\n" -"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" -"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" -"				}\n" -"			}\n" -"		}\n" -"	}\n" -"}\n" -"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" -"									__global int* out_numPairs, __global int4* out_overlappingPairs, \n" -"									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" -"{\n" -"	int smallAabbIndex = get_global_id(0);\n" -"	if(smallAabbIndex >= numSmallAabbRigids) return;\n" -"	\n" -"	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" -"	for(int i = 0; i < numLargeAabbRigids; ++i)\n" -"	{\n" -"		b3AabbCL largeAabb = largeAabbs[i];\n" -"		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" -"		{\n" -"			int4 pair;\n" -"			pair.x = largeAabb.m_minIndices[3];\n" -"			pair.y = smallAabb.m_minIndices[3];\n" -"			pair.z = NEW_PAIR_MARKER;\n" -"			pair.w = NEW_PAIR_MARKER;\n" -"			\n" -"			int pairIndex = atomic_inc(out_numPairs);\n" -"			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" -"		}\n" -"	}\n" -"}\n" -"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" -"									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,\n" -"									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" -"{\n" -"	int rayIndex = get_global_id(0);\n" -"	if(rayIndex >= numRays) return;\n" -"	\n" -"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n" -"	b3Vector3 rayTo = rays[rayIndex].m_to;\n" -"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" -"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" -"	\n" -"	for(int i = 0; i < numLargeAabbRigids; ++i)\n" -"	{\n" -"		b3AabbCL rigidAabb = largeRigidAabbs[i];\n" -"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" -"		{\n" -"			int2 rayRigidPair;\n" -"			rayRigidPair.x = rayIndex;\n" -"			rayRigidPair.y = rigidAabb.m_minIndices[3];\n" -"			\n" -"			int pairIndex = atomic_inc(out_numRayRigidPairs);\n" -"			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" -"		}\n" -"	}\n" -"}\n" -"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" -"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" -"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" -"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" -"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" -"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" -"#define b3Int64 long\n" -"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" -"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" -"{\n" -"	//This function only needs to return (i & j) in order for the algorithm to work,\n" -"	//but it may help with debugging to mask out the lower bits.\n" -"	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" -"	b3Int64 sharedBits = i & j;\n" -"	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0\n" -"	\n" -"	return sharedBits & bitmask;\n" -"}\n" -"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" -"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" -"{\n" -"	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" -"}\n" -"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" -"											__global b3Int64* out_commonPrefixes,\n" -"											__global int* out_commonPrefixLengths,\n" -"											int numInternalNodes)\n" -"{\n" -"	int internalNodeIndex = get_global_id(0);\n" -"	if (internalNodeIndex >= numInternalNodes) return;\n" -"	\n" -"	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" -"	//and the number of internal nodes is always numLeafNodes - 1\n" -"	int leftLeafIndex = internalNodeIndex;\n" -"	int rightLeafIndex = internalNodeIndex + 1;\n" -"	\n" -"	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" -"	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" -"	\n" -"	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" -"	//Append the index of each leaf node to each morton code so that there are no duplicates.\n" -"	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" -"	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" -"	//\n" -"	//upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" -"	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" -"	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" -"	\n" -"	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" -"	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" -"}\n" -"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" -"											__global int2* out_childNodes, int numLeafNodes)\n" -"{\n" -"	int leafNodeIndex = get_global_id(0);\n" -"	if (leafNodeIndex >= numLeafNodes) return;\n" -"	\n" -"	int numInternalNodes = numLeafNodes - 1;\n" -"	\n" -"	int leftSplitIndex = leafNodeIndex - 1;\n" -"	int rightSplitIndex = leafNodeIndex;\n" -"	\n" -"	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -"	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -"	\n" -"	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" -"	//Leaf nodes are considered as having the highest common prefix\n" -"	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" -"	\n" -"	//Handle cases for the edge nodes; the first and last node\n" -"	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" -"	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" -"	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" -"	\n" -"	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" -"	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" -"	\n" -"	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa\n" -"	\n" -"	//out_childNodesAsInt[0] == int2.x == left child\n" -"	//out_childNodesAsInt[1] == int2.y == right child\n" -"	int isLeaf = 1;\n" -"	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" -"	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" -"}\n" -"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" -"												__global int2* out_childNodes,\n" -"												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" -"												int numInternalNodes)\n" -"{\n" -"	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" -"	if(internalNodeIndex >= numInternalNodes) return;\n" -"	\n" -"	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" -"	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" -"	\n" -"//#define USE_LINEAR_SEARCH\n" -"#ifdef USE_LINEAR_SEARCH\n" -"	int leftIndex = -1;\n" -"	int rightIndex = -1;\n" -"	\n" -"	//Find nearest element to left with a lower common prefix\n" -"	for(int i = internalNodeIndex - 1; i >= 0; --i)\n" -"	{\n" -"		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" -"		if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" -"		{\n" -"			leftIndex = i;\n" -"			break;\n" -"		}\n" -"	}\n" -"	\n" -"	//Find nearest element to right with a lower common prefix\n" -"	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" -"	{\n" -"		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" -"		if(nodeRightSharedPrefixLength < nodePrefixLength)\n" -"		{\n" -"			rightIndex = i;\n" -"			break;\n" -"		}\n" -"	}\n" -"	\n" -"#else //Use binary search\n" -"	//Find nearest element to left with a lower common prefix\n" -"	int leftIndex = -1;\n" -"	{\n" -"		int lower = 0;\n" -"		int upper = internalNodeIndex - 1;\n" -"		\n" -"		while(lower <= upper)\n" -"		{\n" -"			int mid = (lower + upper) / 2;\n" -"			b3Int64 midPrefix = commonPrefixes[mid];\n" -"			int midPrefixLength = commonPrefixLengths[mid];\n" -"			\n" -"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" -"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n" -"			{\n" -"				int right = mid + 1;\n" -"				if(right < internalNodeIndex)\n" -"				{\n" -"					b3Int64 rightPrefix = commonPrefixes[right];\n" -"					int rightPrefixLength = commonPrefixLengths[right];\n" -"					\n" -"					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" -"					if(nodeRightSharedPrefixLength < nodePrefixLength) \n" -"					{\n" -"						lower = right;\n" -"						leftIndex = right;\n" -"					}\n" -"					else \n" -"					{\n" -"						leftIndex = mid;\n" -"						break;\n" -"					}\n" -"				}\n" -"				else \n" -"				{\n" -"					leftIndex = mid;\n" -"					break;\n" -"				}\n" -"			}\n" -"			else upper = mid - 1;\n" -"		}\n" -"	}\n" -"	\n" -"	//Find nearest element to right with a lower common prefix\n" -"	int rightIndex = -1;\n" -"	{\n" -"		int lower = internalNodeIndex + 1;\n" -"		int upper = numInternalNodes - 1;\n" -"		\n" -"		while(lower <= upper)\n" -"		{\n" -"			int mid = (lower + upper) / 2;\n" -"			b3Int64 midPrefix = commonPrefixes[mid];\n" -"			int midPrefixLength = commonPrefixLengths[mid];\n" -"			\n" -"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" -"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n" -"			{\n" -"				int left = mid - 1;\n" -"				if(left > internalNodeIndex)\n" -"				{\n" -"					b3Int64 leftPrefix = commonPrefixes[left];\n" -"					int leftPrefixLength = commonPrefixLengths[left];\n" -"				\n" -"					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" -"					if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" -"					{\n" -"						upper = left;\n" -"						rightIndex = left;\n" -"					}\n" -"					else \n" -"					{\n" -"						rightIndex = mid;\n" -"						break;\n" -"					}\n" -"				}\n" -"				else \n" -"				{\n" -"					rightIndex = mid;\n" -"					break;\n" -"				}\n" -"			}\n" -"			else lower = mid + 1;\n" -"		}\n" -"	}\n" -"#endif\n" -"	\n" -"	//Select parent\n" -"	{\n" -"		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -"		int rightPrefixLength =  (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -"		\n" -"		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" -"		\n" -"		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" -"		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" -"		\n" -"		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" -"		\n" -"		int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" -"		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" -"		\n" -"		int isLeaf = 0;\n" -"		if(!isRootNode)\n" -"		{\n" -"			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa\n" -"			\n" -"			//out_childNodesAsInt[0] == int2.x == left child\n" -"			//out_childNodesAsInt[1] == int2.y == right child\n" -"			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" -"			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -"		}\n" -"		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -"	}\n" -"}\n" -"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" -"									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" -"{\n" -"	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" -"	int internalNodeIndex = get_global_id(0);\n" -"	if(internalNodeIndex >= numInternalNodes) return;\n" -"	\n" -"	//\n" -"	int distanceFromRoot = 0;\n" -"	{\n" -"		int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" -"		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" -"		{\n" -"			parentIndex = internalNodeParentNodes[parentIndex];\n" -"			++distanceFromRoot;\n" -"		}\n" -"	}\n" -"	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" -"	\n" -"	//\n" -"	__local int localMaxDistanceFromRoot;\n" -"	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" -"	barrier(CLK_LOCAL_MEM_FENCE);\n" -"	\n" -"	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" -"	barrier(CLK_LOCAL_MEM_FENCE);\n" -"	\n" -"	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" -"}\n" -"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" -"												__global int2* childNodes,\n" -"												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" -"												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" -"{\n" -"	int internalNodeIndex = get_global_id(0);\n" -"	if(internalNodeIndex >= numInternalNodes) return;\n" -"	\n" -"	int distance = distanceFromRoot[internalNodeIndex];\n" -"	\n" -"	if(distance == processedDistance)\n" -"	{\n" -"		int leftChildIndex = childNodes[internalNodeIndex].x;\n" -"		int rightChildIndex = childNodes[internalNodeIndex].y;\n" -"		\n" -"		int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" -"		int isRightChildLeaf = isLeafNode(rightChildIndex);\n" -"		\n" -"		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" -"		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" -"		\n" -"		//leftRigidIndex/rightRigidIndex is not used if internal node\n" -"		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" -"		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" -"		\n" -"		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" -"		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" -"		\n" -"		b3AabbCL mergedAabb;\n" -"		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" -"		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" -"		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" -"	}\n" -"}\n" -"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" -"{\n" -"	int internalNodeIndex = get_global_id(0);\n" -"	if(internalNodeIndex >= numInternalNodes) return;\n" -"	\n" -"	int numLeafNodes = numInternalNodes + 1;\n" -"	\n" -"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" -"	\n" -"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n" -"	\n" -"	//Find lowest leaf index covered by this internal node\n" -"	{\n" -"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n" -"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" -"		leafIndexRange.x = lowestIndex;\n" -"	}\n" -"	\n" -"	//Find highest leaf index covered by this internal node\n" -"	{\n" -"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n" -"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" -"		leafIndexRange.y = highestIndex;\n" -"	}\n" -"	\n" -"	//\n" -"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" -"}\n" -; +static const char* parallelLinearBvhCL = +	"/*\n" +	"This software is provided 'as-is', without any express or implied warranty.\n" +	"In no event will the authors be held liable for any damages arising from the use of this software.\n" +	"Permission is granted to anyone to use this software for any purpose,\n" +	"including commercial applications, and to alter it and redistribute it freely,\n" +	"subject to the following restrictions:\n" +	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +	"3. This notice may not be removed or altered from any source distribution.\n" +	"*/\n" +	"//Initial Author Jackson Lee, 2014\n" +	"typedef float b3Scalar;\n" +	"typedef float4 b3Vector3;\n" +	"#define b3Max max\n" +	"#define b3Min min\n" +	"#define b3Sqrt sqrt\n" +	"typedef struct\n" +	"{\n" +	"	unsigned int m_key;\n" +	"	unsigned int m_value;\n" +	"} SortDataCL;\n" +	"typedef struct \n" +	"{\n" +	"	union\n" +	"	{\n" +	"		float4	m_min;\n" +	"		float   m_minElems[4];\n" +	"		int			m_minIndices[4];\n" +	"	};\n" +	"	union\n" +	"	{\n" +	"		float4	m_max;\n" +	"		float   m_maxElems[4];\n" +	"		int			m_maxIndices[4];\n" +	"	};\n" +	"} b3AabbCL;\n" +	"unsigned int interleaveBits(unsigned int x)\n" +	"{\n" +	"	//........ ........ ......12 3456789A	//x\n" +	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x after interleaving bits\n" +	"	\n" +	"	//......12 3456789A ......12 3456789A	//x ^ (x << 16)\n" +	"	//11111111 ........ ........ 11111111	//0x FF 00 00 FF\n" +	"	//......12 ........ ........ 3456789A	//x = (x ^ (x << 16)) & 0xFF0000FF;\n" +	"	\n" +	"	//......12 ........ 3456789A 3456789A	//x ^ (x <<  8)\n" +	"	//......11 ........ 1111.... ....1111	//0x 03 00 F0 0F\n" +	"	//......12 ........ 3456.... ....789A	//x = (x ^ (x <<  8)) & 0x0300F00F;\n" +	"	\n" +	"	//..12..12 ....3456 3456.... 789A789A	//x ^ (x <<  4)\n" +	"	//......11 ....11.. ..11.... 11....11	//0x 03 0C 30 C3\n" +	"	//......12 ....34.. ..56.... 78....9A	//x = (x ^ (x <<  4)) & 0x030C30C3;\n" +	"	\n" +	"	//....1212 ..3434.. 5656..78 78..9A9A	//x ^ (x <<  2)\n" +	"	//....1..1 ..1..1.. 1..1..1. .1..1..1	//0x 09 24 92 49\n" +	"	//....1..2 ..3..4.. 5..6..7. .8..9..A	//x = (x ^ (x <<  2)) & 0x09249249;\n" +	"	\n" +	"	//........ ........ ......11 11111111	//0x000003FF\n" +	"	x &= 0x000003FF;		//Clear all bits above bit 10\n" +	"	\n" +	"	x = (x ^ (x << 16)) & 0xFF0000FF;\n" +	"	x = (x ^ (x <<  8)) & 0x0300F00F;\n" +	"	x = (x ^ (x <<  4)) & 0x030C30C3;\n" +	"	x = (x ^ (x <<  2)) & 0x09249249;\n" +	"	\n" +	"	return x;\n" +	"}\n" +	"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" +	"{\n" +	"	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" +	"}\n" +	"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" +	"{\n" +	"	int separatedAabbIndex = get_global_id(0);\n" +	"	if(separatedAabbIndex >= numAabbsToSeparate) return;\n" +	"	int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" +	"	out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" +	"}\n" +	"//Should replace with an optimized parallel reduction\n" +	"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" +	"{\n" +	"	//Each time this kernel is added to the command queue, \n" +	"	//the number of AABBs needing to be merged is halved\n" +	"	//\n" +	"	//Example with 159 AABBs:\n" +	"	//	numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" +	"	//	numMergedAabbs == 159 - 80 == 79\n" +	"	//So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" +	"	\n" +	"	int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" +	"	int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" +	"	\n" +	"	int aabbIndex = get_global_id(0);\n" +	"	if(aabbIndex >= numMergedAabbs) return;\n" +	"	\n" +	"	int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" +	"	\n" +	"	b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" +	"	b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" +	"		\n" +	"	b3AabbCL mergedAabb;\n" +	"	mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" +	"	mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" +	"	out_mergedAabb[aabbIndex] = mergedAabb;\n" +	"}\n" +	"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" +	"												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" +	"{\n" +	"	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n" +	"	if(leafNodeIndex >= numAabbs) return;\n" +	"	\n" +	"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" +	"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" +	"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" +	"	\n" +	"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" +	"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" +	"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" +	"	\n" +	"	//Quantize into integer coordinates\n" +	"	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" +	"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" +	"	\n" +	"	int4 discretePosition;\n" +	"	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" +	"	discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" +	"	discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" +	"	\n" +	"	//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" +	"	discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" +	"	discretePosition += 512;\n" +	"	\n" +	"	//Interleave bits(assign a morton code, also known as a z-curve)\n" +	"	unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" +	"	\n" +	"	//\n" +	"	SortDataCL mortonCodeIndexPair;\n" +	"	mortonCodeIndexPair.m_key = mortonCode;\n" +	"	mortonCodeIndexPair.m_value = leafNodeIndex;\n" +	"	\n" +	"	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" +	"}\n" +	"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" +	"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" +	"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" +	"//In both cases, the bit should be cleared to access the actual node index.\n" +	"int isLeafNode(int index) { return (index >> 31 == 0); }\n" +	"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" +	"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" +	"//From sap.cl\n" +	"#define NEW_PAIR_MARKER -1\n" +	"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" +	"{\n" +	"	bool overlap = true;\n" +	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +	"	return overlap;\n" +	"}\n" +	"//From sap.cl\n" +	"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" +	"											__global int* rootNodeIndex, \n" +	"											__global int2* internalNodeChildIndices, \n" +	"											__global b3AabbCL* internalNodeAabbs,\n" +	"											__global int2* internalNodeLeafIndexRanges,\n" +	"											\n" +	"											__global SortDataCL* mortonCodesAndAabbIndices,\n" +	"											__global int* out_numPairs, __global int4* out_overlappingPairs, \n" +	"											int maxPairs, int numQueryAabbs)\n" +	"{\n" +	"	//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" +	"	//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" +	"	int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +	"	if(queryBvhNodeIndex >= numQueryAabbs) return;\n" +	"	\n" +	"	int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" +	"	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" +	"	\n" +	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +	"	\n" +	"	int stackSize = 1;\n" +	"	stack[0] = *rootNodeIndex;\n" +	"	\n" +	"	while(stackSize)\n" +	"	{\n" +	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +	"		--stackSize;\n" +	"		\n" +	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n" +	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +	"		\n" +	"		//Optimization - if the BVH is structured as a binary radix tree, then\n" +	"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" +	"		//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" +	"		{\n" +	"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" +	"			if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" +	"		}\n" +	"		\n" +	"		//bvhRigidIndex is not used if internal node\n" +	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +	"	\n" +	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +	"		if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" +	"		{\n" +	"			if(isLeaf)\n" +	"			{\n" +	"				int4 pair;\n" +	"				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" +	"				pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +	"				pair.z = NEW_PAIR_MARKER;\n" +	"				pair.w = NEW_PAIR_MARKER;\n" +	"				\n" +	"				int pairIndex = atomic_inc(out_numPairs);\n" +	"				if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +	"			}\n" +	"			\n" +	"			if(!isLeaf)	//Internal node\n" +	"			{\n" +	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +	"				{\n" +	"					//Error\n" +	"				}\n" +	"				else\n" +	"				{\n" +	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +	"				}\n" +	"			}\n" +	"		}\n" +	"		\n" +	"	}\n" +	"}\n" +	"//From rayCastKernels.cl\n" +	"typedef struct\n" +	"{\n" +	"	float4 m_from;\n" +	"	float4 m_to;\n" +	"} b3RayInfo;\n" +	"//From rayCastKernels.cl\n" +	"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" +	"{\n" +	"	b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" +	"	return normalize(normal);	//OpenCL normalize == vector4 normalize\n" +	"}\n" +	"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" +	"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" +	"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" +	"{\n" +	"	//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" +	"	//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" +	"	//\n" +	"	//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" +	"	//and min.x will be the far plane; otherwise, it is reversed.\n" +	"	//\n" +	"	//In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" +	"	//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" +	"	\n" +	"	int4 isNegative = isless( rayNormalizedDirection, ((b3Vector3){0.0f, 0.0f, 0.0f, 0.0f}) );	//isless(x,y) returns (x < y)\n" +	"	\n" +	"	//When using vector types, the select() function checks the most signficant bit, \n" +	"	//but isless() sets the least significant bit.\n" +	"	isNegative <<= 31;\n" +	"	//select(b, a, condition) == condition ? a : b\n" +	"	//When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" +	"	b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +	"	b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +	"	\n" +	"	b3Scalar t_min_final = 0.0f;\n" +	"	b3Scalar t_max_final = rayLength;\n" +	"	\n" +	"	//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" +	"	//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" +	"	//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" +	"	t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" +	"	t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" +	"	\n" +	"	return (t_min_final <= t_max_final);\n" +	"}\n" +	"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" +	"								__global int* rootNodeIndex, \n" +	"								__global int2* internalNodeChildIndices, \n" +	"								__global b3AabbCL* internalNodeAabbs,\n" +	"								__global int2* internalNodeLeafIndexRanges,\n" +	"								__global SortDataCL* mortonCodesAndAabbIndices,\n" +	"								\n" +	"								__global b3RayInfo* rays,\n" +	"								\n" +	"								__global int* out_numRayRigidPairs, \n" +	"								__global int2* out_rayRigidPairs,\n" +	"								int maxRayRigidPairs, int numRays)\n" +	"{\n" +	"	int rayIndex = get_global_id(0);\n" +	"	if(rayIndex >= numRays) return;\n" +	"	\n" +	"	//\n" +	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n" +	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +	"	\n" +	"	//\n" +	"	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +	"	\n" +	"	int stackSize = 1;\n" +	"	stack[0] = *rootNodeIndex;\n" +	"	\n" +	"	while(stackSize)\n" +	"	{\n" +	"		int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +	"		--stackSize;\n" +	"		\n" +	"		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n" +	"		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +	"		\n" +	"		//bvhRigidIndex is not used if internal node\n" +	"		int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +	"	\n" +	"		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb)  )\n" +	"		{\n" +	"			if(isLeaf)\n" +	"			{\n" +	"				int2 rayRigidPair;\n" +	"				rayRigidPair.x = rayIndex;\n" +	"				rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +	"				\n" +	"				int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +	"				if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +	"			}\n" +	"			\n" +	"			if(!isLeaf)	//Internal node\n" +	"			{\n" +	"				if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +	"				{\n" +	"					//Error\n" +	"				}\n" +	"				else\n" +	"				{\n" +	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +	"					stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +	"				}\n" +	"			}\n" +	"		}\n" +	"	}\n" +	"}\n" +	"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" +	"									__global int* out_numPairs, __global int4* out_overlappingPairs, \n" +	"									int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" +	"{\n" +	"	int smallAabbIndex = get_global_id(0);\n" +	"	if(smallAabbIndex >= numSmallAabbRigids) return;\n" +	"	\n" +	"	b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" +	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n" +	"	{\n" +	"		b3AabbCL largeAabb = largeAabbs[i];\n" +	"		if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" +	"		{\n" +	"			int4 pair;\n" +	"			pair.x = largeAabb.m_minIndices[3];\n" +	"			pair.y = smallAabb.m_minIndices[3];\n" +	"			pair.z = NEW_PAIR_MARKER;\n" +	"			pair.w = NEW_PAIR_MARKER;\n" +	"			\n" +	"			int pairIndex = atomic_inc(out_numPairs);\n" +	"			if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +	"		}\n" +	"	}\n" +	"}\n" +	"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" +	"									__global int* out_numRayRigidPairs,  __global int2* out_rayRigidPairs,\n" +	"									int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" +	"{\n" +	"	int rayIndex = get_global_id(0);\n" +	"	if(rayIndex >= numRays) return;\n" +	"	\n" +	"	b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +	"	b3Vector3 rayTo = rays[rayIndex].m_to;\n" +	"	b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +	"	b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +	"	\n" +	"	for(int i = 0; i < numLargeAabbRigids; ++i)\n" +	"	{\n" +	"		b3AabbCL rigidAabb = largeRigidAabbs[i];\n" +	"		if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" +	"		{\n" +	"			int2 rayRigidPair;\n" +	"			rayRigidPair.x = rayIndex;\n" +	"			rayRigidPair.y = rigidAabb.m_minIndices[3];\n" +	"			\n" +	"			int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +	"			if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +	"		}\n" +	"	}\n" +	"}\n" +	"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" +	"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" +	"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" +	"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" +	"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" +	"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" +	"#define b3Int64 long\n" +	"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" +	"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" +	"{\n" +	"	//This function only needs to return (i & j) in order for the algorithm to work,\n" +	"	//but it may help with debugging to mask out the lower bits.\n" +	"	b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" +	"	b3Int64 sharedBits = i & j;\n" +	"	b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength);	//Set all bits after the common prefix to 0\n" +	"	\n" +	"	return sharedBits & bitmask;\n" +	"}\n" +	"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" +	"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" +	"{\n" +	"	return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" +	"}\n" +	"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" +	"											__global b3Int64* out_commonPrefixes,\n" +	"											__global int* out_commonPrefixLengths,\n" +	"											int numInternalNodes)\n" +	"{\n" +	"	int internalNodeIndex = get_global_id(0);\n" +	"	if (internalNodeIndex >= numInternalNodes) return;\n" +	"	\n" +	"	//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" +	"	//and the number of internal nodes is always numLeafNodes - 1\n" +	"	int leftLeafIndex = internalNodeIndex;\n" +	"	int rightLeafIndex = internalNodeIndex + 1;\n" +	"	\n" +	"	int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" +	"	int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" +	"	\n" +	"	//Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" +	"	//Append the index of each leaf node to each morton code so that there are no duplicates.\n" +	"	//The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" +	"	//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" +	"	//\n" +	"	//upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" +	"	b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" +	"	b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" +	"	\n" +	"	out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +	"	out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +	"}\n" +	"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" +	"											__global int2* out_childNodes, int numLeafNodes)\n" +	"{\n" +	"	int leafNodeIndex = get_global_id(0);\n" +	"	if (leafNodeIndex >= numLeafNodes) return;\n" +	"	\n" +	"	int numInternalNodes = numLeafNodes - 1;\n" +	"	\n" +	"	int leftSplitIndex = leafNodeIndex - 1;\n" +	"	int rightSplitIndex = leafNodeIndex;\n" +	"	\n" +	"	int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +	"	int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +	"	\n" +	"	//Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" +	"	//Leaf nodes are considered as having the highest common prefix\n" +	"	int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" +	"	\n" +	"	//Handle cases for the edge nodes; the first and last node\n" +	"	//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" +	"	if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" +	"	if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" +	"	\n" +	"	int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" +	"	out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" +	"	\n" +	"	int isRightChild = (isLeftHigherCommonPrefix);	//If the left node is the parent, then this node is its right child and vice versa\n" +	"	\n" +	"	//out_childNodesAsInt[0] == int2.x == left child\n" +	"	//out_childNodesAsInt[1] == int2.y == right child\n" +	"	int isLeaf = 1;\n" +	"	__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +	"	out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" +	"}\n" +	"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" +	"												__global int2* out_childNodes,\n" +	"												__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" +	"												int numInternalNodes)\n" +	"{\n" +	"	int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +	"	if(internalNodeIndex >= numInternalNodes) return;\n" +	"	\n" +	"	b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" +	"	int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" +	"	\n" +	"//#define USE_LINEAR_SEARCH\n" +	"#ifdef USE_LINEAR_SEARCH\n" +	"	int leftIndex = -1;\n" +	"	int rightIndex = -1;\n" +	"	\n" +	"	//Find nearest element to left with a lower common prefix\n" +	"	for(int i = internalNodeIndex - 1; i >= 0; --i)\n" +	"	{\n" +	"		int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +	"		if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" +	"		{\n" +	"			leftIndex = i;\n" +	"			break;\n" +	"		}\n" +	"	}\n" +	"	\n" +	"	//Find nearest element to right with a lower common prefix\n" +	"	for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" +	"	{\n" +	"		int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +	"		if(nodeRightSharedPrefixLength < nodePrefixLength)\n" +	"		{\n" +	"			rightIndex = i;\n" +	"			break;\n" +	"		}\n" +	"	}\n" +	"	\n" +	"#else //Use binary search\n" +	"	//Find nearest element to left with a lower common prefix\n" +	"	int leftIndex = -1;\n" +	"	{\n" +	"		int lower = 0;\n" +	"		int upper = internalNodeIndex - 1;\n" +	"		\n" +	"		while(lower <= upper)\n" +	"		{\n" +	"			int mid = (lower + upper) / 2;\n" +	"			b3Int64 midPrefix = commonPrefixes[mid];\n" +	"			int midPrefixLength = commonPrefixLengths[mid];\n" +	"			\n" +	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +	"			{\n" +	"				int right = mid + 1;\n" +	"				if(right < internalNodeIndex)\n" +	"				{\n" +	"					b3Int64 rightPrefix = commonPrefixes[right];\n" +	"					int rightPrefixLength = commonPrefixLengths[right];\n" +	"					\n" +	"					int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" +	"					if(nodeRightSharedPrefixLength < nodePrefixLength) \n" +	"					{\n" +	"						lower = right;\n" +	"						leftIndex = right;\n" +	"					}\n" +	"					else \n" +	"					{\n" +	"						leftIndex = mid;\n" +	"						break;\n" +	"					}\n" +	"				}\n" +	"				else \n" +	"				{\n" +	"					leftIndex = mid;\n" +	"					break;\n" +	"				}\n" +	"			}\n" +	"			else upper = mid - 1;\n" +	"		}\n" +	"	}\n" +	"	\n" +	"	//Find nearest element to right with a lower common prefix\n" +	"	int rightIndex = -1;\n" +	"	{\n" +	"		int lower = internalNodeIndex + 1;\n" +	"		int upper = numInternalNodes - 1;\n" +	"		\n" +	"		while(lower <= upper)\n" +	"		{\n" +	"			int mid = (lower + upper) / 2;\n" +	"			b3Int64 midPrefix = commonPrefixes[mid];\n" +	"			int midPrefixLength = commonPrefixLengths[mid];\n" +	"			\n" +	"			int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +	"			if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +	"			{\n" +	"				int left = mid - 1;\n" +	"				if(left > internalNodeIndex)\n" +	"				{\n" +	"					b3Int64 leftPrefix = commonPrefixes[left];\n" +	"					int leftPrefixLength = commonPrefixLengths[left];\n" +	"				\n" +	"					int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" +	"					if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" +	"					{\n" +	"						upper = left;\n" +	"						rightIndex = left;\n" +	"					}\n" +	"					else \n" +	"					{\n" +	"						rightIndex = mid;\n" +	"						break;\n" +	"					}\n" +	"				}\n" +	"				else \n" +	"				{\n" +	"					rightIndex = mid;\n" +	"					break;\n" +	"				}\n" +	"			}\n" +	"			else lower = mid + 1;\n" +	"		}\n" +	"	}\n" +	"#endif\n" +	"	\n" +	"	//Select parent\n" +	"	{\n" +	"		int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +	"		int rightPrefixLength =  (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +	"		\n" +	"		int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" +	"		\n" +	"		if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" +	"		else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" +	"		\n" +	"		int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" +	"		\n" +	"		int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" +	"		out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" +	"		\n" +	"		int isLeaf = 0;\n" +	"		if(!isRootNode)\n" +	"		{\n" +	"			int isRightChild = (isLeftHigherPrefixLength);	//If the left node is the parent, then this node is its right child and vice versa\n" +	"			\n" +	"			//out_childNodesAsInt[0] == int2.x == left child\n" +	"			//out_childNodesAsInt[1] == int2.y == right child\n" +	"			__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +	"			out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +	"		}\n" +	"		else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +	"	}\n" +	"}\n" +	"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" +	"									__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" +	"{\n" +	"	if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" +	"	int internalNodeIndex = get_global_id(0);\n" +	"	if(internalNodeIndex >= numInternalNodes) return;\n" +	"	\n" +	"	//\n" +	"	int distanceFromRoot = 0;\n" +	"	{\n" +	"		int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" +	"		while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" +	"		{\n" +	"			parentIndex = internalNodeParentNodes[parentIndex];\n" +	"			++distanceFromRoot;\n" +	"		}\n" +	"	}\n" +	"	out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" +	"	\n" +	"	//\n" +	"	__local int localMaxDistanceFromRoot;\n" +	"	if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" +	"	barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	\n" +	"	atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" +	"	barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	\n" +	"	if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" +	"}\n" +	"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" +	"												__global int2* childNodes,\n" +	"												__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" +	"												int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" +	"{\n" +	"	int internalNodeIndex = get_global_id(0);\n" +	"	if(internalNodeIndex >= numInternalNodes) return;\n" +	"	\n" +	"	int distance = distanceFromRoot[internalNodeIndex];\n" +	"	\n" +	"	if(distance == processedDistance)\n" +	"	{\n" +	"		int leftChildIndex = childNodes[internalNodeIndex].x;\n" +	"		int rightChildIndex = childNodes[internalNodeIndex].y;\n" +	"		\n" +	"		int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" +	"		int isRightChildLeaf = isLeafNode(rightChildIndex);\n" +	"		\n" +	"		leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" +	"		rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" +	"		\n" +	"		//leftRigidIndex/rightRigidIndex is not used if internal node\n" +	"		int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" +	"		int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" +	"		\n" +	"		b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" +	"		b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" +	"		\n" +	"		b3AabbCL mergedAabb;\n" +	"		mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" +	"		mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" +	"		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" +	"	}\n" +	"}\n" +	"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" +	"{\n" +	"	int internalNodeIndex = get_global_id(0);\n" +	"	if(internalNodeIndex >= numInternalNodes) return;\n" +	"	\n" +	"	int numLeafNodes = numInternalNodes + 1;\n" +	"	\n" +	"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" +	"	\n" +	"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n" +	"	\n" +	"	//Find lowest leaf index covered by this internal node\n" +	"	{\n" +	"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n" +	"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" +	"		leafIndexRange.x = lowestIndex;\n" +	"	}\n" +	"	\n" +	"	//Find highest leaf index covered by this internal node\n" +	"	{\n" +	"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n" +	"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" +	"		leafIndexRange.y = highestIndex;\n" +	"	}\n" +	"	\n" +	"	//\n" +	"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" +	"}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h index 04d40fcf26..d6999b94cb 100644 --- a/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h +++ b/thirdparty/bullet/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h @@ -1,342 +1,341 @@  //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* sapCL= \ -"/*\n" -"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n" -"This software is provided 'as-is', without any express or implied warranty.\n" -"In no event will the authors be held liable for any damages arising from the use of this software.\n" -"Permission is granted to anyone to use this software for any purpose, \n" -"including commercial applications, and to alter it and redistribute it freely, \n" -"subject to the following restrictions:\n" -"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" -"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" -"3. This notice may not be removed or altered from any source distribution.\n" -"*/\n" -"//Originally written by Erwin Coumans\n" -"#define NEW_PAIR_MARKER -1\n" -"typedef struct \n" -"{\n" -"	union\n" -"	{\n" -"		float4	m_min;\n" -"		float   m_minElems[4];\n" -"		int			m_minIndices[4];\n" -"	};\n" -"	union\n" -"	{\n" -"		float4	m_max;\n" -"		float   m_maxElems[4];\n" -"		int			m_maxIndices[4];\n" -"	};\n" -"} btAabbCL;\n" -"/// conservative test for overlap between two aabbs\n" -"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" -"{\n" -"	bool overlap = true;\n" -"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -"	return overlap;\n" -"}\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" -"{\n" -"	bool overlap = true;\n" -"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -"	return overlap;\n" -"}\n" -"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" -"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" -"{\n" -"	bool overlap = true;\n" -"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" -"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" -"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" -"	return overlap;\n" -"}\n" -"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	if (i>=numUnsortedAabbs)\n" -"		return;\n" -"	int j = get_global_id(1);\n" -"	if (j>=numUnSortedAabbs2)\n" -"		return;\n" -"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" -"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" -"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" -"	{\n" -"		int4 myPair;\n" -"		\n" -"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" -"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" -"		if (xIndex>yIndex)\n" -"		{\n" -"			int tmp = xIndex;\n" -"			xIndex=yIndex;\n" -"			yIndex=tmp;\n" -"		}\n" -"		\n" -"		myPair.x = xIndex;\n" -"		myPair.y = yIndex;\n" -"		myPair.z = NEW_PAIR_MARKER;\n" -"		myPair.w = NEW_PAIR_MARKER;\n" -"		int curPair = atomic_inc (pairCount);\n" -"		if (curPair<maxPairs)\n" -"		{\n" -"				pairsOut[curPair] = myPair; //flush to main memory\n" -"		}\n" -"	}\n" -"}\n" -"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	if (i>=numObjects)\n" -"		return;\n" -"	for (int j=i+1;j<numObjects;j++)\n" -"	{\n" -"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -"		{\n" -"			int4 myPair;\n" -"			myPair.x = aabbs[i].m_minIndices[3];\n" -"			myPair.y = aabbs[j].m_minIndices[3];\n" -"			myPair.z = NEW_PAIR_MARKER;\n" -"			myPair.w = NEW_PAIR_MARKER;\n" -"			int curPair = atomic_inc (pairCount);\n" -"			if (curPair<maxPairs)\n" -"			{\n" -"					pairsOut[curPair] = myPair; //flush to main memory\n" -"			}\n" -"		}\n" -"	}\n" -"}\n" -"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	if (i>=numObjects)\n" -"		return;\n" -"	for (int j=i+1;j<numObjects;j++)\n" -"	{\n" -"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" -"		{\n" -"			break;\n" -"		}\n" -"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -"		{\n" -"			int4 myPair;\n" -"			myPair.x = aabbs[i].m_minIndices[3];\n" -"			myPair.y = aabbs[j].m_minIndices[3];\n" -"			myPair.z = NEW_PAIR_MARKER;\n" -"			myPair.w = NEW_PAIR_MARKER;\n" -"			int curPair = atomic_inc (pairCount);\n" -"			if (curPair<maxPairs)\n" -"			{\n" -"					pairsOut[curPair] = myPair; //flush to main memory\n" -"			}\n" -"		}\n" -"	}\n" -"}\n" -"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	int localId = get_local_id(0);\n" -"	__local int numActiveWgItems[1];\n" -"	__local int breakRequest[1];\n" -"	if (localId==0)\n" -"	{\n" -"		numActiveWgItems[0] = 0;\n" -"		breakRequest[0] = 0;\n" -"	}\n" -"	barrier(CLK_LOCAL_MEM_FENCE);\n" -"	atomic_inc(numActiveWgItems);\n" -"	barrier(CLK_LOCAL_MEM_FENCE);\n" -"	int localBreak = 0;\n" -"	int j=i+1;\n" -"	do\n" -"	{\n" -"		barrier(CLK_LOCAL_MEM_FENCE);\n" -"	\n" -"		if (j<numObjects)\n" -"		{\n" -"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" -"			{\n" -"				if (!localBreak)\n" -"				{\n" -"					atomic_inc(breakRequest);\n" -"					localBreak = 1;\n" -"				}\n" -"			}\n" -"		}\n" -"		\n" -"		barrier(CLK_LOCAL_MEM_FENCE);\n" -"		\n" -"		if (j>=numObjects && !localBreak)\n" -"		{\n" -"			atomic_inc(breakRequest);\n" -"			localBreak = 1;\n" -"		}\n" -"		barrier(CLK_LOCAL_MEM_FENCE);\n" -"		\n" -"		if (!localBreak)\n" -"		{\n" -"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" -"			{\n" -"				int4 myPair;\n" -"				myPair.x = aabbs[i].m_minIndices[3];\n" -"				myPair.y = aabbs[j].m_minIndices[3];\n" -"				myPair.z = NEW_PAIR_MARKER;\n" -"				myPair.w = NEW_PAIR_MARKER;\n" -"				int curPair = atomic_inc (pairCount);\n" -"				if (curPair<maxPairs)\n" -"				{\n" -"						pairsOut[curPair] = myPair; //flush to main memory\n" -"				}\n" -"			}\n" -"		}\n" -"		j++;\n" -"	} while (breakRequest[0]<numActiveWgItems[0]);\n" -"}\n" -"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	int localId = get_local_id(0);\n" -"	__local int numActiveWgItems[1];\n" -"	__local int breakRequest[1];\n" -"	__local btAabbCL localAabbs[128];// = aabbs[i];\n" -"	\n" -"	btAabbCL myAabb;\n" -"	\n" -"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" -"	float testValue = 	myAabb.m_maxElems[axis];\n" -"	\n" -"	if (localId==0)\n" -"	{\n" -"		numActiveWgItems[0] = 0;\n" -"		breakRequest[0] = 0;\n" -"	}\n" -"	int localCount=0;\n" -"	int block=0;\n" -"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" -"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n" -"	\n" -"	barrier(CLK_LOCAL_MEM_FENCE);\n" -"	atomic_inc(numActiveWgItems);\n" -"	barrier(CLK_LOCAL_MEM_FENCE);\n" -"	int localBreak = 0;\n" -"	\n" -"	int j=i+1;\n" -"	do\n" -"	{\n" -"		barrier(CLK_LOCAL_MEM_FENCE);\n" -"	\n" -"		if (j<numObjects)\n" -"		{\n" -"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" -"			{\n" -"				if (!localBreak)\n" -"				{\n" -"					atomic_inc(breakRequest);\n" -"					localBreak = 1;\n" -"				}\n" -"			}\n" -"		}\n" -"		\n" -"		barrier(CLK_LOCAL_MEM_FENCE);\n" -"		\n" -"		if (j>=numObjects && !localBreak)\n" -"		{\n" -"			atomic_inc(breakRequest);\n" -"			localBreak = 1;\n" -"		}\n" -"		barrier(CLK_LOCAL_MEM_FENCE);\n" -"		\n" -"		if (!localBreak)\n" -"		{\n" -"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" -"			{\n" -"				int4 myPair;\n" -"				myPair.x = myAabb.m_minIndices[3];\n" -"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" -"				myPair.z = NEW_PAIR_MARKER;\n" -"				myPair.w = NEW_PAIR_MARKER;\n" -"				int curPair = atomic_inc (pairCount);\n" -"				if (curPair<maxPairs)\n" -"				{\n" -"						pairsOut[curPair] = myPair; //flush to main memory\n" -"				}\n" -"			}\n" -"		}\n" -"		\n" -"		barrier(CLK_LOCAL_MEM_FENCE);\n" -"		localCount++;\n" -"		if (localCount==64)\n" -"		{\n" -"			localCount = 0;\n" -"			block+=64;			\n" -"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" -"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n" -"		}\n" -"		j++;\n" -"		\n" -"	} while (breakRequest[0]<numActiveWgItems[0]);\n" -"	\n" -"}\n" -"//http://stereopsis.com/radix.html\n" -"unsigned int FloatFlip(float fl);\n" -"unsigned int FloatFlip(float fl)\n" -"{\n" -"	unsigned int f = *(unsigned int*)&fl;\n" -"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" -"	return f ^ mask;\n" -"}\n" -"float IFloatFlip(unsigned int f);\n" -"float IFloatFlip(unsigned int f)\n" -"{\n" -"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" -"	unsigned int fl = f ^ mask;\n" -"	return *(float*)&fl;\n" -"}\n" -"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	if (i>=numObjects)\n" -"		return;\n" -"	int src = destAabbs[i].m_maxIndices[3];\n" -"	destAabbs[i] = allAabbs[src];\n" -"	destAabbs[i].m_maxIndices[3] = src;\n" -"}\n" -"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	if (i>=numObjects)\n" -"		return;\n" -"	\n" -"	\n" -"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" -"	sortData[i].y = i;\n" -"		\n" -"}\n" -"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	if (i>=numObjects)\n" -"		return;\n" -"	\n" -"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" -"}\n" -"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" -"{\n" -"	int i = get_global_id(0);\n" -"	if (i>=numAabbs)\n" -"		return;\n" -"	\n" -"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" -"	\n" -"	float4 s;\n" -"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" -"	sum[i]=s;\n" -"	sum2[i]=s*s;	\n" -"}\n" -; +static const char* sapCL = +	"/*\n" +	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n" +	"This software is provided 'as-is', without any express or implied warranty.\n" +	"In no event will the authors be held liable for any damages arising from the use of this software.\n" +	"Permission is granted to anyone to use this software for any purpose, \n" +	"including commercial applications, and to alter it and redistribute it freely, \n" +	"subject to the following restrictions:\n" +	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +	"3. This notice may not be removed or altered from any source distribution.\n" +	"*/\n" +	"//Originally written by Erwin Coumans\n" +	"#define NEW_PAIR_MARKER -1\n" +	"typedef struct \n" +	"{\n" +	"	union\n" +	"	{\n" +	"		float4	m_min;\n" +	"		float   m_minElems[4];\n" +	"		int			m_minIndices[4];\n" +	"	};\n" +	"	union\n" +	"	{\n" +	"		float4	m_max;\n" +	"		float   m_maxElems[4];\n" +	"		int			m_maxIndices[4];\n" +	"	};\n" +	"} btAabbCL;\n" +	"/// conservative test for overlap between two aabbs\n" +	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n" +	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n" +	"{\n" +	"	bool overlap = true;\n" +	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +	"	return overlap;\n" +	"}\n" +	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +	"{\n" +	"	bool overlap = true;\n" +	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +	"	return overlap;\n" +	"}\n" +	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n" +	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n" +	"{\n" +	"	bool overlap = true;\n" +	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +	"	return overlap;\n" +	"}\n" +	"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	if (i>=numUnsortedAabbs)\n" +	"		return;\n" +	"	int j = get_global_id(1);\n" +	"	if (j>=numUnSortedAabbs2)\n" +	"		return;\n" +	"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" +	"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n" +	"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n" +	"	{\n" +	"		int4 myPair;\n" +	"		\n" +	"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" +	"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n" +	"		if (xIndex>yIndex)\n" +	"		{\n" +	"			int tmp = xIndex;\n" +	"			xIndex=yIndex;\n" +	"			yIndex=tmp;\n" +	"		}\n" +	"		\n" +	"		myPair.x = xIndex;\n" +	"		myPair.y = yIndex;\n" +	"		myPair.z = NEW_PAIR_MARKER;\n" +	"		myPair.w = NEW_PAIR_MARKER;\n" +	"		int curPair = atomic_inc (pairCount);\n" +	"		if (curPair<maxPairs)\n" +	"		{\n" +	"				pairsOut[curPair] = myPair; //flush to main memory\n" +	"		}\n" +	"	}\n" +	"}\n" +	"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	if (i>=numObjects)\n" +	"		return;\n" +	"	for (int j=i+1;j<numObjects;j++)\n" +	"	{\n" +	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +	"		{\n" +	"			int4 myPair;\n" +	"			myPair.x = aabbs[i].m_minIndices[3];\n" +	"			myPair.y = aabbs[j].m_minIndices[3];\n" +	"			myPair.z = NEW_PAIR_MARKER;\n" +	"			myPair.w = NEW_PAIR_MARKER;\n" +	"			int curPair = atomic_inc (pairCount);\n" +	"			if (curPair<maxPairs)\n" +	"			{\n" +	"					pairsOut[curPair] = myPair; //flush to main memory\n" +	"			}\n" +	"		}\n" +	"	}\n" +	"}\n" +	"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	if (i>=numObjects)\n" +	"		return;\n" +	"	for (int j=i+1;j<numObjects;j++)\n" +	"	{\n" +	"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" +	"		{\n" +	"			break;\n" +	"		}\n" +	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +	"		{\n" +	"			int4 myPair;\n" +	"			myPair.x = aabbs[i].m_minIndices[3];\n" +	"			myPair.y = aabbs[j].m_minIndices[3];\n" +	"			myPair.z = NEW_PAIR_MARKER;\n" +	"			myPair.w = NEW_PAIR_MARKER;\n" +	"			int curPair = atomic_inc (pairCount);\n" +	"			if (curPair<maxPairs)\n" +	"			{\n" +	"					pairsOut[curPair] = myPair; //flush to main memory\n" +	"			}\n" +	"		}\n" +	"	}\n" +	"}\n" +	"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	int localId = get_local_id(0);\n" +	"	__local int numActiveWgItems[1];\n" +	"	__local int breakRequest[1];\n" +	"	if (localId==0)\n" +	"	{\n" +	"		numActiveWgItems[0] = 0;\n" +	"		breakRequest[0] = 0;\n" +	"	}\n" +	"	barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	atomic_inc(numActiveWgItems);\n" +	"	barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	int localBreak = 0;\n" +	"	int j=i+1;\n" +	"	do\n" +	"	{\n" +	"		barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	\n" +	"		if (j<numObjects)\n" +	"		{\n" +	"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n" +	"			{\n" +	"				if (!localBreak)\n" +	"				{\n" +	"					atomic_inc(breakRequest);\n" +	"					localBreak = 1;\n" +	"				}\n" +	"			}\n" +	"		}\n" +	"		\n" +	"		barrier(CLK_LOCAL_MEM_FENCE);\n" +	"		\n" +	"		if (j>=numObjects && !localBreak)\n" +	"		{\n" +	"			atomic_inc(breakRequest);\n" +	"			localBreak = 1;\n" +	"		}\n" +	"		barrier(CLK_LOCAL_MEM_FENCE);\n" +	"		\n" +	"		if (!localBreak)\n" +	"		{\n" +	"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n" +	"			{\n" +	"				int4 myPair;\n" +	"				myPair.x = aabbs[i].m_minIndices[3];\n" +	"				myPair.y = aabbs[j].m_minIndices[3];\n" +	"				myPair.z = NEW_PAIR_MARKER;\n" +	"				myPair.w = NEW_PAIR_MARKER;\n" +	"				int curPair = atomic_inc (pairCount);\n" +	"				if (curPair<maxPairs)\n" +	"				{\n" +	"						pairsOut[curPair] = myPair; //flush to main memory\n" +	"				}\n" +	"			}\n" +	"		}\n" +	"		j++;\n" +	"	} while (breakRequest[0]<numActiveWgItems[0]);\n" +	"}\n" +	"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	int localId = get_local_id(0);\n" +	"	__local int numActiveWgItems[1];\n" +	"	__local int breakRequest[1];\n" +	"	__local btAabbCL localAabbs[128];// = aabbs[i];\n" +	"	\n" +	"	btAabbCL myAabb;\n" +	"	\n" +	"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n" +	"	float testValue = 	myAabb.m_maxElems[axis];\n" +	"	\n" +	"	if (localId==0)\n" +	"	{\n" +	"		numActiveWgItems[0] = 0;\n" +	"		breakRequest[0] = 0;\n" +	"	}\n" +	"	int localCount=0;\n" +	"	int block=0;\n" +	"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n" +	"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n" +	"	\n" +	"	barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	atomic_inc(numActiveWgItems);\n" +	"	barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	int localBreak = 0;\n" +	"	\n" +	"	int j=i+1;\n" +	"	do\n" +	"	{\n" +	"		barrier(CLK_LOCAL_MEM_FENCE);\n" +	"	\n" +	"		if (j<numObjects)\n" +	"		{\n" +	"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n" +	"			{\n" +	"				if (!localBreak)\n" +	"				{\n" +	"					atomic_inc(breakRequest);\n" +	"					localBreak = 1;\n" +	"				}\n" +	"			}\n" +	"		}\n" +	"		\n" +	"		barrier(CLK_LOCAL_MEM_FENCE);\n" +	"		\n" +	"		if (j>=numObjects && !localBreak)\n" +	"		{\n" +	"			atomic_inc(breakRequest);\n" +	"			localBreak = 1;\n" +	"		}\n" +	"		barrier(CLK_LOCAL_MEM_FENCE);\n" +	"		\n" +	"		if (!localBreak)\n" +	"		{\n" +	"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n" +	"			{\n" +	"				int4 myPair;\n" +	"				myPair.x = myAabb.m_minIndices[3];\n" +	"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n" +	"				myPair.z = NEW_PAIR_MARKER;\n" +	"				myPair.w = NEW_PAIR_MARKER;\n" +	"				int curPair = atomic_inc (pairCount);\n" +	"				if (curPair<maxPairs)\n" +	"				{\n" +	"						pairsOut[curPair] = myPair; //flush to main memory\n" +	"				}\n" +	"			}\n" +	"		}\n" +	"		\n" +	"		barrier(CLK_LOCAL_MEM_FENCE);\n" +	"		localCount++;\n" +	"		if (localCount==64)\n" +	"		{\n" +	"			localCount = 0;\n" +	"			block+=64;			\n" +	"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n" +	"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n" +	"		}\n" +	"		j++;\n" +	"		\n" +	"	} while (breakRequest[0]<numActiveWgItems[0]);\n" +	"	\n" +	"}\n" +	"//http://stereopsis.com/radix.html\n" +	"unsigned int FloatFlip(float fl);\n" +	"unsigned int FloatFlip(float fl)\n" +	"{\n" +	"	unsigned int f = *(unsigned int*)&fl;\n" +	"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n" +	"	return f ^ mask;\n" +	"}\n" +	"float IFloatFlip(unsigned int f);\n" +	"float IFloatFlip(unsigned int f)\n" +	"{\n" +	"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n" +	"	unsigned int fl = f ^ mask;\n" +	"	return *(float*)&fl;\n" +	"}\n" +	"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	if (i>=numObjects)\n" +	"		return;\n" +	"	int src = destAabbs[i].m_maxIndices[3];\n" +	"	destAabbs[i] = allAabbs[src];\n" +	"	destAabbs[i].m_maxIndices[3] = src;\n" +	"}\n" +	"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	if (i>=numObjects)\n" +	"		return;\n" +	"	\n" +	"	\n" +	"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" +	"	sortData[i].y = i;\n" +	"		\n" +	"}\n" +	"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	if (i>=numObjects)\n" +	"		return;\n" +	"	\n" +	"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" +	"}\n" +	"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" +	"{\n" +	"	int i = get_global_id(0);\n" +	"	if (i>=numAabbs)\n" +	"		return;\n" +	"	\n" +	"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" +	"	\n" +	"	float4 s;\n" +	"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" +	"	sum[i]=s;\n" +	"	sum2[i]=s*s;	\n" +	"}\n";  |