author | Oussama <o.boukhelf@gmail.com> | 2019-01-03 14:26:51 +0100
---|---|---
committer | Rémi Verschelde <rverschelde@gmail.com> | 2019-01-07 12:30:35 +0100
commit | 22b7c9dfa80d0f7abca40f061865c2ab3c136a74 (patch) |
tree | 311cd3f22b012329160f9d43810aea429994af48 /thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp |
parent | a6722cf36251ddcb538e6ebed9fa4950342b68ba (diff) |
Update Bullet to the latest commit 126b676
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp')
-rw-r--r-- | thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp | 730
1 file changed, 333 insertions, 397 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp index f11ae4bcdb..e86af6583f 100644 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp @@ -10,21 +10,20 @@ #include "kernels/RadixSort32KernelsCL.h" b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) -:m_commandQueue(queue) + : m_commandQueue(queue) { b3OpenCLDeviceInfo info; - b3OpenCLUtils::getDeviceInfo(device,&info); - m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; + b3OpenCLUtils::getDeviceInfo(device, &info); + m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0; - m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue); - m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue); - m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue); - m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue); - m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue); - m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue); + m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue); + m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue); + m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue); + m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue); + m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue); - - if (initialCapacity>0) + if (initialCapacity > 0) { m_workBuffer1->resize(initialCapacity); m_workBuffer3->resize(initialCapacity); @@ -33,45 +32,40 @@ b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command m_workBuffer4a->resize(initialCapacity); } - m_scan = new b3PrefixScanCL(ctx,device,queue); - m_fill = new b3FillCL(ctx,device,queue); - + m_scan = new b3PrefixScanCL(ctx, device, queue); + m_fill = new b3FillCL(ctx, device, queue); + const char* additionalMacros = ""; cl_int pErrNum; const char* kernelSource = radixSort32KernelsCL; - - cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); - b3Assert(sortProg); - m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros ); - b3Assert(m_streamCountSortDataKernel ); + cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH); + b3Assert(sortProg); + m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros); + b3Assert(m_streamCountSortDataKernel); - - m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros ); + m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_streamCountKernel); - - if (m_deviceCPU) { - - m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, 
"SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterSortDataKernel); - m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterKernel); - } else + } + else { - m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterSortDataKernel); - m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros ); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_sortAndScatterKernel); } - - m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros ); + + m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros); b3Assert(m_prefixScanKernel); - } b3RadixSort32CL::~b3RadixSort32CL() @@ -96,8 +90,7 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s { int n = inout.size(); const int BITS_PER_PASS = 8; - const int NUM_TABLES = (1<<BITS_PER_PASS); - + const int NUM_TABLES = (1 << BITS_PER_PASS); int tables[NUM_TABLES]; int counter[NUM_TABLES]; @@ -107,34 +100,33 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s workbuffer.resize(inout.size()); b3SortData* dst = &workbuffer[0]; - int count=0; - for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS) + int count = 0; + for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS) { - for(int i=0; i<NUM_TABLES; i++) + for (int i = 0; i < NUM_TABLES; i++) { tables[i] = 0; } - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) { - int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1); tables[tableIdx]++; } //#define TEST #ifdef TEST - printf("histogram size=%d\n",NUM_TABLES); - for (int i=0;i<NUM_TABLES;i++) + printf("histogram size=%d\n", NUM_TABLES); + for (int i = 0; i < NUM_TABLES; i++) { - if (tables[i]!=0) + if (tables[i] != 0) { - printf("tables[%d]=%d]\n",i,tables[i]); + printf("tables[%d]=%d]\n", i, tables[i]); } - } -#endif //TEST - // prefix scan +#endif //TEST \ + // prefix scan int sum = 0; - for(int i=0; i<NUM_TABLES; i++) + for (int i = 0; i < NUM_TABLES; i++) { int iData = tables[i]; tables[i] = sum; @@ -143,71 +135,65 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s } // distribute - for(int i=0; i<n; i++) + for (int i = 0; i < n; i++) { - int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); - + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1); + dst[tables[tableIdx] + counter[tableIdx]] = src[i]; - counter[tableIdx] ++; + counter[tableIdx]++; } - b3Swap( src, dst ); + b3Swap(src, dst); count++; } - if (count&1) + 
if (count & 1) { - b3Assert(0);//need to copy - + b3Assert(0); //need to copy } } void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) { - b3AlignedObjectArray<b3SortData> inout; keyValuesInOut.copyToHost(inout); - executeHost(inout,sortBits); + executeHost(inout, sortBits); keyValuesInOut.copyFromHost(inout); } -void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, - b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits) +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits) { - } //#define DEBUG_RADIXSORT //#define DEBUG_RADIXSORT2 - void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) { - int originalSize = keyValuesInOut.size(); int workingSize = originalSize; - - + int dataAlignment = DATA_ALIGNMENT; #ifdef DEBUG_RADIXSORT2 - b3AlignedObjectArray<b3SortData> test2; - keyValuesInOut.copyToHost(test2); - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } -#endif //DEBUG_RADIXSORT2 - + b3AlignedObjectArray<b3SortData> test2; + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } +#endif //DEBUG_RADIXSORT2 + b3OpenCLArray<b3SortData>* src = 0; - if (workingSize%dataAlignment) + if (workingSize % dataAlignment) { - workingSize += dataAlignment-(workingSize%dataAlignment); + workingSize += dataAlignment - (workingSize % dataAlignment); m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); m_workBuffer4->resize(workingSize); b3SortData fillValue; @@ -216,327 +202,301 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor #define USE_BTFILL #ifdef USE_BTFILL - m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize); + m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize); #else //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) - - for (int i=originalSize; i<workingSize;i++) + + for (int i = originalSize; i < workingSize; i++) { - m_workBuffer4->copyFromHostPointer(&fillValue,1,i); + m_workBuffer4->copyFromHostPointer(&fillValue, 1, i); } -#endif//USE_BTFILL +#endif //USE_BTFILL src = m_workBuffer4; - } else + } + else { src = &keyValuesInOut; m_workBuffer4->resize(0); } - - b3Assert( workingSize%DATA_ALIGNMENT == 0 ); - int minCap = NUM_BUCKET*NUM_WGS; + b3Assert(workingSize % DATA_ALIGNMENT == 0); + int minCap = NUM_BUCKET * NUM_WGS; int n = workingSize; m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); - -// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); - b3Assert( BITS_PER_PASS == 4 ); - b3Assert( WG_SIZE == 64 ); - b3Assert( (sortBits&0x3) == 0 ); + // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert(BITS_PER_PASS == 4); + b3Assert(WG_SIZE == 64); + b3Assert((sortBits & 0x3) == 0); - - b3OpenCLArray<b3SortData>* dst = m_workBuffer3; b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; - int nWGs = 
NUM_WGS; b3ConstData cdata; { - int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 - int nBlocks = (n+blockSize-1)/(blockSize); + int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256 + int nBlocks = (n + blockSize - 1) / (blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; - cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; - if( nBlocks < NUM_WGS ) + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs; + if (nBlocks < NUM_WGS) { cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } - int count=0; - for(int ib=0; ib<sortBits; ib+=4) + int count = 0; + for (int ib = 0; ib < sortBits; ib += 4) { #ifdef DEBUG_RADIXSORT2 - keyValuesInOut.copyToHost(test2); - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - if (test2[i].m_key != test2[i].m_value) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } - } -#endif //DEBUG_RADIXSORT2 - + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + cdata.m_startBit = ib; - + if (src->size()) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel"); - - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - - int num = NUM_WGS*WG_SIZE; - launcher.launch1D( num, WG_SIZE ); + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel"); + + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + + int num = NUM_WGS * WG_SIZE; + launcher.launch1D(num, WG_SIZE); } - - #ifdef DEBUG_RADIXSORT b3AlignedObjectArray<unsigned int> testHist; srcHisto->copyToHost(testHist); - printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); - for (int i=0;i<testHist.size();i++) + printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size()); + for (int i = 0; i < testHist.size(); i++) { - if (testHist[i]!=0) - printf("testHist[%d]=%d\n",i,testHist[i]); + if (testHist[i] != 0) + printf("testHist[%d]=%d\n", i, testHist[i]); } -#endif //DEBUG_RADIXSORT - - +#endif //DEBUG_RADIXSORT //fast prefix scan is not working properly on Mac OSX yet #ifdef __APPLE__ - bool fastScan=false; + bool fastScan = false; #else - bool fastScan=!m_deviceCPU;//only use fast scan on GPU + bool fastScan = !m_deviceCPU; //only use fast scan on GPU #endif if (fastScan) - {// prefix scan group histogram - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( 128, 128 ); + { // prefix scan group histogram + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); 
+ launcher.launch1D(128, 128); destHisto = srcHisto; - }else + } + else { //unsigned int sum; //for debugging - m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum); } - #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); - printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); - for (int i=0;i<testHist.size();i++) + printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size()); + for (int i = 0; i < testHist.size(); i++) { - if (testHist[i]!=0) - printf("testHist[%d]=%d\n",i,testHist[i]); + if (testHist[i] != 0) + printf("testHist[%d]=%d\n", i, testHist[i]); } - - for (int i=0;i<testHist.size();i+=NUM_WGS) + + for (int i = 0; i < testHist.size(); i += NUM_WGS) { - printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]); + printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]); } -#endif //DEBUG_RADIXSORT +#endif //DEBUG_RADIXSORT #define USE_GPU #ifdef USE_GPU - + if (src->size()) - {// local sort and distribute - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; - b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); - + { // local sort and distribute + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(nWGs * WG_SIZE, WG_SIZE); } #else - { + { #define NUM_TABLES 16 //#define SEQUENTIAL #ifdef SEQUENTIAL - int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - int tables[NUM_TABLES]; - int startBit = ib; - - destHisto->copyToHost(testHist); - b3AlignedObjectArray<b3SortData> srcHost; - b3AlignedObjectArray<b3SortData> dstHost; - dstHost.resize(src->size()); - - src->copyToHost(srcHost); - - for (int i=0;i<NUM_TABLES;i++) - { - tables[i] = testHist[i*NUM_WGS]; - } - - // distribute - for(int i=0; i<n; i++) - { - int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); - - dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; - counter2[tableIdx] ++; - } - - + int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int tables[NUM_TABLES]; + int startBit = ib; + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + src->copyToHost(srcHost); + + for (int i = 0; i < NUM_TABLES; i++) + { + tables[i] = testHist[i * NUM_WGS]; + } + + // distribute + for (int i = 0; i < n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1); + + dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx]++; + } + #else - - int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - int tables[NUM_TABLES]; - b3AlignedObjectArray<b3SortData> dstHostOK; - dstHostOK.resize(src->size()); - - destHisto->copyToHost(testHist); - b3AlignedObjectArray<b3SortData> srcHost; - src->copyToHost(srcHost); - - int blockSize = 256; - int nBlocksPerWG = cdata.m_nBlocksPerWG; - int startBit = ib; - - { - for (int 
i=0;i<NUM_TABLES;i++) - { - tables[i] = testHist[i*NUM_WGS]; - } - - // distribute - for(int i=0; i<n; i++) - { - int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); - - dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; - counter2[tableIdx] ++; - } - - - } - - - b3AlignedObjectArray<b3SortData> dstHost; - dstHost.resize(src->size()); - - - int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - - - for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) - { - int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; - - for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) - { - for (int lIdx = 0;lIdx < 64;lIdx++) - { - int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; - - // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD - // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops - // AMD: AtomInc performs better while NV prefers ++ - for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) - { - if( addr+j < n ) - { - // printf ("addr+j=%d\n", addr+j); - - int i = addr+j; - - int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); - - int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; - - b3SortData ok = dstHostOK[destIndex]; - - if (ok.m_key != srcHost[i].m_key) - { - printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); - printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); - } - if (ok.m_value != srcHost[i].m_value) - { - - printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); - printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); - - } - - dstHost[destIndex] = srcHost[i]; - counter[tableIdx] ++; - - } - } - } - } - } - - -#endif //SEQUENTIAL - - dst->copyFromHost(dstHost); - } -#endif//USE_GPU - - - + + int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + int tables[NUM_TABLES]; + b3AlignedObjectArray<b3SortData> dstHostOK; + dstHostOK.resize(src->size()); + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + src->copyToHost(srcHost); + + int blockSize = 256; + int nBlocksPerWG = cdata.m_nBlocksPerWG; + int startBit = ib; + + { + for (int i = 0; i < NUM_TABLES; i++) + { + tables[i] = testHist[i * NUM_WGS]; + } + + // distribute + for (int i = 0; i < n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1); + + dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx]++; + } + } + + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++) + { + int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx; + + for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx = 0; lIdx < 64; lIdx++) + { + int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx; + + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++) + { + if (addr + j < n) + { + // printf ("addr+j=%d\n", addr+j); + + int i = addr + j; + + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1); + + int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx]; + + b3SortData ok = dstHostOK[destIndex]; + + if (ok.m_key != srcHost[i].m_key) + { + printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key); + printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value); + } + if (ok.m_value != srcHost[i].m_value) + { + printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value); + printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key); + } + + dstHost[destIndex] = srcHost[i]; + counter[tableIdx]++; + } + } + } + } + } + +#endif //SEQUENTIAL + + dst->copyFromHost(dstHost); + } +#endif //USE_GPU + #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); - printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); - for (int i=0;i<testHist.size();i++) + printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size()); + for (int i = 0; i < testHist.size(); i++) { - if (testHist[i]!=0) - printf("testHist[%d]=%d\n",i,testHist[i]); + if (testHist[i] != 0) + printf("testHist[%d]=%d\n", i, testHist[i]); } -#endif //DEBUG_RADIXSORT - b3Swap(src, dst ); - b3Swap(srcHisto,destHisto); +#endif //DEBUG_RADIXSORT + b3Swap(src, dst); + b3Swap(srcHisto, destHisto); #ifdef DEBUG_RADIXSORT2 - keyValuesInOut.copyToHost(test2); - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - if (test2[i].m_key != test2[i].m_value) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } - } -#endif //DEBUG_RADIXSORT2 - - count++; - - + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + count++; } - - - - if (count&1) + + if (count & 1) { - b3Assert(0);//need to copy from workbuffer to keyValuesInOut + b3Assert(0); //need to copy from workbuffer to keyValuesInOut } if (m_workBuffer4->size()) @@ -545,153 +505,137 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4); } - #ifdef DEBUG_RADIXSORT - keyValuesInOut.copyToHost(test2); - - printf("numElem = %d\n",test2.size()); - for (int i=0;i<test2.size();i++) - { - printf("test2[%d].m_key=%d\n",i,test2[i].m_key); - printf("test2[%d].m_value=%d\n",i,test2[i].m_value); - } -#endif - -} - - - - + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n", test2.size()); + for (int i = 0; i < test2.size(); i++) + { + printf("test2[%d].m_key=%d\n", i, test2[i].m_key); + printf("test2[%d].m_value=%d\n", i, test2[i].m_value); + } +#endif +} void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */) { int originalSize = keysInOut.size(); int workingSize = originalSize; - - + int dataAlignment = DATA_ALIGNMENT; b3OpenCLArray<unsigned int>* src = 0; - if (workingSize%dataAlignment) + if (workingSize % dataAlignment) { - workingSize += 
dataAlignment-(workingSize%dataAlignment); + workingSize += dataAlignment - (workingSize % dataAlignment); m_workBuffer4a->copyFromOpenCLArray(keysInOut); m_workBuffer4a->resize(workingSize); unsigned int fillValue = 0xffffffff; - - m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize); + + m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize); src = m_workBuffer4a; - } else + } + else { src = &keysInOut; m_workBuffer4a->resize(0); } - - - - b3Assert( workingSize%DATA_ALIGNMENT == 0 ); - int minCap = NUM_BUCKET*NUM_WGS; + b3Assert(workingSize % DATA_ALIGNMENT == 0); + int minCap = NUM_BUCKET * NUM_WGS; int n = workingSize; - m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); m_workBuffer3a->resize(workingSize); -// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); - b3Assert( BITS_PER_PASS == 4 ); - b3Assert( WG_SIZE == 64 ); - b3Assert( (sortBits&0x3) == 0 ); + // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert(BITS_PER_PASS == 4); + b3Assert(WG_SIZE == 64); + b3Assert((sortBits & 0x3) == 0); - - b3OpenCLArray<unsigned int>* dst = m_workBuffer3a; b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; - int nWGs = NUM_WGS; b3ConstData cdata; { - int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 - int nBlocks = (n+blockSize-1)/(blockSize); + int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256 + int nBlocks = (n + blockSize - 1) / (blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; - cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; - if( nBlocks < NUM_WGS ) + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs; + if (nBlocks < NUM_WGS) { cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } - int count=0; - for(int ib=0; ib<sortBits; ib+=4) + int count = 0; + for (int ib = 0; ib < sortBits; ib += 4) { cdata.m_startBit = ib; - + if (src->size()) { - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel"); - - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - - int num = NUM_WGS*WG_SIZE; - launcher.launch1D( num, WG_SIZE ); - } + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel"); + + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); - + int num = NUM_WGS * WG_SIZE; + launcher.launch1D(num, WG_SIZE); + } //fast prefix scan is not working properly on Mac OSX yet #ifdef __APPLE__ - bool fastScan=false; + bool fastScan = false; #else - bool fastScan=!m_deviceCPU; + bool fastScan = !m_deviceCPU; #endif if (fastScan) - {// prefix scan group histogram - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; - b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( 128, 128 ); + { // prefix scan group histogram + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + 
launcher.launch1D(128, 128); destHisto = srcHisto; - }else + } + else { //unsigned int sum; //for debugging - m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum); } if (src->size()) - {// local sort and distribute - b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; - b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel"); - launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst( cdata ); - launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); - + { // local sort and distribute + b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())}; + b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel"); + launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); + launcher.setConst(cdata); + launcher.launch1D(nWGs * WG_SIZE, WG_SIZE); } - - b3Swap(src, dst ); - b3Swap(srcHisto,destHisto); - count++; + b3Swap(src, dst); + b3Swap(srcHisto, destHisto); + + count++; } - - if (count&1) + + if (count & 1) { - b3Assert(0);//need to copy from workbuffer to keyValuesInOut + b3Assert(0); //need to copy from workbuffer to keyValuesInOut } if (m_workBuffer4a->size()) @@ -699,12 +643,4 @@ void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBi m_workBuffer4a->resize(originalSize); keysInOut.copyFromOpenCLArray(*m_workBuffer4a); } - } - - - - - - - |