summaryrefslogtreecommitdiff
path: root/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
diff options
context:
space:
mode:
authorOussama <o.boukhelf@gmail.com>2019-01-03 14:26:51 +0100
committerRĂ©mi Verschelde <rverschelde@gmail.com>2019-01-07 12:30:35 +0100
commit22b7c9dfa80d0f7abca40f061865c2ab3c136a74 (patch)
tree311cd3f22b012329160f9d43810aea429994af48 /thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
parenta6722cf36251ddcb538e6ebed9fa4950342b68ba (diff)
Update Bullet to the latest commit 126b676
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp')
-rw-r--r--thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp730
1 files changed, 333 insertions, 397 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
index f11ae4bcdb..e86af6583f 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
@@ -10,21 +10,20 @@
#include "kernels/RadixSort32KernelsCL.h"
b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
-:m_commandQueue(queue)
+ : m_commandQueue(queue)
{
b3OpenCLDeviceInfo info;
- b3OpenCLUtils::getDeviceInfo(device,&info);
- m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;
+ b3OpenCLUtils::getDeviceInfo(device, &info);
+ m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
- m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue);
- m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue);
- m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue);
- m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue);
- m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue);
- m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue);
+ m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
+ m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
+ m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
+ m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
+ m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
+ m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
-
- if (initialCapacity>0)
+ if (initialCapacity > 0)
{
m_workBuffer1->resize(initialCapacity);
m_workBuffer3->resize(initialCapacity);
@@ -33,45 +32,40 @@ b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command
m_workBuffer4a->resize(initialCapacity);
}
- m_scan = new b3PrefixScanCL(ctx,device,queue);
- m_fill = new b3FillCL(ctx,device,queue);
-
+ m_scan = new b3PrefixScanCL(ctx, device, queue);
+ m_fill = new b3FillCL(ctx, device, queue);
+
const char* additionalMacros = "";
cl_int pErrNum;
const char* kernelSource = radixSort32KernelsCL;
-
- cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
- b3Assert(sortProg);
- m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_streamCountSortDataKernel );
+ cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
+ b3Assert(sortProg);
+ m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
+ b3Assert(m_streamCountSortDataKernel);
-
- m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
+ m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_streamCountKernel);
-
-
if (m_deviceCPU)
{
-
- m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
+ m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
- m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
+ m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
- } else
+ }
+ else
{
- m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
+ m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
- m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
+ m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
}
-
- m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
+
+ m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_prefixScanKernel);
-
}
b3RadixSort32CL::~b3RadixSort32CL()
@@ -96,8 +90,7 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s
{
int n = inout.size();
const int BITS_PER_PASS = 8;
- const int NUM_TABLES = (1<<BITS_PER_PASS);
-
+ const int NUM_TABLES = (1 << BITS_PER_PASS);
int tables[NUM_TABLES];
int counter[NUM_TABLES];
@@ -107,34 +100,33 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s
workbuffer.resize(inout.size());
b3SortData* dst = &workbuffer[0];
- int count=0;
- for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
+ int count = 0;
+ for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
{
- for(int i=0; i<NUM_TABLES; i++)
+ for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = 0;
}
- for(int i=0; i<n; i++)
+ for (int i = 0; i < n; i++)
{
- int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
+ int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
tables[tableIdx]++;
}
//#define TEST
#ifdef TEST
- printf("histogram size=%d\n",NUM_TABLES);
- for (int i=0;i<NUM_TABLES;i++)
+ printf("histogram size=%d\n", NUM_TABLES);
+ for (int i = 0; i < NUM_TABLES; i++)
{
- if (tables[i]!=0)
+ if (tables[i] != 0)
{
- printf("tables[%d]=%d]\n",i,tables[i]);
+ printf("tables[%d]=%d]\n", i, tables[i]);
}
-
}
-#endif //TEST
- // prefix scan
+#endif //TEST \
+ // prefix scan
int sum = 0;
- for(int i=0; i<NUM_TABLES; i++)
+ for (int i = 0; i < NUM_TABLES; i++)
{
int iData = tables[i];
tables[i] = sum;
@@ -143,71 +135,65 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s
}
// distribute
- for(int i=0; i<n; i++)
+ for (int i = 0; i < n; i++)
{
- int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
-
+ int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
+
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
- counter[tableIdx] ++;
+ counter[tableIdx]++;
}
- b3Swap( src, dst );
+ b3Swap(src, dst);
count++;
}
- if (count&1)
+ if (count & 1)
{
- b3Assert(0);//need to copy
-
+ b3Assert(0); //need to copy
}
}
void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
-
b3AlignedObjectArray<b3SortData> inout;
keyValuesInOut.copyToHost(inout);
- executeHost(inout,sortBits);
+ executeHost(inout, sortBits);
keyValuesInOut.copyFromHost(inout);
}
-void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
- b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
+void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
+ b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
-
}
//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
-
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
-
int originalSize = keyValuesInOut.size();
int workingSize = originalSize;
-
-
+
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
- b3AlignedObjectArray<b3SortData> test2;
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
-#endif //DEBUG_RADIXSORT2
-
+ b3AlignedObjectArray<b3SortData> test2;
+ keyValuesInOut.copyToHost(test2);
+ printf("numElem = %d\n", test2.size());
+ for (int i = 0; i < test2.size(); i++)
+ {
+ printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+ printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+ }
+#endif //DEBUG_RADIXSORT2
+
b3OpenCLArray<b3SortData>* src = 0;
- if (workingSize%dataAlignment)
+ if (workingSize % dataAlignment)
{
- workingSize += dataAlignment-(workingSize%dataAlignment);
+ workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
b3SortData fillValue;
@@ -216,327 +202,301 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor
#define USE_BTFILL
#ifdef USE_BTFILL
- m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize);
+ m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
#else
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
-
- for (int i=originalSize; i<workingSize;i++)
+
+ for (int i = originalSize; i < workingSize; i++)
{
- m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
+ m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
}
-#endif//USE_BTFILL
+#endif //USE_BTFILL
src = m_workBuffer4;
- } else
+ }
+ else
{
src = &keyValuesInOut;
m_workBuffer4->resize(0);
}
-
- b3Assert( workingSize%DATA_ALIGNMENT == 0 );
- int minCap = NUM_BUCKET*NUM_WGS;
+ b3Assert(workingSize % DATA_ALIGNMENT == 0);
+ int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
-
-// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
- b3Assert( BITS_PER_PASS == 4 );
- b3Assert( WG_SIZE == 64 );
- b3Assert( (sortBits&0x3) == 0 );
+ // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
+ b3Assert(BITS_PER_PASS == 4);
+ b3Assert(WG_SIZE == 64);
+ b3Assert((sortBits & 0x3) == 0);
-
-
b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
-
int nWGs = NUM_WGS;
b3ConstData cdata;
{
- int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
- int nBlocks = (n+blockSize-1)/(blockSize);
+ int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
+ int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
- cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
- if( nBlocks < NUM_WGS )
+ cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
+ if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
- int count=0;
- for(int ib=0; ib<sortBits; ib+=4)
+ int count = 0;
+ for (int ib = 0; ib < sortBits; ib += 4)
{
#ifdef DEBUG_RADIXSORT2
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- if (test2[i].m_key != test2[i].m_value)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
- }
-#endif //DEBUG_RADIXSORT2
-
+ keyValuesInOut.copyToHost(test2);
+ printf("numElem = %d\n", test2.size());
+ for (int i = 0; i < test2.size(); i++)
+ {
+ if (test2[i].m_key != test2[i].m_value)
+ {
+ printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+ printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+ }
+ }
+#endif //DEBUG_RADIXSORT2
+
cdata.m_startBit = ib;
-
+
if (src->size())
{
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel");
-
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
-
- int num = NUM_WGS*WG_SIZE;
- launcher.launch1D( num, WG_SIZE );
+ b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
+ b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
+
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(cdata);
+
+ int num = NUM_WGS * WG_SIZE;
+ launcher.launch1D(num, WG_SIZE);
}
-
-
#ifdef DEBUG_RADIXSORT
b3AlignedObjectArray<unsigned int> testHist;
srcHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
- for (int i=0;i<testHist.size();i++)
+ printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
+ for (int i = 0; i < testHist.size(); i++)
{
- if (testHist[i]!=0)
- printf("testHist[%d]=%d\n",i,testHist[i]);
+ if (testHist[i] != 0)
+ printf("testHist[%d]=%d\n", i, testHist[i]);
}
-#endif //DEBUG_RADIXSORT
-
-
+#endif //DEBUG_RADIXSORT
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
- bool fastScan=false;
+ bool fastScan = false;
#else
- bool fastScan=!m_deviceCPU;//only use fast scan on GPU
+ bool fastScan = !m_deviceCPU; //only use fast scan on GPU
#endif
if (fastScan)
- {// prefix scan group histogram
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( 128, 128 );
+ { // prefix scan group histogram
+ b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
+ b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(cdata);
+ launcher.launch1D(128, 128);
destHisto = srcHisto;
- }else
+ }
+ else
{
//unsigned int sum; //for debugging
- m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
+ m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
-
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
- for (int i=0;i<testHist.size();i++)
+ printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
+ for (int i = 0; i < testHist.size(); i++)
{
- if (testHist[i]!=0)
- printf("testHist[%d]=%d\n",i,testHist[i]);
+ if (testHist[i] != 0)
+ printf("testHist[%d]=%d\n", i, testHist[i]);
}
-
- for (int i=0;i<testHist.size();i+=NUM_WGS)
+
+ for (int i = 0; i < testHist.size(); i += NUM_WGS)
{
- printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
+ printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
}
-#endif //DEBUG_RADIXSORT
+#endif //DEBUG_RADIXSORT
#define USE_GPU
#ifdef USE_GPU
-
+
if (src->size())
- {// local sort and distribute
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
- b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" );
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
-
+ { // local sort and distribute
+ b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
+ b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(cdata);
+ launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
#else
- {
+ {
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
- int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
- int tables[NUM_TABLES];
- int startBit = ib;
-
- destHisto->copyToHost(testHist);
- b3AlignedObjectArray<b3SortData> srcHost;
- b3AlignedObjectArray<b3SortData> dstHost;
- dstHost.resize(src->size());
-
- src->copyToHost(srcHost);
-
- for (int i=0;i<NUM_TABLES;i++)
- {
- tables[i] = testHist[i*NUM_WGS];
- }
-
- // distribute
- for(int i=0; i<n; i++)
- {
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-
- dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
- counter2[tableIdx] ++;
- }
-
-
+ int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ int tables[NUM_TABLES];
+ int startBit = ib;
+
+ destHisto->copyToHost(testHist);
+ b3AlignedObjectArray<b3SortData> srcHost;
+ b3AlignedObjectArray<b3SortData> dstHost;
+ dstHost.resize(src->size());
+
+ src->copyToHost(srcHost);
+
+ for (int i = 0; i < NUM_TABLES; i++)
+ {
+ tables[i] = testHist[i * NUM_WGS];
+ }
+
+ // distribute
+ for (int i = 0; i < n; i++)
+ {
+ int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
+
+ dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+ counter2[tableIdx]++;
+ }
+
#else
-
- int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
- int tables[NUM_TABLES];
- b3AlignedObjectArray<b3SortData> dstHostOK;
- dstHostOK.resize(src->size());
-
- destHisto->copyToHost(testHist);
- b3AlignedObjectArray<b3SortData> srcHost;
- src->copyToHost(srcHost);
-
- int blockSize = 256;
- int nBlocksPerWG = cdata.m_nBlocksPerWG;
- int startBit = ib;
-
- {
- for (int i=0;i<NUM_TABLES;i++)
- {
- tables[i] = testHist[i*NUM_WGS];
- }
-
- // distribute
- for(int i=0; i<n; i++)
- {
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-
- dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
- counter2[tableIdx] ++;
- }
-
-
- }
-
-
- b3AlignedObjectArray<b3SortData> dstHost;
- dstHost.resize(src->size());
-
-
- int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-
-
- for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
- {
- int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
- int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
-
- for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
- {
- for (int lIdx = 0;lIdx < 64;lIdx++)
- {
- int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
-
- // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
- // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
- // AMD: AtomInc performs better while NV prefers ++
- for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
- {
- if( addr+j < n )
- {
- // printf ("addr+j=%d\n", addr+j);
-
- int i = addr+j;
-
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-
- int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
-
- b3SortData ok = dstHostOK[destIndex];
-
- if (ok.m_key != srcHost[i].m_key)
- {
- printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
- printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
- }
- if (ok.m_value != srcHost[i].m_value)
- {
-
- printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
- printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
-
- }
-
- dstHost[destIndex] = srcHost[i];
- counter[tableIdx] ++;
-
- }
- }
- }
- }
- }
-
-
-#endif //SEQUENTIAL
-
- dst->copyFromHost(dstHost);
- }
-#endif//USE_GPU
-
-
-
+
+ int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ int tables[NUM_TABLES];
+ b3AlignedObjectArray<b3SortData> dstHostOK;
+ dstHostOK.resize(src->size());
+
+ destHisto->copyToHost(testHist);
+ b3AlignedObjectArray<b3SortData> srcHost;
+ src->copyToHost(srcHost);
+
+ int blockSize = 256;
+ int nBlocksPerWG = cdata.m_nBlocksPerWG;
+ int startBit = ib;
+
+ {
+ for (int i = 0; i < NUM_TABLES; i++)
+ {
+ tables[i] = testHist[i * NUM_WGS];
+ }
+
+ // distribute
+ for (int i = 0; i < n; i++)
+ {
+ int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
+
+ dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+ counter2[tableIdx]++;
+ }
+ }
+
+ b3AlignedObjectArray<b3SortData> dstHost;
+ dstHost.resize(src->size());
+
+ int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
+ {
+ int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
+
+ for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
+ {
+ for (int lIdx = 0; lIdx < 64; lIdx++)
+ {
+ int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
+
+ // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
+ // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
+ // AMD: AtomInc performs better while NV prefers ++
+ for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
+ {
+ if (addr + j < n)
+ {
+ // printf ("addr+j=%d\n", addr+j);
+
+ int i = addr + j;
+
+ int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
+
+ int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
+
+ b3SortData ok = dstHostOK[destIndex];
+
+ if (ok.m_key != srcHost[i].m_key)
+ {
+ printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
+ printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
+ }
+ if (ok.m_value != srcHost[i].m_value)
+ {
+ printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
+ printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
+ }
+
+ dstHost[destIndex] = srcHost[i];
+ counter[tableIdx]++;
+ }
+ }
+ }
+ }
+ }
+
+#endif //SEQUENTIAL
+
+ dst->copyFromHost(dstHost);
+ }
+#endif //USE_GPU
+
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
- for (int i=0;i<testHist.size();i++)
+ printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
+ for (int i = 0; i < testHist.size(); i++)
{
- if (testHist[i]!=0)
- printf("testHist[%d]=%d\n",i,testHist[i]);
+ if (testHist[i] != 0)
+ printf("testHist[%d]=%d\n", i, testHist[i]);
}
-#endif //DEBUG_RADIXSORT
- b3Swap(src, dst );
- b3Swap(srcHisto,destHisto);
+#endif //DEBUG_RADIXSORT
+ b3Swap(src, dst);
+ b3Swap(srcHisto, destHisto);
#ifdef DEBUG_RADIXSORT2
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- if (test2[i].m_key != test2[i].m_value)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
- }
-#endif //DEBUG_RADIXSORT2
-
- count++;
-
-
+ keyValuesInOut.copyToHost(test2);
+ printf("numElem = %d\n", test2.size());
+ for (int i = 0; i < test2.size(); i++)
+ {
+ if (test2[i].m_key != test2[i].m_value)
+ {
+ printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+ printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+ }
+ }
+#endif //DEBUG_RADIXSORT2
+
+ count++;
}
-
-
-
- if (count&1)
+
+ if (count & 1)
{
- b3Assert(0);//need to copy from workbuffer to keyValuesInOut
+ b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4->size())
@@ -545,153 +505,137 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
}
-
#ifdef DEBUG_RADIXSORT
- keyValuesInOut.copyToHost(test2);
-
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
-#endif
-
-}
-
-
-
-
+ keyValuesInOut.copyToHost(test2);
+ printf("numElem = %d\n", test2.size());
+ for (int i = 0; i < test2.size(); i++)
+ {
+ printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+ printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+ }
+#endif
+}
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
int originalSize = keysInOut.size();
int workingSize = originalSize;
-
-
+
int dataAlignment = DATA_ALIGNMENT;
b3OpenCLArray<unsigned int>* src = 0;
- if (workingSize%dataAlignment)
+ if (workingSize % dataAlignment)
{
- workingSize += dataAlignment-(workingSize%dataAlignment);
+ workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4a->copyFromOpenCLArray(keysInOut);
m_workBuffer4a->resize(workingSize);
unsigned int fillValue = 0xffffffff;
-
- m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);
+
+ m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
src = m_workBuffer4a;
- } else
+ }
+ else
{
src = &keysInOut;
m_workBuffer4a->resize(0);
}
-
-
-
- b3Assert( workingSize%DATA_ALIGNMENT == 0 );
- int minCap = NUM_BUCKET*NUM_WGS;
+ b3Assert(workingSize % DATA_ALIGNMENT == 0);
+ int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
-
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
m_workBuffer3a->resize(workingSize);
-// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
- b3Assert( BITS_PER_PASS == 4 );
- b3Assert( WG_SIZE == 64 );
- b3Assert( (sortBits&0x3) == 0 );
+ // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
+ b3Assert(BITS_PER_PASS == 4);
+ b3Assert(WG_SIZE == 64);
+ b3Assert((sortBits & 0x3) == 0);
-
-
b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
-
int nWGs = NUM_WGS;
b3ConstData cdata;
{
- int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
- int nBlocks = (n+blockSize-1)/(blockSize);
+ int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
+ int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
- cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
- if( nBlocks < NUM_WGS )
+ cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
+ if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
- int count=0;
- for(int ib=0; ib<sortBits; ib+=4)
+ int count = 0;
+ for (int ib = 0; ib < sortBits; ib += 4)
{
cdata.m_startBit = ib;
-
+
if (src->size())
{
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel");
-
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
-
- int num = NUM_WGS*WG_SIZE;
- launcher.launch1D( num, WG_SIZE );
- }
+ b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
+ b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
+
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(cdata);
-
+ int num = NUM_WGS * WG_SIZE;
+ launcher.launch1D(num, WG_SIZE);
+ }
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
- bool fastScan=false;
+ bool fastScan = false;
#else
- bool fastScan=!m_deviceCPU;
+ bool fastScan = !m_deviceCPU;
#endif
if (fastScan)
- {// prefix scan group histogram
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( 128, 128 );
+ { // prefix scan group histogram
+ b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
+ b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(cdata);
+ launcher.launch1D(128, 128);
destHisto = srcHisto;
- }else
+ }
+ else
{
//unsigned int sum; //for debugging
- m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
+ m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
if (src->size())
- {// local sort and distribute
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
- b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
-
+ { // local sort and distribute
+ b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
+ b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
+ launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+ launcher.setConst(cdata);
+ launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
-
- b3Swap(src, dst );
- b3Swap(srcHisto,destHisto);
- count++;
+ b3Swap(src, dst);
+ b3Swap(srcHisto, destHisto);
+
+ count++;
}
-
- if (count&1)
+
+ if (count & 1)
{
- b3Assert(0);//need to copy from workbuffer to keyValuesInOut
+ b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4a->size())
@@ -699,12 +643,4 @@ void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBi
m_workBuffer4a->resize(originalSize);
keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
}
-
}
-
-
-
-
-
-
-