diff options
author | Rémi Verschelde <rverschelde@gmail.com> | 2018-01-13 14:01:53 +0100 |
---|---|---|
committer | Rémi Verschelde <rverschelde@gmail.com> | 2018-01-13 14:08:45 +0100 |
commit | e12c89e8c9896b2e5cdd70dbd2d2acb449ff4b94 (patch) | |
tree | af68e434545e20c538f896e28b73f2db7d626edd /thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp | |
parent | 53c65ae7619ac9e80c89a321c70de64f3745e2aa (diff) |
bullet: Streamline bundling, remove extraneous src/ folder
Document version and how to extract sources in thirdparty/README.md.
Drop unnecessary CMake and Premake files.
Simplify SCsub, drop unused one.
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp')
-rw-r--r-- | thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp | 710 |
1 files changed, 710 insertions, 0 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp new file mode 100644 index 0000000000..f11ae4bcdb --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp @@ -0,0 +1,710 @@ + +#include "b3RadixSort32CL.h" +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3PrefixScanCL.h" +#include "b3FillCL.h" + +#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" + +#include "kernels/RadixSort32KernelsCL.h" + +b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) +:m_commandQueue(queue) +{ + b3OpenCLDeviceInfo info; + b3OpenCLUtils::getDeviceInfo(device,&info); + m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; + + m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue); + m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue); + m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue); + + + if (initialCapacity>0) + { + m_workBuffer1->resize(initialCapacity); + m_workBuffer3->resize(initialCapacity); + m_workBuffer3a->resize(initialCapacity); + m_workBuffer4->resize(initialCapacity); + m_workBuffer4a->resize(initialCapacity); + } + + m_scan = new b3PrefixScanCL(ctx,device,queue); + m_fill = new b3FillCL(ctx,device,queue); + + const char* additionalMacros = ""; + + cl_int pErrNum; + const char* kernelSource = radixSort32KernelsCL; + + cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); + b3Assert(sortProg); + + m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_streamCountSortDataKernel ); + + + + m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_streamCountKernel); + + + + if (m_deviceCPU) + { + + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterKernel); + } else + { + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterKernel); + } + + m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_prefixScanKernel); + +} + +b3RadixSort32CL::~b3RadixSort32CL() +{ + delete m_scan; + delete m_fill; + delete m_workBuffer1; + delete m_workBuffer2; + delete m_workBuffer3; + delete m_workBuffer3a; + delete m_workBuffer4; + delete m_workBuffer4a; + + clReleaseKernel(m_streamCountSortDataKernel); + clReleaseKernel(m_streamCountKernel); + clReleaseKernel(m_sortAndScatterSortDataKernel); + clReleaseKernel(m_sortAndScatterKernel); + clReleaseKernel(m_prefixScanKernel); +} + +void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */) +{ + int n = inout.size(); + const int BITS_PER_PASS = 8; + const int NUM_TABLES = (1<<BITS_PER_PASS); + + + int tables[NUM_TABLES]; + int counter[NUM_TABLES]; + + b3SortData* src = &inout[0]; + b3AlignedObjectArray<b3SortData> workbuffer; + workbuffer.resize(inout.size()); + b3SortData* dst = &workbuffer[0]; + + int count=0; + for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS) + { + for(int i=0; i<NUM_TABLES; i++) + { + tables[i] = 0; + } + + for(int i=0; i<n; i++) + { + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + tables[tableIdx]++; + } +//#define TEST +#ifdef TEST + printf("histogram size=%d\n",NUM_TABLES); + for (int i=0;i<NUM_TABLES;i++) + { + if (tables[i]!=0) + { + printf("tables[%d]=%d]\n",i,tables[i]); + } + + } +#endif //TEST + // prefix scan + int sum = 0; + for(int i=0; i<NUM_TABLES; i++) + { + int iData = tables[i]; + tables[i] = sum; + sum += iData; + counter[i] = 0; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + + dst[tables[tableIdx] + counter[tableIdx]] = src[i]; + counter[tableIdx] ++; + } + + b3Swap( src, dst ); + count++; + } + + if (count&1) + { + b3Assert(0);//need to copy + + } +} + +void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) +{ + + b3AlignedObjectArray<b3SortData> inout; + keyValuesInOut.copyToHost(inout); + + executeHost(inout,sortBits); + + keyValuesInOut.copyFromHost(inout); +} + +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits) +{ + +} + +//#define DEBUG_RADIXSORT +//#define DEBUG_RADIXSORT2 + + +void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) +{ + + int originalSize = keyValuesInOut.size(); + int workingSize = originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + +#ifdef DEBUG_RADIXSORT2 + b3AlignedObjectArray<b3SortData> test2; + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } +#endif //DEBUG_RADIXSORT2 + + b3OpenCLArray<b3SortData>* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); + m_workBuffer4->resize(workingSize); + b3SortData fillValue; + fillValue.m_key = 0xffffffff; + fillValue.m_value = 0xffffffff; + +#define USE_BTFILL +#ifdef USE_BTFILL + m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize); +#else + //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) + + for (int i=originalSize; i<workingSize;i++) + { + m_workBuffer4->copyFromHostPointer(&fillValue,1,i); + } +#endif//USE_BTFILL + + src = m_workBuffer4; + } else + { + src = &keyValuesInOut; + m_workBuffer4->resize(0); + } + + b3Assert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert( BITS_PER_PASS == 4 ); + b3Assert( WG_SIZE == 64 ); + b3Assert( (sortBits&0x3) == 0 ); + + + + b3OpenCLArray<b3SortData>* dst = m_workBuffer3; + + b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; + b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + b3ConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ib<sortBits; ib+=4) + { +#ifdef DEBUG_RADIXSORT2 + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + cdata.m_startBit = ib; + + if (src->size()) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel"); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +#ifdef DEBUG_RADIXSORT + b3AlignedObjectArray<unsigned int> testHist; + srcHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } +#endif //DEBUG_RADIXSORT + + + +//fast prefix scan is not working properly on Mac OSX yet +#ifdef __APPLE__ + bool fastScan=false; +#else + bool fastScan=!m_deviceCPU;//only use fast scan on GPU +#endif + + if (fastScan) + {// prefix scan group histogram + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } + + for (int i=0;i<testHist.size();i+=NUM_WGS) + { + printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]); + } + +#endif //DEBUG_RADIXSORT + +#define USE_GPU +#ifdef USE_GPU + + if (src->size()) + {// local sort and distribute + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; + b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } +#else + { +#define NUM_TABLES 16 +//#define SEQUENTIAL +#ifdef SEQUENTIAL + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + int tables[NUM_TABLES]; + int startBit = ib; + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + src->copyToHost(srcHost); + + for (int i=0;i<NUM_TABLES;i++) + { + tables[i] = testHist[i*NUM_WGS]; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + +#else + + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + int tables[NUM_TABLES]; + b3AlignedObjectArray<b3SortData> dstHostOK; + dstHostOK.resize(src->size()); + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + src->copyToHost(srcHost); + + int blockSize = 256; + int nBlocksPerWG = cdata.m_nBlocksPerWG; + int startBit = ib; + + { + for (int i=0;i<NUM_TABLES;i++) + { + tables[i] = testHist[i*NUM_WGS]; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + + } + + + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + + int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + + + for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) + { + int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx = 0;lIdx < 64;lIdx++) + { + int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + if( addr+j < n ) + { + // printf ("addr+j=%d\n", addr+j); + + int i = addr+j; + + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; + + b3SortData ok = dstHostOK[destIndex]; + + if (ok.m_key != srcHost[i].m_key) + { + printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); + printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); + } + if (ok.m_value != srcHost[i].m_value) + { + + printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); + printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); + + } + + dstHost[destIndex] = srcHost[i]; + counter[tableIdx] ++; + + } + } + } + } + } + + +#endif //SEQUENTIAL + + dst->copyFromHost(dstHost); + } +#endif//USE_GPU + + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } +#endif //DEBUG_RADIXSORT + b3Swap(src, dst ); + b3Swap(srcHisto,destHisto); + +#ifdef DEBUG_RADIXSORT2 + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + count++; + + + } + + + + if (count&1) + { + b3Assert(0);//need to copy from workbuffer to keyValuesInOut + } + + if (m_workBuffer4->size()) + { + m_workBuffer4->resize(originalSize); + keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4); + } + + +#ifdef DEBUG_RADIXSORT + keyValuesInOut.copyToHost(test2); + + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } +#endif + +} + + + + + + +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */) +{ + int originalSize = keysInOut.size(); + int workingSize = originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + + b3OpenCLArray<unsigned int>* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4a->copyFromOpenCLArray(keysInOut); + m_workBuffer4a->resize(workingSize); + unsigned int fillValue = 0xffffffff; + + m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize); + + src = m_workBuffer4a; + } else + { + src = &keysInOut; + m_workBuffer4a->resize(0); + } + + + + b3Assert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + m_workBuffer3a->resize(workingSize); + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert( BITS_PER_PASS == 4 ); + b3Assert( WG_SIZE == 64 ); + b3Assert( (sortBits&0x3) == 0 ); + + + + b3OpenCLArray<unsigned int>* dst = m_workBuffer3a; + + b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; + b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + b3ConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ib<sortBits; ib+=4) + { + cdata.m_startBit = ib; + + if (src->size()) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel"); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +//fast prefix scan is not working properly on Mac OSX yet +#ifdef __APPLE__ + bool fastScan=false; +#else + bool fastScan=!m_deviceCPU; +#endif + + if (fastScan) + {// prefix scan group histogram + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + if (src->size()) + {// local sort and distribute + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; + b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } + + b3Swap(src, dst ); + b3Swap(srcHisto,destHisto); + + count++; + } + + if (count&1) + { + b3Assert(0);//need to copy from workbuffer to keyValuesInOut + } + + if (m_workBuffer4a->size()) + { + m_workBuffer4a->resize(originalSize); + keysInOut.copyFromOpenCLArray(*m_workBuffer4a); + } + +} + + + + + + + |