path: root/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
author     Rémi Verschelde <rverschelde@gmail.com>    2022-03-09 21:15:53 +0100
committer  Rémi Verschelde <rverschelde@gmail.com>    2022-03-09 21:45:47 +0100
commit     3d7f1555865a981b7144becfc58d3f3f34362f5f (patch)
tree       d92912c6d700468b3330148b9179026b9f4efcb4 /thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
parent     33c907f9f5b3ec1a43d0251d7cac80da49b5b658 (diff)
Remove unused Bullet module and thirdparty code
It has been disabled in `master` for a year (#45852), and our plan is for Bullet, and possibly other third-party physics engines, to be implemented via GDExtension so that they can be selected by the users who need them.
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp')
-rw-r--r--  thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp | 646 ----
1 file changed, 0 insertions(+), 646 deletions(-)
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
deleted file mode 100644
index e86af6583f..0000000000
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
+++ /dev/null
@@ -1,646 +0,0 @@
-
-#include "b3RadixSort32CL.h"
-#include "b3LauncherCL.h"
-#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
-#include "b3PrefixScanCL.h"
-#include "b3FillCL.h"
-
-#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
-
-#include "kernels/RadixSort32KernelsCL.h"
-
-b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
- : m_commandQueue(queue)
-{
- b3OpenCLDeviceInfo info;
- b3OpenCLUtils::getDeviceInfo(device, &info);
- m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
-
- m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
- m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
- m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
- m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
- m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
- m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
-
- if (initialCapacity > 0)
- {
- m_workBuffer1->resize(initialCapacity);
- m_workBuffer3->resize(initialCapacity);
- m_workBuffer3a->resize(initialCapacity);
- m_workBuffer4->resize(initialCapacity);
- m_workBuffer4a->resize(initialCapacity);
- }
-
- m_scan = new b3PrefixScanCL(ctx, device, queue);
- m_fill = new b3FillCL(ctx, device, queue);
-
- const char* additionalMacros = "";
-
- cl_int pErrNum;
- const char* kernelSource = radixSort32KernelsCL;
-
- cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
- b3Assert(sortProg);
-
- m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
- b3Assert(m_streamCountSortDataKernel);
-
- m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
- b3Assert(m_streamCountKernel);
-
- if (m_deviceCPU)
- {
- m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
- b3Assert(m_sortAndScatterSortDataKernel);
- m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
- b3Assert(m_sortAndScatterKernel);
- }
- else
- {
- m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
- b3Assert(m_sortAndScatterSortDataKernel);
- m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
- b3Assert(m_sortAndScatterKernel);
- }
-
- m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
- b3Assert(m_prefixScanKernel);
-}
-
-b3RadixSort32CL::~b3RadixSort32CL()
-{
- delete m_scan;
- delete m_fill;
- delete m_workBuffer1;
- delete m_workBuffer2;
- delete m_workBuffer3;
- delete m_workBuffer3a;
- delete m_workBuffer4;
- delete m_workBuffer4a;
-
- clReleaseKernel(m_streamCountSortDataKernel);
- clReleaseKernel(m_streamCountKernel);
- clReleaseKernel(m_sortAndScatterSortDataKernel);
- clReleaseKernel(m_sortAndScatterKernel);
- clReleaseKernel(m_prefixScanKernel);
-}
-
-void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
-{
- int n = inout.size();
- const int BITS_PER_PASS = 8;
- const int NUM_TABLES = (1 << BITS_PER_PASS);
-
- int tables[NUM_TABLES];
- int counter[NUM_TABLES];
-
- b3SortData* src = &inout[0];
- b3AlignedObjectArray<b3SortData> workbuffer;
- workbuffer.resize(inout.size());
- b3SortData* dst = &workbuffer[0];
-
- int count = 0;
- for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
- {
- for (int i = 0; i < NUM_TABLES; i++)
- {
- tables[i] = 0;
- }
-
- for (int i = 0; i < n; i++)
- {
- int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
- tables[tableIdx]++;
- }
-//#define TEST
-#ifdef TEST
- printf("histogram size=%d\n", NUM_TABLES);
- for (int i = 0; i < NUM_TABLES; i++)
- {
- if (tables[i] != 0)
- {
- printf("tables[%d]=%d]\n", i, tables[i]);
- }
- }
-#endif //TEST
- // prefix scan
- int sum = 0;
- for (int i = 0; i < NUM_TABLES; i++)
- {
- int iData = tables[i];
- tables[i] = sum;
- sum += iData;
- counter[i] = 0;
- }
-
- // distribute
- for (int i = 0; i < n; i++)
- {
- int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
-
- dst[tables[tableIdx] + counter[tableIdx]] = src[i];
- counter[tableIdx]++;
- }
-
- b3Swap(src, dst);
- count++;
- }
-
- if (count & 1)
- {
- b3Assert(0); //need to copy
- }
-}
-
-void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
-{
- b3AlignedObjectArray<b3SortData> inout;
- keyValuesInOut.copyToHost(inout);
-
- executeHost(inout, sortBits);
-
- keyValuesInOut.copyFromHost(inout);
-}
-
-void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
- b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
-{
-}
-
-//#define DEBUG_RADIXSORT
-//#define DEBUG_RADIXSORT2
-
-void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
-{
- int originalSize = keyValuesInOut.size();
- int workingSize = originalSize;
-
- int dataAlignment = DATA_ALIGNMENT;
-
-#ifdef DEBUG_RADIXSORT2
- b3AlignedObjectArray<b3SortData> test2;
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n", test2.size());
- for (int i = 0; i < test2.size(); i++)
- {
- printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
- printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
- }
-#endif //DEBUG_RADIXSORT2
-
- b3OpenCLArray<b3SortData>* src = 0;
-
- if (workingSize % dataAlignment)
- {
- workingSize += dataAlignment - (workingSize % dataAlignment);
- m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
- m_workBuffer4->resize(workingSize);
- b3SortData fillValue;
- fillValue.m_key = 0xffffffff;
- fillValue.m_value = 0xffffffff;
-
-#define USE_BTFILL
-#ifdef USE_BTFILL
- m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
-#else
- //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
-
- for (int i = originalSize; i < workingSize; i++)
- {
- m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
- }
-#endif //USE_BTFILL
-
- src = m_workBuffer4;
- }
- else
- {
- src = &keyValuesInOut;
- m_workBuffer4->resize(0);
- }
-
- b3Assert(workingSize % DATA_ALIGNMENT == 0);
- int minCap = NUM_BUCKET * NUM_WGS;
-
- int n = workingSize;
-
- m_workBuffer1->resize(minCap);
- m_workBuffer3->resize(workingSize);
-
- // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
- b3Assert(BITS_PER_PASS == 4);
- b3Assert(WG_SIZE == 64);
- b3Assert((sortBits & 0x3) == 0);
-
- b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
-
- b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
- b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
-
- int nWGs = NUM_WGS;
- b3ConstData cdata;
-
- {
- int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
- int nBlocks = (n + blockSize - 1) / (blockSize);
- cdata.m_n = n;
- cdata.m_nWGs = NUM_WGS;
- cdata.m_startBit = 0;
- cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
- if (nBlocks < NUM_WGS)
- {
- cdata.m_nBlocksPerWG = 1;
- nWGs = nBlocks;
- }
- }
-
- int count = 0;
- for (int ib = 0; ib < sortBits; ib += 4)
- {
-#ifdef DEBUG_RADIXSORT2
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n", test2.size());
- for (int i = 0; i < test2.size(); i++)
- {
- if (test2[i].m_key != test2[i].m_value)
- {
- printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
- printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
- }
- }
-#endif //DEBUG_RADIXSORT2
-
- cdata.m_startBit = ib;
-
- if (src->size())
- {
- b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
- b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
-
- launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
- launcher.setConst(cdata);
-
- int num = NUM_WGS * WG_SIZE;
- launcher.launch1D(num, WG_SIZE);
- }
-
-#ifdef DEBUG_RADIXSORT
- b3AlignedObjectArray<unsigned int> testHist;
- srcHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
- for (int i = 0; i < testHist.size(); i++)
- {
- if (testHist[i] != 0)
- printf("testHist[%d]=%d\n", i, testHist[i]);
- }
-#endif //DEBUG_RADIXSORT
-
-//fast prefix scan is not working properly on Mac OSX yet
-#ifdef __APPLE__
- bool fastScan = false;
-#else
- bool fastScan = !m_deviceCPU; //only use fast scan on GPU
-#endif
-
- if (fastScan)
- { // prefix scan group histogram
- b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
- b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
- launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
- launcher.setConst(cdata);
- launcher.launch1D(128, 128);
- destHisto = srcHisto;
- }
- else
- {
- //unsigned int sum; //for debugging
- m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
- }
-
-#ifdef DEBUG_RADIXSORT
- destHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
- for (int i = 0; i < testHist.size(); i++)
- {
- if (testHist[i] != 0)
- printf("testHist[%d]=%d\n", i, testHist[i]);
- }
-
- for (int i = 0; i < testHist.size(); i += NUM_WGS)
- {
- printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
- }
-
-#endif //DEBUG_RADIXSORT
-
-#define USE_GPU
-#ifdef USE_GPU
-
- if (src->size())
- { // local sort and distribute
- b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
- b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
- launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
- launcher.setConst(cdata);
- launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
- }
-#else
- {
-#define NUM_TABLES 16
-//#define SEQUENTIAL
-#ifdef SEQUENTIAL
- int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- int tables[NUM_TABLES];
- int startBit = ib;
-
- destHisto->copyToHost(testHist);
- b3AlignedObjectArray<b3SortData> srcHost;
- b3AlignedObjectArray<b3SortData> dstHost;
- dstHost.resize(src->size());
-
- src->copyToHost(srcHost);
-
- for (int i = 0; i < NUM_TABLES; i++)
- {
- tables[i] = testHist[i * NUM_WGS];
- }
-
- // distribute
- for (int i = 0; i < n; i++)
- {
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
-
- dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
- counter2[tableIdx]++;
- }
-
-#else
-
- int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
- int tables[NUM_TABLES];
- b3AlignedObjectArray<b3SortData> dstHostOK;
- dstHostOK.resize(src->size());
-
- destHisto->copyToHost(testHist);
- b3AlignedObjectArray<b3SortData> srcHost;
- src->copyToHost(srcHost);
-
- int blockSize = 256;
- int nBlocksPerWG = cdata.m_nBlocksPerWG;
- int startBit = ib;
-
- {
- for (int i = 0; i < NUM_TABLES; i++)
- {
- tables[i] = testHist[i * NUM_WGS];
- }
-
- // distribute
- for (int i = 0; i < n; i++)
- {
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
-
- dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
- counter2[tableIdx]++;
- }
- }
-
- b3AlignedObjectArray<b3SortData> dstHost;
- dstHost.resize(src->size());
-
- int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
- for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
- {
- int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
- int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
-
- for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
- {
- for (int lIdx = 0; lIdx < 64; lIdx++)
- {
- int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
-
- // MY_HISTOGRAM( localKeys.x )++ is much more expensive than an atomic add, as it requires a read and a write, while atomics can just add on AMD
- // Using registers didn't perform well. It seems like using localKeys for addressing requires a lot of ALU ops
- // AMD: AtomInc performs better while NV prefers ++
- for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
- {
- if (addr + j < n)
- {
- // printf ("addr+j=%d\n", addr+j);
-
- int i = addr + j;
-
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
-
- int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
-
- b3SortData ok = dstHostOK[destIndex];
-
- if (ok.m_key != srcHost[i].m_key)
- {
- printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
- printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
- }
- if (ok.m_value != srcHost[i].m_value)
- {
- printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
- printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
- }
-
- dstHost[destIndex] = srcHost[i];
- counter[tableIdx]++;
- }
- }
- }
- }
- }
-
-#endif //SEQUENTIAL
-
- dst->copyFromHost(dstHost);
- }
-#endif //USE_GPU
-
-#ifdef DEBUG_RADIXSORT
- destHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
- for (int i = 0; i < testHist.size(); i++)
- {
- if (testHist[i] != 0)
- printf("testHist[%d]=%d\n", i, testHist[i]);
- }
-#endif //DEBUG_RADIXSORT
- b3Swap(src, dst);
- b3Swap(srcHisto, destHisto);
-
-#ifdef DEBUG_RADIXSORT2
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n", test2.size());
- for (int i = 0; i < test2.size(); i++)
- {
- if (test2[i].m_key != test2[i].m_value)
- {
- printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
- printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
- }
- }
-#endif //DEBUG_RADIXSORT2
-
- count++;
- }
-
- if (count & 1)
- {
- b3Assert(0); //need to copy from workbuffer to keyValuesInOut
- }
-
- if (m_workBuffer4->size())
- {
- m_workBuffer4->resize(originalSize);
- keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
- }
-
-#ifdef DEBUG_RADIXSORT
- keyValuesInOut.copyToHost(test2);
-
- printf("numElem = %d\n", test2.size());
- for (int i = 0; i < test2.size(); i++)
- {
- printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
- printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
- }
-#endif
-}
-
-void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
-{
- int originalSize = keysInOut.size();
- int workingSize = originalSize;
-
- int dataAlignment = DATA_ALIGNMENT;
-
- b3OpenCLArray<unsigned int>* src = 0;
-
- if (workingSize % dataAlignment)
- {
- workingSize += dataAlignment - (workingSize % dataAlignment);
- m_workBuffer4a->copyFromOpenCLArray(keysInOut);
- m_workBuffer4a->resize(workingSize);
- unsigned int fillValue = 0xffffffff;
-
- m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
-
- src = m_workBuffer4a;
- }
- else
- {
- src = &keysInOut;
- m_workBuffer4a->resize(0);
- }
-
- b3Assert(workingSize % DATA_ALIGNMENT == 0);
- int minCap = NUM_BUCKET * NUM_WGS;
-
- int n = workingSize;
-
- m_workBuffer1->resize(minCap);
- m_workBuffer3->resize(workingSize);
- m_workBuffer3a->resize(workingSize);
-
- // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
- b3Assert(BITS_PER_PASS == 4);
- b3Assert(WG_SIZE == 64);
- b3Assert((sortBits & 0x3) == 0);
-
- b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
-
- b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
- b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
-
- int nWGs = NUM_WGS;
- b3ConstData cdata;
-
- {
- int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
- int nBlocks = (n + blockSize - 1) / (blockSize);
- cdata.m_n = n;
- cdata.m_nWGs = NUM_WGS;
- cdata.m_startBit = 0;
- cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
- if (nBlocks < NUM_WGS)
- {
- cdata.m_nBlocksPerWG = 1;
- nWGs = nBlocks;
- }
- }
-
- int count = 0;
- for (int ib = 0; ib < sortBits; ib += 4)
- {
- cdata.m_startBit = ib;
-
- if (src->size())
- {
- b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
- b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
-
- launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
- launcher.setConst(cdata);
-
- int num = NUM_WGS * WG_SIZE;
- launcher.launch1D(num, WG_SIZE);
- }
-
-//fast prefix scan is not working properly on Mac OSX yet
-#ifdef __APPLE__
- bool fastScan = false;
-#else
- bool fastScan = !m_deviceCPU;
-#endif
-
- if (fastScan)
- { // prefix scan group histogram
- b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
- b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
- launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
- launcher.setConst(cdata);
- launcher.launch1D(128, 128);
- destHisto = srcHisto;
- }
- else
- {
- //unsigned int sum; //for debugging
- m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
- }
-
- if (src->size())
- { // local sort and distribute
- b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
- b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
- launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
- launcher.setConst(cdata);
- launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
- }
-
- b3Swap(src, dst);
- b3Swap(srcHisto, destHisto);
-
- count++;
- }
-
- if (count & 1)
- {
- b3Assert(0); //need to copy from workbuffer to keyValuesInOut
- }
-
- if (m_workBuffer4a->size())
- {
- m_workBuffer4a->resize(originalSize);
- keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
- }
-}