Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives')
26 files changed, 5567 insertions, 0 deletions
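The classes introduced below are designed to compose: b3OpenCLArray<T> is the device-side container, b3LauncherCL wraps kernel argument binding and dispatch, and b3FillCL, b3PrefixScanCL, b3PrefixScanFloat4CL, b3RadixSort32CL and b3BoundSearchCL build on both. A minimal usage sketch, assuming the context/device helpers declared in the vendored Bullet3OpenCL/Initialize/b3OpenCLUtils.h (createContextFromType and getDevice; verify the exact signatures there) and eliding error checks:

#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"

// Illustrative only; not part of the imported sources.
void sortKeyValuePairsOnGpu()
{
	cl_int err;
	cl_context ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_GPU, &err);
	cl_device_id dev = b3OpenCLUtils::getDevice(ctx, 0);
	cl_command_queue queue = clCreateCommandQueue(ctx, dev, 0, &err);

	// Host-side key/value pairs; b3SortData (m_key/m_value) comes from b3RadixSort32CL.h.
	b3AlignedObjectArray<b3SortData> host;
	for (int i = 0; i < 1024; i++)
	{
		b3SortData sd;
		sd.m_key = 1024 - i;  // descending keys
		sd.m_value = i;       // payload carried along with each key
		host.push_back(sd);
	}

	// Upload, sort in place on the device, download.
	b3OpenCLArray<b3SortData> gpuData(ctx, queue);
	gpuData.copyFromHost(host);

	b3RadixSort32CL sorter(ctx, dev, queue, host.size());
	sorter.execute(gpuData, 32);  // sort on all 32 key bits

	gpuData.copyToHost(host);     // host is now ascending by m_key

	clReleaseCommandQueue(queue);
	clReleaseContext(ctx);
}

Note that execute() pads its working copy to DATA_ALIGNMENT with 0xffffffff sentinel keys (see b3RadixSort32CL.cpp below), so input sizes need not be aligned.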
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp new file mode 100644 index 0000000000..a4980f71e1 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp @@ -0,0 +1,213 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada +//Host-code rewritten by Erwin Coumans + +#define BOUNDSEARCH_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl" +#define KERNEL0 "SearchSortDataLowerKernel" +#define KERNEL1 "SearchSortDataUpperKernel" +#define KERNEL2 "SubtractKernel" + + +#include "b3BoundSearchCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3LauncherCL.h" +#include "kernels/BoundSearchKernelsCL.h" + +b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize) + :m_context(ctx), + m_device(device), + m_queue(queue) +{ + + const char* additionalMacros = ""; + //const char* srcFileNameForCaching=""; + + cl_int pErrNum; + const char* kernelSource = boundSearchKernelsCL; + + cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH); + b3Assert(boundSearchProg); + + m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros ); + b3Assert(m_lowerSortDataKernel ); + + m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros ); + b3Assert(m_upperSortDataKernel); + + m_subtractKernel = 0; + + if( maxSize ) + { + m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros ); + b3Assert(m_subtractKernel); + } + + //m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST ); + + m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize ); + m_upper = (maxSize == 0)? 
0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize ); + + m_filler = new b3FillCL(ctx,device,queue); +} + +b3BoundSearchCL::~b3BoundSearchCL() +{ + + delete m_lower; + delete m_upper; + delete m_filler; + + clReleaseKernel(m_lowerSortDataKernel); + clReleaseKernel(m_upperSortDataKernel); + clReleaseKernel(m_subtractKernel); + + +} + + +void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option ) +{ + b3Int4 constBuffer; + constBuffer.x = nSrc; + constBuffer.y = nDst; + + if( option == BOUND_LOWER ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) }; + + b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nSrc, 64 ); + } + else if( option == BOUND_UPPER ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) }; + + b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nSrc, 64 ); + } + else if( option == COUNT ) + { + b3Assert( m_lower ); + b3Assert( m_upper ); + b3Assert( m_lower->capacity() <= (int)nDst ); + b3Assert( m_upper->capacity() <= (int)nDst ); + + int zero = 0; + m_filler->execute( *m_lower, zero, nDst ); + m_filler->execute( *m_upper, zero, nDst ); + + execute( src, nSrc, *m_lower, nDst, BOUND_LOWER ); + execute( src, nSrc, *m_upper, nDst, BOUND_UPPER ); + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) }; + + b3LauncherCL launcher( m_queue, m_subtractKernel ,"m_subtractKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( nSrc ); + launcher.setConst( nDst ); + + launcher.launch1D( nDst, 64 ); + } + } + else + { + b3Assert( 0 ); + } + +} + + +void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, + b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option ) +{ + + + for(int i=0; i<nSrc-1; i++) + b3Assert( src[i].m_key <= src[i+1].m_key ); + + b3SortData minData,zeroData,maxData; + minData.m_key = -1; + minData.m_value = -1; + zeroData.m_key=0; + zeroData.m_value=0; + maxData.m_key = nDst; + maxData.m_value = nDst; + + if( option == BOUND_LOWER ) + { + for(int i=0; i<nSrc; i++) + { + b3SortData& iData = (i==0)? minData: src[i-1]; + b3SortData& jData = (i==nSrc)? maxData: src[i]; + + if( iData.m_key != jData.m_key ) + { + int k = jData.m_key; + { + dst[k] = i; + } + } + } + } + else if( option == BOUND_UPPER ) + { + for(int i=1; i<nSrc+1; i++) + { + b3SortData& iData = src[i-1]; + b3SortData& jData = (i==nSrc)? 
maxData: src[i]; + + if( iData.m_key != jData.m_key ) + { + int k = iData.m_key; + { + dst[k] = i; + } + } + } + } + else if( option == COUNT ) + { + b3AlignedObjectArray<unsigned int> lower; + lower.resize(nDst ); + b3AlignedObjectArray<unsigned int> upper; + upper.resize(nDst ); + + for(int i=0; i<nDst; i++) + { + lower[i] = upper[i] = 0; + } + + executeHost( src, nSrc, lower, nDst, BOUND_LOWER ); + executeHost( src, nSrc, upper, nDst, BOUND_UPPER ); + + for( int i=0; i<nDst; i++) + { + dst[i] = upper[i] - lower[i]; + } + } + else + { + b3Assert( 0 ); + } +} diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h new file mode 100644 index 0000000000..7e2940965c --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h @@ -0,0 +1,67 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#ifndef B3_BOUNDSEARCH_H +#define B3_BOUNDSEARCH_H + +#pragma once + +/*#include <Adl/Adl.h> +#include <AdlPrimitives/Math/Math.h> +#include <AdlPrimitives/Sort/SortData.h> +#include <AdlPrimitives/Fill/Fill.h> +*/ + +#include "b3OpenCLArray.h" +#include "b3FillCL.h" +#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?) 
+class b3BoundSearchCL +{ + public: + + enum Option + { + BOUND_LOWER, + BOUND_UPPER, + COUNT, + }; + + cl_context m_context; + cl_device_id m_device; + cl_command_queue m_queue; + + + cl_kernel m_lowerSortDataKernel; + cl_kernel m_upperSortDataKernel; + cl_kernel m_subtractKernel; + + b3OpenCLArray<b3Int4>* m_constbtOpenCLArray; + b3OpenCLArray<unsigned int>* m_lower; + b3OpenCLArray<unsigned int>* m_upper; + + b3FillCL* m_filler; + + b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size); + + virtual ~b3BoundSearchCL(); + + // src has to be src[i].m_key <= src[i+1].m_key + void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER ); + + void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER); +}; + + +#endif //B3_BOUNDSEARCH_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h new file mode 100644 index 0000000000..52f219ae3f --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h @@ -0,0 +1,19 @@ + +#ifndef B3_BUFFER_INFO_CL_H +#define B3_BUFFER_INFO_CL_H + +#include "b3OpenCLArray.h" + + +struct b3BufferInfoCL +{ + //b3BufferInfoCL(){} + +// template<typename T> + b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){} + + cl_mem m_clBuffer; + bool m_isReadOnly; +}; + +#endif //B3_BUFFER_INFO_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp new file mode 100644 index 0000000000..f05c2648f1 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp @@ -0,0 +1,126 @@ +#include "b3FillCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3BufferInfoCL.h" +#include "b3LauncherCL.h" + +#define FILL_CL_PROGRAM_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl" + +#include "kernels/FillKernelsCL.h" + +b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue) +:m_commandQueue(queue) +{ + const char* kernelSource = fillKernelsCL; + cl_int pErrNum; + const char* additionalMacros = ""; + + cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH); + b3Assert(fillProg); + + m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillIntKernel); + + m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillIntKernel); + + m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillFloatKernel); + + + + m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros ); + b3Assert(m_fillKernelInt2); + +} + +b3FillCL::~b3FillCL() +{ + clReleaseKernel(m_fillKernelInt2); + clReleaseKernel(m_fillIntKernel); + clReleaseKernel(m_fillUnsignedIntKernel); + clReleaseKernel(m_fillFloatKernel); + +} + +void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset) +{ + 
b3Assert( n>0 ); + + { + b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" ); + launcher.setBuffer( src.getBufferCL()); + launcher.setConst( n ); + launcher.setConst( value ); + launcher.setConst( offset); + + launcher.launch1D( n ); + } +} + +void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset) +{ + b3Assert( n>0 ); + + + { + b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel"); + launcher.setBuffer(src.getBufferCL()); + launcher.setConst( n); + launcher.setConst( value); + launcher.setConst( offset); + launcher.launch1D( n ); + } +} + + +void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset) +{ + b3Assert( n>0 ); + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( n ); + launcher.setConst(value); + launcher.setConst(offset); + + launcher.launch1D( n ); + } +} + +void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset) +{ + for (int i=0;i<n;i++) + { + src[i+offset]=value; + } +} + +void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset) +{ + for (int i=0;i<n;i++) + { + src[i+offset]=value; + } +} + +void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset) +{ + b3Assert( n>0 ); + + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) }; + + b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(n); + launcher.setConst(value); + launcher.setConst(offset); + + //( constBuffer ); + launcher.launch1D( n ); + } +} diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h new file mode 100644 index 0000000000..1609676b9d --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h @@ -0,0 +1,63 @@ +#ifndef B3_FILL_CL_H +#define B3_FILL_CL_H + +#include "b3OpenCLArray.h" +#include "Bullet3Common/b3Scalar.h" + +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" + + +class b3FillCL +{ + + cl_command_queue m_commandQueue; + + cl_kernel m_fillKernelInt2; + cl_kernel m_fillIntKernel; + cl_kernel m_fillUnsignedIntKernel; + cl_kernel m_fillFloatKernel; + + public: + + struct b3ConstData + { + union + { + b3Int4 m_data; + b3UnsignedInt4 m_UnsignedData; + }; + int m_offset; + int m_n; + int m_padding[2]; + }; + +protected: + +public: + + b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue); + + virtual ~b3FillCL(); + + void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0); + + void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0); + + void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0); + + void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0); + + void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset); + + void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset); + + // void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0); + +}; + + + + + +#endif //B3_FILL_CL_H diff 
--git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp new file mode 100644 index 0000000000..94590d11ca --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp @@ -0,0 +1,308 @@ +#include "b3LauncherCL.h" + +bool gDebugLauncherCL = false; + +b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name) +:m_commandQueue(queue), +m_kernel(kernel), +m_idx(0), +m_enableSerialization(false), +m_name(name) +{ + if (gDebugLauncherCL) + { + static int counter = 0; + printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name); + } + + m_serializationSizeInBytes = sizeof(int); +} + +b3LauncherCL::~b3LauncherCL() + { + for (int i=0;i<m_arrays.size();i++) + { + delete (m_arrays[i]); + } + + m_arrays.clear(); + if (gDebugLauncherCL) + { + static int counter = 0; + printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name); + } + } + +void b3LauncherCL::setBuffer( cl_mem clBuffer) +{ + if (m_enableSerialization) + { + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 1; + kernelArg.m_clBuffer = clBuffer; + + cl_mem_info param_name = CL_MEM_SIZE; + size_t param_value; + size_t sizeInBytes = sizeof(size_t); + size_t actualSizeInBytes; + cl_int err; + err = clGetMemObjectInfo ( kernelArg.m_clBuffer, + param_name, + sizeInBytes, + ¶m_value, + &actualSizeInBytes); + + b3Assert( err == CL_SUCCESS ); + kernelArg.m_argSizeInBytes = param_value; + + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+= sizeof(b3KernelArgData); + m_serializationSizeInBytes+=param_value; + } + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer); + b3Assert( status == CL_SUCCESS ); +} + + +void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n ) +{ + for(int i=0; i<n; i++) + { + if (m_enableSerialization) + { + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 1; + kernelArg.m_clBuffer = buffInfo[i].m_clBuffer; + + cl_mem_info param_name = CL_MEM_SIZE; + size_t param_value; + size_t sizeInBytes = sizeof(size_t); + size_t actualSizeInBytes; + cl_int err; + err = clGetMemObjectInfo ( kernelArg.m_clBuffer, + param_name, + sizeInBytes, + ¶m_value, + &actualSizeInBytes); + + b3Assert( err == CL_SUCCESS ); + kernelArg.m_argSizeInBytes = param_value; + + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+= sizeof(b3KernelArgData); + m_serializationSizeInBytes+=param_value; + } + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer); + b3Assert( status == CL_SUCCESS ); + } +} + +struct b3KernelArgDataUnaligned +{ + int m_isBuffer; + int m_argIndex; + int m_argSizeInBytes; + int m_unusedPadding; + union + { + cl_mem m_clBuffer; + unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; + }; + +}; +#include <string.h> + + + +int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx) +{ + int index=0; + + int numArguments = *(int*) &buf[index]; + index+=sizeof(int); + + for (int i=0;i<numArguments;i++) + { + b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index]; + + index+=sizeof(b3KernelArgData); + if (arg->m_isBuffer) + { + b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes); + clData->resize(arg->m_argSizeInBytes); + + clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes); + + arg->m_clBuffer = 
clData->getBufferCL(); + + m_arrays.push_back(clData); + + cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer); + b3Assert( status == CL_SUCCESS ); + index+=arg->m_argSizeInBytes; + } else + { + cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData); + b3Assert( status == CL_SUCCESS ); + } + b3KernelArgData b; + memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned)); + m_kernelArguments.push_back(b); + } +m_serializationSizeInBytes = index; + return index; +} + +int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx) + { + int index=0; + + int numArguments = *(int*) &goldBuffer[index]; + index+=sizeof(int); + + if (numArguments != m_kernelArguments.size()) + { + printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size()); + return -1; + } + + for (int ii=0;ii<numArguments;ii++) + { + b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index]; + + if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes) + { + printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes); + return -2; + } + + { + int expected = argGold->m_isBuffer; + int found = m_kernelArguments[ii].m_isBuffer; + + if (expected != found) + { + printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found); + return -3; + } + } + index+=sizeof(b3KernelArgData); + + if (argGold->m_isBuffer) + { + + unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes); + unsigned char* goldBuf = &goldBuffer[index]; + for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++) + { + memBuf[j] = 0xaa; + } + + cl_int status = 0; + status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes, + memBuf, 0,0,0 ); + b3Assert( status==CL_SUCCESS ); + clFinish(m_commandQueue); + + for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++) + { + int expected = goldBuf[b]; + int found = memBuf[b]; + if (expected != found) + { + printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n", + ii, b, expected, found); + return -4; + } + } + + + index+=argGold->m_argSizeInBytes; + } else + { + + //compare content + for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++) + { + int expected = argGold->m_argData[b]; + int found =m_kernelArguments[ii].m_argData[b]; + if (expected != found) + { + printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n", + ii, b, expected, found); + return -5; + } + } + + } + } + return index; + +} + +int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity) +{ +//initialize to known values +for (int i=0;i<destBufferCapacity;i++) + destBuffer[i] = 0xec; + + assert(destBufferCapacity>=m_serializationSizeInBytes); + + //todo: use the b3Serializer for this to allow for 32/64bit, endianness etc + int numArguments = m_kernelArguments.size(); + int curBufferSize = 0; + int* dest = (int*)&destBuffer[curBufferSize]; + *dest = numArguments; + curBufferSize += sizeof(int); + + + + for (int i=0;i<this->m_kernelArguments.size();i++) + { + b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize]; + *arg = m_kernelArguments[i]; + curBufferSize+=sizeof(b3KernelArgData); + if (arg->m_isBuffer==1) + { + //copy the OpenCL buffer content + cl_int 
status = 0; + status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes, + &destBuffer[curBufferSize], 0,0,0 ); + b3Assert( status==CL_SUCCESS ); + clFinish(m_commandQueue); + curBufferSize+=arg->m_argSizeInBytes; + } + + } + return curBufferSize; +} + +void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems) +{ + int num = numWorkItems; + int buffSize = getSerializationBufferSize(); + unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; + for (int i=0;i<buffSize+1;i++) + { + unsigned char* ptr = (unsigned char*)&buf[i]; + *ptr = 0xff; + } +// int actualWrite = serializeArguments(buf,buffSize); + +// unsigned char* cptr = (unsigned char*)&buf[buffSize]; +// printf("buf[buffSize] = %d\n",*cptr); + + assert(buf[buffSize]==0xff);//check for buffer overrun + int* ptr = (int*)&buf[buffSize]; + + *ptr = num; + + FILE* f = fopen(fileName,"wb"); + fwrite(buf,buffSize+sizeof(int),1,f); + fclose(f); + + delete[] buf; +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h new file mode 100644 index 0000000000..1b267b31ef --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h @@ -0,0 +1,135 @@ + +#ifndef B3_LAUNCHER_CL_H +#define B3_LAUNCHER_CL_H + +#include "b3BufferInfoCL.h" +#include "Bullet3Common/b3MinMax.h" +#include "b3OpenCLArray.h" +#include <stdio.h> + +#define B3_DEBUG_SERIALIZE_CL + + +#ifdef _WIN32 +#pragma warning(disable :4996) +#endif +#define B3_CL_MAX_ARG_SIZE 16 +B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData +{ + int m_isBuffer; + int m_argIndex; + int m_argSizeInBytes; + int m_unusedPadding; + union + { + cl_mem m_clBuffer; + unsigned char m_argData[B3_CL_MAX_ARG_SIZE]; + }; + +}; + +class b3LauncherCL +{ + + cl_command_queue m_commandQueue; + cl_kernel m_kernel; + int m_idx; + + b3AlignedObjectArray<b3KernelArgData> m_kernelArguments; + int m_serializationSizeInBytes; + bool m_enableSerialization; + + const char* m_name; + public: + + b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays; + + b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name); + + virtual ~b3LauncherCL(); + + void setBuffer( cl_mem clBuffer); + + void setBuffers( b3BufferInfoCL* buffInfo, int n ); + + int getSerializationBufferSize() const + { + return m_serializationSizeInBytes; + } + + int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx); + + inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx); + + int serializeArguments(unsigned char* destBuffer, int destBufferCapacity); + + int getNumArguments() const + { + return m_kernelArguments.size(); + } + + b3KernelArgData getArgument(int index) + { + return m_kernelArguments[index]; + } + + void serializeToFile(const char* fileName, int numWorkItems); + + template<typename T> + inline void setConst( const T& consts ) + { + int sz=sizeof(T); + b3Assert(sz<=B3_CL_MAX_ARG_SIZE); + + if (m_enableSerialization) + { + b3KernelArgData kernelArg; + kernelArg.m_argIndex = m_idx; + kernelArg.m_isBuffer = 0; + T* destArg = (T*)kernelArg.m_argData; + *destArg = consts; + kernelArg.m_argSizeInBytes = sizeof(T); + m_kernelArguments.push_back(kernelArg); + m_serializationSizeInBytes+=sizeof(b3KernelArgData); + } + + cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts ); + b3Assert( status == CL_SUCCESS ); + } + + inline void launch1D( int numThreads, int localSize = 64) + { + launch2D( numThreads, 1, 
localSize, 1 ); + } + + inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY ) + { + size_t gRange[3] = {1,1,1}; + size_t lRange[3] = {1,1,1}; + lRange[0] = localSizeX; + lRange[1] = localSizeY; + gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1)); + gRange[0] *= lRange[0]; + gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1)); + gRange[1] *= lRange[1]; + + cl_int status = clEnqueueNDRangeKernel( m_commandQueue, + m_kernel, 2, NULL, gRange, lRange, 0,0,0 ); + if (status != CL_SUCCESS) + { + printf("Error: OpenCL status = %d\n",status); + } + b3Assert( status == CL_SUCCESS ); + + } + + void enableSerialization(bool serialize) + { + m_enableSerialization = serialize; + } + +}; + + + +#endif //B3_LAUNCHER_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h new file mode 100644 index 0000000000..d70c30f53f --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h @@ -0,0 +1,306 @@ +#ifndef B3_OPENCL_ARRAY_H +#define B3_OPENCL_ARRAY_H + +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h" + +template <typename T> +class b3OpenCLArray +{ + size_t m_size; + size_t m_capacity; + cl_mem m_clBuffer; + + cl_context m_clContext; + cl_command_queue m_commandQueue; + + bool m_ownsMemory; + + bool m_allowGrowingCapacity; + + void deallocate() + { + if (m_clBuffer && m_ownsMemory) + { + clReleaseMemObject(m_clBuffer); + } + m_clBuffer = 0; + m_capacity=0; + } + + b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src); + + B3_FORCE_INLINE size_t allocSize(size_t size) + { + return (size ? size*2 : 1); + } + +public: + + b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true) + :m_size(0), m_capacity(0),m_clBuffer(0), + m_clContext(ctx),m_commandQueue(queue), + m_ownsMemory(true),m_allowGrowingCapacity(true) + { + if (initialCapacity) + { + reserve(initialCapacity); + } + m_allowGrowingCapacity = allowGrowingCapacity; + } + + ///this is an error-prone method with no error checking, be careful! 
+ void setFromOpenCLBuffer(cl_mem buffer, size_t sizeInElements) + { + deallocate(); + m_ownsMemory = false; + m_allowGrowingCapacity = false; + m_clBuffer = buffer; + m_size = sizeInElements; + m_capacity = sizeInElements; + } + +// we could enable this assignment, but need to make sure to avoid accidental deep copies +// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src) +// { +// copyFromArray(src); +// return *this; +// } + + + cl_mem getBufferCL() const + { + return m_clBuffer; + } + + + virtual ~b3OpenCLArray() + { + deallocate(); + m_size=0; + m_capacity=0; + } + + B3_FORCE_INLINE bool push_back(const T& _Val,bool waitForCompletion=true) + { + bool result = true; + size_t sz = size(); + if( sz == capacity() ) + { + result = reserve( allocSize(size()) ); + } + copyFromHostPointer(&_Val, 1, sz, waitForCompletion); + m_size++; + return result; + } + + B3_FORCE_INLINE T forcedAt(size_t n) const + { + b3Assert(n>=0); + b3Assert(n<capacity()); + T elem; + copyToHostPointer(&elem,1,n,true); + return elem; + } + + B3_FORCE_INLINE T at(size_t n) const + { + b3Assert(n>=0); + b3Assert(n<size()); + T elem; + copyToHostPointer(&elem,1,n,true); + return elem; + } + + B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents=true) + { + bool result = true; + size_t curSize = size(); + + if (newsize < curSize) + { + //leave the OpenCL memory for now + } else + { + if (newsize > size()) + { + result = reserve(newsize,copyOldContents); + } + + //leave new data uninitialized (init in debug mode?) + //for (size_t i=curSize;i<newsize;i++) ... + } + + if (result) + { + m_size = newsize; + } else + { + m_size = 0; + } + return result; + } + + B3_FORCE_INLINE size_t size() const + { + return m_size; + } + + B3_FORCE_INLINE size_t capacity() const + { + return m_capacity; + } + + B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents=true) + { + bool result=true; + // determine new minimum length of allocated storage + if (capacity() < _Count) + { // not enough room, reallocate + + if (m_allowGrowingCapacity) + { + cl_int ciErrNum; + //create a new OpenCL buffer + size_t memSizeInBytes = sizeof(T)*_Count; + cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum); + if (ciErrNum!=CL_SUCCESS) + { + b3Error("OpenCL out-of-memory\n"); + _Count = 0; + result = false; + } +//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS +#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS + unsigned char* src = (unsigned char*)malloc(memSizeInBytes); + for (size_t i=0;i<memSizeInBytes;i++) + src[i] = 0xbb; + ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 ); + b3Assert(ciErrNum==CL_SUCCESS); + clFinish(m_commandQueue); + free(src); +#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS + + if (result) + { + if (copyOldContents) + copyToCL(buf, size()); + } + + //deallocate the old buffer + deallocate(); + + m_clBuffer = buf; + + m_capacity = _Count; + } else + { + //fail: assert and + b3Assert(0); + deallocate(); + result=false; + } + } + return result; + } + + + void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const + { + if (numElements<=0) + return; + + b3Assert(m_clBuffer); + b3Assert(destination); + + //likely some error, destination is same as source + b3Assert(m_clBuffer != destination); + + b3Assert((firstElem+numElements)<=m_size); + + cl_int status = 0; + + + b3Assert(numElements>0); + b3Assert(numElements<=m_size); + + size_t srcOffsetBytes = sizeof(T)*firstElem; + size_t 
dstOffsetInBytes = sizeof(T)*dstOffsetInElems; + + status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination, + srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 ); + + b3Assert( status == CL_SUCCESS ); + } + + void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true) + { + size_t newSize = srcArray.size(); + + bool copyOldContents = false; + resize (newSize,copyOldContents); + if (newSize) + copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion); + + } + + void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true) + { + b3Assert(numElems+destFirstElem <= capacity()); + + if (numElems+destFirstElem) + { + cl_int status = 0; + size_t sizeInBytes=sizeof(T)*numElems; + status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes, + src, 0,0,0 ); + b3Assert(status == CL_SUCCESS ); + if (waitForCompletion) + clFinish(m_commandQueue); + } else + { + b3Error("copyFromHostPointer invalid range\n"); + } + } + + + void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const + { + destArray.resize(this->size()); + if (size()) + copyToHostPointer(&destArray[0], size(),0,waitForCompletion); + } + + void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const + { + b3Assert(numElem+srcFirstElem <= capacity()); + + if(numElem+srcFirstElem <= capacity()) + { + cl_int status = 0; + status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem, + destPtr, 0,0,0 ); + b3Assert( status==CL_SUCCESS ); + + if (waitForCompletion) + clFinish(m_commandQueue); + } else + { + b3Error("copyToHostPointer invalid range\n"); + } + } + + void copyFromOpenCLArray(const b3OpenCLArray& src) + { + size_t newSize = src.size(); + resize(newSize); + if (size()) + { + src.copyToCL(m_clBuffer,size()); + } + } + +}; + + +#endif //B3_OPENCL_ARRAY_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp new file mode 100644 index 0000000000..42cd197740 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp @@ -0,0 +1,126 @@ +#include "b3PrefixScanCL.h" +#include "b3FillCL.h" +#define B3_PREFIXSCAN_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl" + +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "kernels/PrefixScanKernelsCL.h" + +b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) +:m_commandQueue(queue) +{ + const char* scanKernelSource = prefixScanKernelsCL; + cl_int pErrNum; + char* additionalMacros=0; + + m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size); + cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH); + b3Assert(scanProg); + + m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_localScanKernel ); + m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_blockSumKernel ); + m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", 
&pErrNum, scanProg,additionalMacros ); + b3Assert(m_propagationKernel ); +} + + +b3PrefixScanCL::~b3PrefixScanCL() +{ + delete m_workBuffer; + clReleaseKernel(m_localScanKernel); + clReleaseKernel(m_blockSumKernel); + clReleaseKernel(m_propagationKernel); +} + +template<class T> +T b3NextPowerOf2(T n) +{ + n -= 1; + for(int i=0; i<sizeof(T)*8; i++) + n = n | (n>>i); + return n+1; +} + +void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum) +{ + +// b3Assert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + + dst.resize(src.size()); + m_workBuffer->resize(src.size()); + + b3Int4 constBuffer; + constBuffer.x = n; + constBuffer.y = numBlocks; + constBuffer.z = (int)b3NextPowerOf2( numBlocks ); + + b3OpenCLArray<unsigned int>* srcNative = &src; + b3OpenCLArray<unsigned int>* dstNative = &dst; + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + } + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( numBlocks > 1 ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( sum ) + { + clFinish(m_commandQueue); + dstNative->copyToHostPointer(sum,1,n-1,true); + } + +} + + +void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum) +{ + unsigned int s = 0; + //if( data->m_option == EXCLUSIVE ) + { + for(int i=0; i<n; i++) + { + dst[i] = s; + s += src[i]; + } + } + /*else + { + for(int i=0; i<n; i++) + { + s += hSrc[i]; + hDst[i] = s; + } + } + */ + + if( sum ) + { + *sum = dst[n-1]; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h new file mode 100644 index 0000000000..a9a2e61b9e --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h @@ -0,0 +1,37 @@ + +#ifndef B3_PREFIX_SCAN_CL_H +#define B3_PREFIX_SCAN_CL_H + +#include "b3OpenCLArray.h" +#include "b3BufferInfoCL.h" +#include "Bullet3Common/b3AlignedObjectArray.h" + +class b3PrefixScanCL +{ + enum + { + BLOCK_SIZE = 128 + }; + +// Option m_option; + + cl_command_queue m_commandQueue; + + cl_kernel m_localScanKernel; + cl_kernel m_blockSumKernel; + cl_kernel m_propagationKernel; + + b3OpenCLArray<unsigned int>* m_workBuffer; + + + public: + + b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); + + virtual ~b3PrefixScanCL(); + + void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0); + void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0); +}; + +#endif //B3_PREFIX_SCAN_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp new file mode 100644 index 0000000000..80560d793d --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp @@ -0,0 +1,126 @@ +#include "b3PrefixScanFloat4CL.h" +#include "b3FillCL.h" +#define B3_PREFIXSCAN_FLOAT4_PROG_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl" + +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "kernels/PrefixScanKernelsFloat4CL.h" + +b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size) +:m_commandQueue(queue) +{ + const char* scanKernelSource = prefixScanKernelsFloat4CL; + cl_int pErrNum; + char* additionalMacros=0; + + m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size); + cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH); + b3Assert(scanProg); + + m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_localScanKernel ); + m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_blockSumKernel ); + m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros ); + b3Assert(m_propagationKernel ); +} + + +b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL() +{ + delete m_workBuffer; + clReleaseKernel(m_localScanKernel); + clReleaseKernel(m_blockSumKernel); + clReleaseKernel(m_propagationKernel); +} + +template<class T> +T b3NextPowerOf2(T n) +{ + n -= 1; + for(int i=0; i<sizeof(T)*8; i++) + n = n | (n>>i); + return n+1; +} + +void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum) +{ + +// b3Assert( data->m_option == EXCLUSIVE ); + const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) ); + + dst.resize(src.size()); + m_workBuffer->resize(src.size()); + + b3Int4 constBuffer; 
+ constBuffer.x = n; + constBuffer.y = numBlocks; + constBuffer.z = (int)b3NextPowerOf2( numBlocks ); + + b3OpenCLArray<b3Vector3>* srcNative = &src; + b3OpenCLArray<b3Vector3>* dstNative = &dst; + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE ); + } + + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + + b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( numBlocks > 1 ) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( constBuffer ); + launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE ); + } + + + if( sum ) + { + clFinish(m_commandQueue); + dstNative->copyToHostPointer(sum,1,n-1,true); + } + +} + + +void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum) +{ + b3Vector3 s=b3MakeVector3(0,0,0); + //if( data->m_option == EXCLUSIVE ) + { + for(int i=0; i<n; i++) + { + dst[i] = s; + s += src[i]; + } + } + /*else + { + for(int i=0; i<n; i++) + { + s += hSrc[i]; + hDst[i] = s; + } + } + */ + + if( sum ) + { + *sum = dst[n-1]; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h new file mode 100644 index 0000000000..2c8003c1bb --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h @@ -0,0 +1,38 @@ + +#ifndef B3_PREFIX_SCAN_CL_H +#define B3_PREFIX_SCAN_CL_H + +#include "b3OpenCLArray.h" +#include "b3BufferInfoCL.h" +#include "Bullet3Common/b3AlignedObjectArray.h" +#include "Bullet3Common/b3Vector3.h" + +class b3PrefixScanFloat4CL +{ + enum + { + BLOCK_SIZE = 128 + }; + +// Option m_option; + + cl_command_queue m_commandQueue; + + cl_kernel m_localScanKernel; + cl_kernel m_blockSumKernel; + cl_kernel m_propagationKernel; + + b3OpenCLArray<b3Vector3>* m_workBuffer; + + + public: + + b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0); + + virtual ~b3PrefixScanFloat4CL(); + + void execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum = 0); + void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum); +}; + +#endif //B3_PREFIX_SCAN_CL_H diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp new file mode 100644 index 0000000000..f11ae4bcdb --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp @@ -0,0 +1,710 @@ + +#include "b3RadixSort32CL.h" +#include "b3LauncherCL.h" +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "b3PrefixScanCL.h" +#include "b3FillCL.h" + +#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl" + +#include "kernels/RadixSort32KernelsCL.h" + +b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) +:m_commandQueue(queue) +{ + b3OpenCLDeviceInfo info; + b3OpenCLUtils::getDeviceInfo(device,&info); + m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; + + m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue); + m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue); + m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue); + m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue); + + + if (initialCapacity>0) + { + m_workBuffer1->resize(initialCapacity); + m_workBuffer3->resize(initialCapacity); + m_workBuffer3a->resize(initialCapacity); + m_workBuffer4->resize(initialCapacity); + m_workBuffer4a->resize(initialCapacity); + } + + m_scan = new b3PrefixScanCL(ctx,device,queue); + m_fill = new b3FillCL(ctx,device,queue); + + const char* additionalMacros = ""; + + cl_int pErrNum; + const char* kernelSource = radixSort32KernelsCL; + + cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); + b3Assert(sortProg); + + m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_streamCountSortDataKernel ); + + + + m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_streamCountKernel); + + + + if 
(m_deviceCPU) + { + + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterKernel); + } else + { + m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterSortDataKernel); + m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_sortAndScatterKernel); + } + + m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros ); + b3Assert(m_prefixScanKernel); + +} + +b3RadixSort32CL::~b3RadixSort32CL() +{ + delete m_scan; + delete m_fill; + delete m_workBuffer1; + delete m_workBuffer2; + delete m_workBuffer3; + delete m_workBuffer3a; + delete m_workBuffer4; + delete m_workBuffer4a; + + clReleaseKernel(m_streamCountSortDataKernel); + clReleaseKernel(m_streamCountKernel); + clReleaseKernel(m_sortAndScatterSortDataKernel); + clReleaseKernel(m_sortAndScatterKernel); + clReleaseKernel(m_prefixScanKernel); +} + +void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */) +{ + int n = inout.size(); + const int BITS_PER_PASS = 8; + const int NUM_TABLES = (1<<BITS_PER_PASS); + + + int tables[NUM_TABLES]; + int counter[NUM_TABLES]; + + b3SortData* src = &inout[0]; + b3AlignedObjectArray<b3SortData> workbuffer; + workbuffer.resize(inout.size()); + b3SortData* dst = &workbuffer[0]; + + int count=0; + for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS) + { + for(int i=0; i<NUM_TABLES; i++) + { + tables[i] = 0; + } + + for(int i=0; i<n; i++) + { + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + tables[tableIdx]++; + } +//#define TEST +#ifdef TEST + printf("histogram size=%d\n",NUM_TABLES); + for (int i=0;i<NUM_TABLES;i++) + { + if (tables[i]!=0) + { + printf("tables[%d]=%d]\n",i,tables[i]); + } + + } +#endif //TEST + // prefix scan + int sum = 0; + for(int i=0; i<NUM_TABLES; i++) + { + int iData = tables[i]; + tables[i] = sum; + sum += iData; + counter[i] = 0; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1); + + dst[tables[tableIdx] + counter[tableIdx]] = src[i]; + counter[tableIdx] ++; + } + + b3Swap( src, dst ); + count++; + } + + if (count&1) + { + b3Assert(0);//need to copy + + } +} + +void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) +{ + + b3AlignedObjectArray<b3SortData> inout; + keyValuesInOut.copyToHost(inout); + + executeHost(inout,sortBits); + + keyValuesInOut.copyFromHost(inout); +} + +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits) +{ + +} + +//#define DEBUG_RADIXSORT +//#define DEBUG_RADIXSORT2 + + +void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */) +{ + + int originalSize = keyValuesInOut.size(); + int workingSize = 
originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + +#ifdef DEBUG_RADIXSORT2 + b3AlignedObjectArray<b3SortData> test2; + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } +#endif //DEBUG_RADIXSORT2 + + b3OpenCLArray<b3SortData>* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); + m_workBuffer4->resize(workingSize); + b3SortData fillValue; + fillValue.m_key = 0xffffffff; + fillValue.m_value = 0xffffffff; + +#define USE_BTFILL +#ifdef USE_BTFILL + m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize); +#else + //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) + + for (int i=originalSize; i<workingSize;i++) + { + m_workBuffer4->copyFromHostPointer(&fillValue,1,i); + } +#endif//USE_BTFILL + + src = m_workBuffer4; + } else + { + src = &keyValuesInOut; + m_workBuffer4->resize(0); + } + + b3Assert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert( BITS_PER_PASS == 4 ); + b3Assert( WG_SIZE == 64 ); + b3Assert( (sortBits&0x3) == 0 ); + + + + b3OpenCLArray<b3SortData>* dst = m_workBuffer3; + + b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; + b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + b3ConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ib<sortBits; ib+=4) + { +#ifdef DEBUG_RADIXSORT2 + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + cdata.m_startBit = ib; + + if (src->size()) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel"); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +#ifdef DEBUG_RADIXSORT + b3AlignedObjectArray<unsigned int> testHist; + srcHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } +#endif //DEBUG_RADIXSORT + + + +//fast prefix scan is not working properly on Mac OSX yet +#ifdef __APPLE__ + bool fastScan=false; +#else + bool fastScan=!m_deviceCPU;//only use fast scan on GPU +#endif + + if (fastScan) + {// prefix scan group histogram + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, 
m_prefixScanKernel,"m_prefixScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } + + for (int i=0;i<testHist.size();i+=NUM_WGS) + { + printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]); + } + +#endif //DEBUG_RADIXSORT + +#define USE_GPU +#ifdef USE_GPU + + if (src->size()) + {// local sort and distribute + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; + b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } +#else + { +#define NUM_TABLES 16 +//#define SEQUENTIAL +#ifdef SEQUENTIAL + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + int tables[NUM_TABLES]; + int startBit = ib; + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + src->copyToHost(srcHost); + + for (int i=0;i<NUM_TABLES;i++) + { + tables[i] = testHist[i*NUM_WGS]; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + +#else + + int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + int tables[NUM_TABLES]; + b3AlignedObjectArray<b3SortData> dstHostOK; + dstHostOK.resize(src->size()); + + destHisto->copyToHost(testHist); + b3AlignedObjectArray<b3SortData> srcHost; + src->copyToHost(srcHost); + + int blockSize = 256; + int nBlocksPerWG = cdata.m_nBlocksPerWG; + int startBit = ib; + + { + for (int i=0;i<NUM_TABLES;i++) + { + tables[i] = testHist[i*NUM_WGS]; + } + + // distribute + for(int i=0; i<n; i++) + { + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; + counter2[tableIdx] ++; + } + + + } + + + b3AlignedObjectArray<b3SortData> dstHost; + dstHost.resize(src->size()); + + + int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + + + for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++) + { + int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx = 0;lIdx < 64;lIdx++) + { + int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + if( addr+j < n ) + { + // printf ("addr+j=%d\n", addr+j); + + int i = addr+j; + + int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1); + + int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; + + b3SortData ok = dstHostOK[destIndex]; + + if (ok.m_key != srcHost[i].m_key) + { + printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); + printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); + } + if (ok.m_value != srcHost[i].m_value) + { + + printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); + printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); + + } + + dstHost[destIndex] = srcHost[i]; + counter[tableIdx] ++; + + } + } + } + } + } + + +#endif //SEQUENTIAL + + dst->copyFromHost(dstHost); + } +#endif//USE_GPU + + + +#ifdef DEBUG_RADIXSORT + destHisto->copyToHost(testHist); + printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); + for (int i=0;i<testHist.size();i++) + { + if (testHist[i]!=0) + printf("testHist[%d]=%d\n",i,testHist[i]); + } +#endif //DEBUG_RADIXSORT + b3Swap(src, dst ); + b3Swap(srcHisto,destHisto); + +#ifdef DEBUG_RADIXSORT2 + keyValuesInOut.copyToHost(test2); + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + if (test2[i].m_key != test2[i].m_value) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } + } +#endif //DEBUG_RADIXSORT2 + + count++; + + + } + + + + if (count&1) + { + b3Assert(0);//need to copy from workbuffer to keyValuesInOut + } + + if (m_workBuffer4->size()) + { + m_workBuffer4->resize(originalSize); + keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4); + } + + +#ifdef DEBUG_RADIXSORT + keyValuesInOut.copyToHost(test2); + + printf("numElem = %d\n",test2.size()); + for (int i=0;i<test2.size();i++) + { + printf("test2[%d].m_key=%d\n",i,test2[i].m_key); + printf("test2[%d].m_value=%d\n",i,test2[i].m_value); + } +#endif + +} + + + + + + +void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */) +{ + int originalSize = keysInOut.size(); + int workingSize = originalSize; + + + int dataAlignment = DATA_ALIGNMENT; + + b3OpenCLArray<unsigned int>* src = 0; + + if (workingSize%dataAlignment) + { + workingSize += dataAlignment-(workingSize%dataAlignment); + m_workBuffer4a->copyFromOpenCLArray(keysInOut); + m_workBuffer4a->resize(workingSize); + unsigned int fillValue = 0xffffffff; + + m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize); + + src = m_workBuffer4a; + } else + { + src = &keysInOut; + m_workBuffer4a->resize(0); + } + + + + b3Assert( workingSize%DATA_ALIGNMENT == 0 ); + int minCap = NUM_BUCKET*NUM_WGS; + + + int n = workingSize; + + + m_workBuffer1->resize(minCap); + m_workBuffer3->resize(workingSize); + m_workBuffer3a->resize(workingSize); + +// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); + b3Assert( BITS_PER_PASS == 4 ); + b3Assert( WG_SIZE == 64 ); + b3Assert( (sortBits&0x3) == 0 ); + + + + b3OpenCLArray<unsigned int>* dst = m_workBuffer3a; + + b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1; + b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2; + + + int nWGs = NUM_WGS; + b3ConstData cdata; + + { + int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 + 
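+	// Editor's note (illustrative, not part of the original patch): worked example of
+	// the distribution computed below, assuming n = 65536 keys. blockSize = 4*64 = 256,
+	// so nBlocks = 256; with NUM_WGS = 120, m_nBlocksPerWG = (256 + 119)/120 = 3, i.e.
+	// each work-group consumes up to 3 blocks of 256 keys per radix pass. When instead
+	// nBlocks < NUM_WGS, the launch is shrunk to one block per group (nWGs = nBlocks).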
int nBlocks = (n+blockSize-1)/(blockSize); + cdata.m_n = n; + cdata.m_nWGs = NUM_WGS; + cdata.m_startBit = 0; + cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; + if( nBlocks < NUM_WGS ) + { + cdata.m_nBlocksPerWG = 1; + nWGs = nBlocks; + } + } + + int count=0; + for(int ib=0; ib<sortBits; ib+=4) + { + cdata.m_startBit = ib; + + if (src->size()) + { + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel"); + + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + + int num = NUM_WGS*WG_SIZE; + launcher.launch1D( num, WG_SIZE ); + } + + + +//fast prefix scan is not working properly on Mac OSX yet +#ifdef __APPLE__ + bool fastScan=false; +#else + bool fastScan=!m_deviceCPU; +#endif + + if (fastScan) + {// prefix scan group histogram + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) }; + b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( 128, 128 ); + destHisto = srcHisto; + }else + { + //unsigned int sum; //for debugging + m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); + } + + if (src->size()) + {// local sort and distribute + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )}; + b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel"); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( cdata ); + launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); + + } + + b3Swap(src, dst ); + b3Swap(srcHisto,destHisto); + + count++; + } + + if (count&1) + { + b3Assert(0);//need to copy from workbuffer to keyValuesInOut + } + + if (m_workBuffer4a->size()) + { + m_workBuffer4a->resize(originalSize); + keysInOut.copyFromOpenCLArray(*m_workBuffer4a); + } + +} + + + + + + + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h new file mode 100644 index 0000000000..975bd80e53 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h @@ -0,0 +1,95 @@ + +#ifndef B3_RADIXSORT32_H +#define B3_RADIXSORT32_H + +#include "b3OpenCLArray.h" + +struct b3SortData +{ + union + { + unsigned int m_key; + unsigned int x; + }; + + union + { + unsigned int m_value; + unsigned int y; + + }; +}; +#include "b3BufferInfoCL.h" + +class b3RadixSort32CL +{ + + b3OpenCLArray<unsigned int>* m_workBuffer1; + b3OpenCLArray<unsigned int>* m_workBuffer2; + + b3OpenCLArray<b3SortData>* m_workBuffer3; + b3OpenCLArray<b3SortData>* m_workBuffer4; + + b3OpenCLArray<unsigned int>* m_workBuffer3a; + b3OpenCLArray<unsigned int>* m_workBuffer4a; + + cl_command_queue m_commandQueue; + + cl_kernel m_streamCountSortDataKernel; + cl_kernel m_streamCountKernel; + + cl_kernel m_prefixScanKernel; + cl_kernel m_sortAndScatterSortDataKernel; + cl_kernel m_sortAndScatterKernel; + + + bool m_deviceCPU; + + class b3PrefixScanCL* m_scan; + class b3FillCL* m_fill; + +public: + struct b3ConstData + { + int m_n; + int m_nWGs; + int m_startBit; + int m_nBlocksPerWG; + }; + enum + { + DATA_ALIGNMENT = 256, + WG_SIZE = 64, + BLOCK_SIZE = 256, + ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE), 
+ BITS_PER_PASS = 4, + NUM_BUCKET=(1<<BITS_PER_PASS), + // if you change this, change nPerWI in kernel as well + NUM_WGS = 20*6, // cypress +// NUM_WGS = 24*6, // cayman +// NUM_WGS = 32*4, // nv + }; + + +private: + + +public: + + b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0); + + virtual ~b3RadixSort32CL(); + + void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, + b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32); + + ///keys only + void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 ); + + void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 ); + void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32); + void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32); + +}; +#endif //B3_RADIXSORT32_H + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl new file mode 100644 index 0000000000..f3b4a1e8a7 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl @@ -0,0 +1,106 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +typedef struct +{ + u32 m_key; + u32 m_value; +}SortData; + + + +typedef struct +{ + u32 m_nSrc; + u32 m_nDst; + u32 m_padding[2]; +} ConstBuffer; + + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nSrc ) + { + SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1); + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = (gIdx==0)? first: src[gIdx-1]; + SortData jData = (gIdx==nSrc)? 
end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { +// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++) + u32 k = jData.m_key; + { + dst[k] = gIdx; + } + } + } +} + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX+1; + + if( gIdx < nSrc+1 ) + { + SortData first; first.m_key = 0; first.m_value = 0; + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = src[gIdx-1]; + SortData jData = (gIdx==nSrc)? end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { + u32 k = iData.m_key; + { + dst[k] = gIdx; + } + } + } +} + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + + if( gIdx < nDst ) + { + C[gIdx] = A[gIdx] - B[gIdx]; + } +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h new file mode 100644 index 0000000000..9c9e847138 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h @@ -0,0 +1,87 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* boundSearchKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"typedef struct\n" +"{\n" +" u32 m_key; \n" +" u32 m_value;\n" +"}SortData;\n" +"typedef struct\n" +"{\n" +" u32 m_nSrc;\n" +" u32 m_nDst;\n" +" u32 m_padding[2];\n" +"} ConstBuffer;\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nSrc )\n" +" {\n" +" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +" SortData iData = (gIdx==0)? first: src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? 
end: src[gIdx];\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n" +" u32 k = jData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX+1;\n" +" if( gIdx < nSrc+1 )\n" +" {\n" +" SortData first; first.m_key = 0; first.m_value = 0;\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +" SortData iData = src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +" u32 k = iData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nDst )\n" +" {\n" +" C[gIdx] = A[gIdx] - B[gIdx];\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl new file mode 100644 index 0000000000..2eee5752ec --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl @@ -0,0 +1,128 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. 
+*/ +//Originally written by Takahiro Harada + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + int m_n; + int m_padding[3]; +} ConstBuffer; + + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy1F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float4 a0 = src[gIdx]; + + dst[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy2F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 2*gIdx <= cb.m_n ) + { + float4 a0 = src[gIdx*2+0]; + float4 a1 = src[gIdx*2+1]; + + dst[ gIdx*2+0 ] = a0; + dst[ gIdx*2+1 ] = a1; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy4F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 4*gIdx <= cb.m_n ) + { + int idx0 = gIdx*4+0; + int idx1 = gIdx*4+1; + int idx2 = gIdx*4+2; + int idx3 = gIdx*4+3; + + float4 a0 = src[idx0]; + float4 a1 = src[idx1]; + float4 a2 = src[idx2]; + float4 a3 = src[idx3]; + + dst[ idx0 ] = a0; + dst[ idx1 ] = a1; + dst[ idx2 ] = a2; + dst[ idx3 ] = a3; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF1Kernel(__global float* dstF1, __global float* srcF1, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float a0 = srcF1[gIdx]; + + dstF1[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float2 a0 = srcF2[gIdx]; + + dstF2[ gIdx ] = a0; + } +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h new file mode 100644 index 0000000000..e5670e3cd3 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h @@ -0,0 +1,132 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* copyKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. 
Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"typedef struct\n" +"{\n" +" int m_n;\n" +" int m_padding[3];\n" +"} ConstBuffer;\n" +"\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx];\n" +"\n" +" dst[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 2*gIdx <= cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx*2+0];\n" +" float4 a1 = src[gIdx*2+1];\n" +"\n" +" dst[ gIdx*2+0 ] = a0;\n" +" dst[ gIdx*2+1 ] = a1;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 4*gIdx <= cb.m_n )\n" +" {\n" +" int idx0 = gIdx*4+0;\n" +" int idx1 = gIdx*4+1;\n" +" int idx2 = gIdx*4+2;\n" +" int idx3 = gIdx*4+3;\n" +"\n" +" float4 a0 = src[idx0];\n" +" float4 a1 = src[idx1];\n" +" float4 a2 = src[idx2];\n" +" float4 a3 = src[idx3];\n" +"\n" +" dst[ idx0 ] = a0;\n" +" dst[ idx1 ] = a1;\n" +" dst[ idx2 ] = a2;\n" +" dst[ idx3 ] = a3;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float a0 = srcF1[gIdx];\n" +"\n" +" dstF1[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float2 a0 = srcF2[gIdx];\n" +"\n" +" dstF2[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl new file mode 100644 index 0000000000..71c31075dd --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl @@ -0,0 +1,107 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. 
+Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + union + { + int4 m_data; + uint4 m_unsignedData; + float m_floatData; + }; + int m_offset; + int m_n; + int m_padding[2]; +} ConstBuffer; + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstFloat[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt2[ gIdx + offset] = make_int2( value.x, value.y ); + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt4[ offset+gIdx ] = value; + } +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h new file mode 100644 index 0000000000..4f8b96e489 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h @@ -0,0 +1,91 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* fillKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. 
\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"typedef struct\n" +"{\n" +" union\n" +" {\n" +" int4 m_data;\n" +" uint4 m_unsignedData;\n" +" float m_floatData;\n" +" };\n" +" int m_offset;\n" +" int m_n;\n" +" int m_padding[2];\n" +"} ConstBuffer;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstFloat[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt4[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl new file mode 100644 index 0000000000..c9da79854a --- /dev/null +++ 
b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize) +{ + float4 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + data[bi] += data[ai]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + blocksum = data[ n-1 ]; + data[ n-1 ] = 0; + } + + GROUP_LDS_BARRIER; + + offset >>= 1; + for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize ) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + float4 temp = data[ai]; + data[ai] = data[bi]; + data[bi] += temp; + } + } + GROUP_LDS_BARRIER; + + return blocksum; +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb) +{ + __local float4 ldsData[WG_SIZE*2]; + + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + + ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; + ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? 
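+	// Editor's note (not part of the original patch): lanes past m_numElems load 0,
+	// the identity of +, so ScanExclusiveFloat4 can always run over a full
+	// power-of-two array of WG_SIZE*2 entries without per-step bounds checks.
+	// Serial reference of the exclusive scan it performs (hypothetical, for
+	// illustration only):
+	//   out[0] = 0; for(i = 1; i < n; i++) out[i] = out[i-1] + in[i-1];
+	//   blocksum = out[n-1] + in[n-1];  // returned so block results can be stitched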
src[2*gIdx + 1]: 0; + + float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); + + if( lIdx == 0 ) + sumBuffer[GET_GROUP_IDX] = sum; + + if( (2*gIdx) < cb.m_numElems ) + { + dst[2*gIdx] = ldsData[2*lIdx]; + } + if( (2*gIdx + 1) < cb.m_numElems ) + { + dst[2*gIdx + 1] = ldsData[2*lIdx + 1]; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb) +{ + const u32 blockSize = WG_SIZE*2; + + int myIdx = GET_GROUP_IDX+1; + int lIdx = GET_LOCAL_IDX; + + float4 iBlockSum = blockSum[myIdx]; + + int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); + for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) + { + dst[i] += iBlockSum; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void TopLevelScanKernel(__global float4* dst, uint4 cb) +{ + __local float4 ldsData[2048]; + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + int lSize = GET_GROUP_SIZE; + + for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) + { + ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0; + } + + GROUP_LDS_BARRIER; + + float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); + + for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) + { + dst[i] = ldsData[i]; + } + + if( gIdx == 0 ) + { + dst[cb.m_numBlocks] = sum; + } +} diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl new file mode 100644 index 0000000000..963cc1e48e --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. 
+*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize) +{ + u32 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + data[bi] += data[ai]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + blocksum = data[ n-1 ]; + data[ n-1 ] = 0; + } + + GROUP_LDS_BARRIER; + + offset >>= 1; + for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize ) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + u32 temp = data[ai]; + data[ai] = data[bi]; + data[bi] += temp; + } + } + GROUP_LDS_BARRIER; + + return blocksum; +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer, + uint4 cb) +{ + __local u32 ldsData[WG_SIZE*2]; + + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + + ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; + ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0; + + u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); + + if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum; + + if( (2*gIdx) < cb.m_numElems ) + { + dst[2*gIdx] = ldsData[2*lIdx]; + } + if( (2*gIdx + 1) < cb.m_numElems ) + { + dst[2*gIdx + 1] = ldsData[2*lIdx + 1]; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb) +{ + const u32 blockSize = WG_SIZE*2; + + int myIdx = GET_GROUP_IDX+1; + int lIdx = GET_LOCAL_IDX; + + u32 iBlockSum = blockSum[myIdx]; + + int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); + for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) + { + dst[i] += iBlockSum; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void TopLevelScanKernel(__global u32* dst, uint4 cb) +{ + __local u32 ldsData[2048]; + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + int lSize = GET_GROUP_SIZE; + + for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) + { + ldsData[i] = (i<cb.m_numBlocks)? 
dst[i]:0; + } + + GROUP_LDS_BARRIER; + + u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); + + for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) + { + dst[i] = ldsData[i]; + } + + if( gIdx == 0 ) + { + dst[cb.m_numBlocks] = sum; + } +} diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h new file mode 100644 index 0000000000..27baab8331 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h @@ -0,0 +1,129 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* prefixScanKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"// takahiro end\n" +"#define WG_SIZE 128 \n" +"#define m_numElems x\n" +"#define m_numBlocks y\n" +"#define m_numScanBlocks z\n" +"/*typedef struct\n" +"{\n" +" uint m_numElems;\n" +" uint m_numBlocks;\n" +" uint m_numScanBlocks;\n" +" uint m_padding[1];\n" +"} ConstBuffer;\n" +"*/\n" +"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n" +"{\n" +" u32 blocksum;\n" +" int offset = 1;\n" +" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" data[bi] += data[ai];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" blocksum = data[ n-1 ];\n" +" data[ n-1 ] = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" offset >>= 1;\n" +" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" u32 temp = data[ai];\n" +" data[ai] = data[bi];\n" +" data[bi] += temp;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" return blocksum;\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n" +" uint4 cb)\n" +"{\n" +" __local u32 ldsData[WG_SIZE*2];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? 
src[2*gIdx]: 0;\n" +" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" +" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n" +" if( (2*gIdx) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx] = ldsData[2*lIdx];\n" +" }\n" +" if( (2*gIdx + 1) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n" +"{\n" +" const u32 blockSize = WG_SIZE*2;\n" +" int myIdx = GET_GROUP_IDX+1;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" u32 iBlockSum = blockSum[myIdx];\n" +" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" +" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" +" {\n" +" dst[i] += iBlockSum;\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n" +"{\n" +" __local u32 ldsData[2048];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int lSize = GET_GROUP_SIZE;\n" +" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" +" {\n" +" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" +" {\n" +" dst[i] = ldsData[i];\n" +" }\n" +" if( gIdx == 0 )\n" +" {\n" +" dst[cb.m_numBlocks] = sum;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h new file mode 100644 index 0000000000..5b13254796 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h @@ -0,0 +1,129 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* prefixScanKernelsFloat4CL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. 
This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"// takahiro end\n" +"#define WG_SIZE 128 \n" +"#define m_numElems x\n" +"#define m_numBlocks y\n" +"#define m_numScanBlocks z\n" +"/*typedef struct\n" +"{\n" +" uint m_numElems;\n" +" uint m_numBlocks;\n" +" uint m_numScanBlocks;\n" +" uint m_padding[1];\n" +"} ConstBuffer;\n" +"*/\n" +"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n" +"{\n" +" float4 blocksum;\n" +" int offset = 1;\n" +" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" data[bi] += data[ai];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" blocksum = data[ n-1 ];\n" +" data[ n-1 ] = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" offset >>= 1;\n" +" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" float4 temp = data[ai];\n" +" data[ai] = data[bi];\n" +" data[bi] += temp;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" return blocksum;\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n" +"{\n" +" __local float4 ldsData[WG_SIZE*2];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n" +" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" +" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" if( lIdx == 0 ) \n" +" sumBuffer[GET_GROUP_IDX] = sum;\n" +" if( (2*gIdx) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx] = ldsData[2*lIdx];\n" +" }\n" +" if( (2*gIdx + 1) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n" +"{\n" +" const u32 blockSize = WG_SIZE*2;\n" +" int myIdx = GET_GROUP_IDX+1;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" float4 iBlockSum = blockSum[myIdx];\n" +" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" +" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" +" {\n" +" dst[i] += iBlockSum;\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n" +"{\n" +" __local float4 ldsData[2048];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int lSize = GET_GROUP_SIZE;\n" +" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" +" {\n" +" ldsData[i] = (i<cb.m_numBlocks)? 
dst[i]:0;\n" +"	}\n" +"	GROUP_LDS_BARRIER;\n" +"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" +"	{\n" +"		dst[i] = ldsData[i];\n" +"	}\n" +"	if( gIdx == 0 )\n" +"	{\n" +"		dst[cb.m_numBlocks] = sum;\n" +"	}\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl new file mode 100644 index 0000000000..7402e2f3b3 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl @@ -0,0 +1,1071 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org 

+This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: 

+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Author Takahiro Harada 

+//#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable 

+typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AtomAdd(x, value) atom_add(&(x), value) 

+#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) 

+#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) 

+#define WG_SIZE 64 +#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE) +#define BITS_PER_PASS 4 +#define NUM_BUCKET (1<<BITS_PER_PASS) +typedef uchar u8; 

+// this isn't an optimization for VLIW; it just reduces the number of LDS writes. 
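+// Editor's sketch (not part of the original patch): with USE_2LEVEL_REDUCE the
+// 64-wide scan in localPrefixSum() gathers three neighbours per fenced write,
+//   AtomAdd( s[idx], s[idx-1]  + s[idx-2]  + s[idx-3]  );  // strides 1,2,3
+//   AtomAdd( s[idx], s[idx-4]  + s[idx-8]  + s[idx-12] );  // strides 4,8,12
+//   AtomAdd( s[idx], s[idx-16] + s[idx-32] + s[idx-48] );  // strides 16,32,48
+// i.e. three LDS write rounds instead of the six (+1,+2,+4,+8,+16,+32) used by
+// the plain Hillis-Steele chain in the #else branch of that function.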
+#define USE_2LEVEL_REDUCE 1 + +//#define CHECK_BOUNDARY 1 + +//#define NV_GPU 1 + + +// Cypress +#define nPerWI 16 +// Cayman +//#define nPerWI 20 + +#define m_n x +#define m_nWGs y +#define m_startBit z +#define m_nBlocksPerWG w + +/* +typedef struct +{ + int m_n; + int m_nWGs; + int m_startBit; + int m_nBlocksPerWG; +} ConstBuffer; +*/ + +typedef struct +{ + unsigned int m_key; + unsigned int m_value; +} SortDataCL; + + +uint prefixScanVectorEx( uint4* data ) +{ + u32 sum = 0; + u32 tmp = data[0].x; + data[0].x = sum; + sum += tmp; + tmp = data[0].y; + data[0].y = sum; + sum += tmp; + tmp = data[0].z; + data[0].z = sum; + sum += tmp; + tmp = data[0].w; + data[0].w = sum; + sum += tmp; + return sum; +} + +u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ ) +{ + { // Set data + sorterSharedMemory[lIdx] = 0; + sorterSharedMemory[lIdx+wgSize] = pData; + } + + GROUP_LDS_BARRIER; + + { // Prefix sum + int idx = 2*lIdx + (wgSize+1); +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < 64 ) + { + u32 u0, u1, u2; + u0 = sorterSharedMemory[idx-3]; + u1 = sorterSharedMemory[idx-2]; + u2 = sorterSharedMemory[idx-1]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + + u0 = sorterSharedMemory[idx-12]; + u1 = sorterSharedMemory[idx-8]; + u2 = sorterSharedMemory[idx-4]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + + u0 = sorterSharedMemory[idx-48]; + u1 = sorterSharedMemory[idx-32]; + u2 = sorterSharedMemory[idx-16]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + if( wgSize > 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#else + if( lIdx < 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-1]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-4]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-16]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; + GROUP_MEM_FENCE; + if( wgSize > 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#endif + } + + GROUP_LDS_BARRIER; + + *totalSum = sorterSharedMemory[wgSize*2-1]; + u32 addValue = sorterSharedMemory[lIdx+wgSize-1]; + return addValue; +} + +//__attribute__((reqd_work_group_size(128,1,1))) +uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + + +//__attribute__((reqd_work_group_size(64,1,1))) +uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + +u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;} + +u32 bit8Scan(u32 v) +{ + return (v<<8) + (v<<16) + (v<<24); +} + +//=== + + + + +#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx] + + +__kernel 
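+// Editor's note (illustrative, not part of the original patch): MY_HISTOGRAM gives
+// each of the 64 lanes its own column of the 16x64 LDS counter matrix, so the
+// per-element increments are mostly conflict-free. What StreamCountKernel (below)
+// finally writes out, per 4-bit digit d and work-group w, is
+//   histogramOut[d*nWGs + w] = number of keys in w's blocks whose digit == d;
+// this digit-major layout means a single exclusive scan of histogramOut later
+// yields, directly, the global scatter offset for every (digit, work-group) pair.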
+__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb ) +{ + __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + for(int i=0; i<NUM_BUCKET; i++) + { + MY_HISTOGRAM(i) = 0; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + u32 localKey; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + { +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + { + localKey = (gSrc[addr+i]>>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<GET_GROUP_SIZE; i++) + { + sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; + } + histogramOut[lIdx*nWGs+wgIdx] = sum; + } +} + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb ) +{ + __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + for(int i=0; i<NUM_BUCKET; i++) + { + MY_HISTOGRAM(i) = 0; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + u32 localKey; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + { +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + { + localKey = (gSrc[addr+i].m_key>>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<GET_GROUP_SIZE; i++) + { + sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; + } + histogramOut[lIdx*nWGs+wgIdx] = sum; + } +} + +#define nPerLane (nPerWI/4) + +// NUM_BUCKET*nWGs < 128*nPerWI +__kernel +__attribute__((reqd_work_group_size(128,1,1))) +void PrefixScanKernel( __global u32* wHistogram1, int4 cb ) +{ + __local u32 ldsTopScanData[128*2]; + + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + const int nWGs = cb.m_nWGs; + + u32 data[nPerWI]; + for(int i=0; i<nPerWI; i++) + { + data[i] = 0; + if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs ) + data[i] = wHistogram1[nPerWI*lIdx+i]; + } + + uint4 myData = make_uint4(0,0,0,0); + + for(int i=0; i<nPerLane; i++) + { + myData.x += data[nPerLane*0+i]; + myData.y += data[nPerLane*1+i]; + myData.z += data[nPerLane*2+i]; + myData.w += data[nPerLane*3+i]; + } + + uint totalSum; + uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData ); + +// for(int j=0; j<4; j++) // somehow it introduces a lot of branches + { int j = 0; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 1; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 2; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 3; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + + for(int i=0; i<nPerLane; i++) + { + data[nPerLane*0+i] += scanned.x; + data[nPerLane*1+i] += scanned.y; + data[nPerLane*2+i] += scanned.z; + data[nPerLane*3+i] += scanned.w; + } + + for(int i=0; i<nPerWI; i++) + { + int index = nPerWI*lIdx+i; + if (index < NUM_BUCKET*nWGs) + wHistogram1[nPerWI*lIdx+i] = data[i]; + } +} + +// 4 scan, 4 exchange +void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) +{ + for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++) + { + u32 mask = (1<<bitIdx); + uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask ); + uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) ); + u32 total; + prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData ); + { + uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3); + uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total ); + dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) ); + + GROUP_LDS_BARRIER; + + ldsSortData[dstAddr.x] = sortData[0]; + ldsSortData[dstAddr.y] = sortData[1]; + ldsSortData[dstAddr.z] = sortData[2]; + ldsSortData[dstAddr.w] = sortData[3]; + + GROUP_LDS_BARRIER; + + sortData[0] = ldsSortData[localAddr.x]; + sortData[1] = ldsSortData[localAddr.y]; + sortData[2] = ldsSortData[localAddr.z]; + sortData[3] = 
ldsSortData[localAddr.w]; + + GROUP_LDS_BARRIER; + } + } +} + +// 2 scan, 2 exchange +void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) +{ + for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2) + { + uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + +#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key] + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + u32 myHistogram = 0; + + u32 sortData[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + sortData[i] = ( addr+i < n )? 
gSrc[ addr+i ] : 0xffffffff; +#else + sortData[i] = gSrc[ addr+i ]; +#endif + + sort4Bits(sortData, startBit, lIdx, ldsSortData); + + u32 keys[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + keys[i] = (sortData[i]>>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + +#if defined(NV_GPU) + SET_HISTOGRAM( setIdx, keys[i] )++; +#else + AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) ); +#endif + + GROUP_LDS_BARRIER; + + uint hIdx = NUM_BUCKET+lIdx; + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<WG_SIZE/16; i++) + { + sum += SET_HISTOGRAM( i, lIdx ); + } + myHistogram = sum; + localHistogram[hIdx] = sum; + } + GROUP_LDS_BARRIER; + +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + + u32 u0, u1, u2; + u0 = localHistogram[hIdx-3]; + u1 = localHistogram[hIdx-2]; + u2 = localHistogram[hIdx-1]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + u0 = localHistogram[hIdx-12]; + u1 = localHistogram[hIdx-8]; + u2 = localHistogram[hIdx-4]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + } +#else + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-2]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-4]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-8]; + GROUP_MEM_FENCE; + } +#endif + GROUP_LDS_BARRIER; + } + + { + for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++) + { + int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; + int binIdx = keys[ie]; + int groupOffset = localHistogramToCarry[binIdx]; + int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; +#if defined(CHECK_BOUNDARY) + if( addr+ie < n ) +#endif + gDst[ groupOffset + myIdx ] = sortData[ie]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + localHistogramToCarry[lIdx] += myHistogram; + } + GROUP_LDS_BARRIER; + } +} + +// 2 scan, 2 exchange +void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal) +{ + for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2) + { + uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += 
sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + ldsSortVal[newOffset[0]] = sortVal[0]; + ldsSortVal[newOffset[1]] = sortVal[1]; + ldsSortVal[newOffset[2]] = sortVal[2]; + ldsSortVal[newOffset[3]] = sortVal[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + sortVal[0] = ldsSortVal[dstAddr+0]; + sortVal[1] = ldsSortVal[dstAddr+1]; + sortVal[2] = ldsSortVal[dstAddr+2]; + sortVal[3] = ldsSortVal[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) +{ + __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + + u32 myHistogram = 0; + + int sortData[ELEMENTS_PER_WORK_ITEM]; + int sortVal[ELEMENTS_PER_WORK_ITEM]; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + { + sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff; + sortVal[i] = ( addr+i < n )? 
gSrc[ addr+i ].m_value : 0xffffffff; + } +#else + { + sortData[i] = gSrc[ addr+i ].m_key; + sortVal[i] = gSrc[ addr+i ].m_value; + } +#endif + + sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal); + + u32 keys[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + keys[i] = (sortData[i]>>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + +#if defined(NV_GPU) + SET_HISTOGRAM( setIdx, keys[i] )++; +#else + AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) ); +#endif + + GROUP_LDS_BARRIER; + + uint hIdx = NUM_BUCKET+lIdx; + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<WG_SIZE/16; i++) + { + sum += SET_HISTOGRAM( i, lIdx ); + } + myHistogram = sum; + localHistogram[hIdx] = sum; + } + GROUP_LDS_BARRIER; + +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + + u32 u0, u1, u2; + u0 = localHistogram[hIdx-3]; + u1 = localHistogram[hIdx-2]; + u2 = localHistogram[hIdx-1]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + u0 = localHistogram[hIdx-12]; + u1 = localHistogram[hIdx-8]; + u2 = localHistogram[hIdx-4]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + } +#else + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-2]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-4]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-8]; + GROUP_MEM_FENCE; + } +#endif + GROUP_LDS_BARRIER; + } + + { + for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++) + { + int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; + int binIdx = keys[ie]; + int groupOffset = localHistogramToCarry[binIdx]; + int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; +#if defined(CHECK_BOUNDARY) + if( addr+ie < n ) + { + if ((groupOffset + myIdx)<n) + { + if (sortData[ie]==sortVal[ie]) + { + + SortDataCL tmp; + tmp.m_key = sortData[ie]; + tmp.m_value = sortVal[ie]; + if (tmp.m_key == tmp.m_value) + gDst[groupOffset + myIdx ] = tmp; + } + + } + } +#else + if ((groupOffset + myIdx)<n) + { + gDst[ groupOffset + myIdx ].m_key = sortData[ie]; + gDst[ groupOffset + myIdx ].m_value = sortVal[ie]; + } +#endif + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + localHistogramToCarry[lIdx] += myHistogram; + } + GROUP_LDS_BARRIER; + } +} + + + + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) +{ + + u32 gIdx = GET_GLOBAL_IDX; + u32 realLocalIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + int counter[NUM_BUCKET]; + + if (realLocalIdx>0) + return; + + for (int c=0;c<NUM_BUCKET;c++) + counter[c]=0; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx=0;lIdx<WG_SIZE;lIdx++) 
+ { + int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + int i = addr2+j; + if( i < n ) + { + int tableIdx; + tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + + u32 gIdx = GET_GLOBAL_IDX; + u32 realLocalIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + int counter[NUM_BUCKET]; + + if (realLocalIdx>0) + return; + + for (int c=0;c<NUM_BUCKET;c++) + counter[c]=0; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx=0;lIdx<WG_SIZE;lIdx++) + { + int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + int i = addr2+j; + if( i < n ) + { + int tableIdx; + tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +}
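The kernels in this file implement a least-significant-digit radix sort over 4-bit digits (BITS_PER_PASS = 4, NUM_BUCKET = 16) in three phases per digit: StreamCountKernel builds one 16-bucket histogram per work group, PrefixScanKernel converts the bucket-major histogram table into exclusive base offsets, and SortAndScatterKernel re-reads the keys and scatters them stably to those offsets; the *Serial kernels are single-work-item fallbacks with the same contract. For reference, the following is a minimal serial C sketch of one pass and of the full 32-bit sort; it is not Bullet code, and radix_pass/radix_sort32 are illustrative names only.

#include <string.h>

typedef unsigned int u32;

#define RADIX_BITS    4                  /* corresponds to BITS_PER_PASS */
#define RADIX_BUCKETS (1 << RADIX_BITS)  /* corresponds to NUM_BUCKET */

/* One digit pass: histogram -> exclusive scan -> stable scatter. */
static void radix_pass(const u32* src, u32* dst, int n, int startBit)
{
	u32 histogram[RADIX_BUCKETS] = {0};

	/* Phase 1 (StreamCountKernel): count the keys falling into each bucket. */
	for (int i = 0; i < n; i++)
		histogram[(src[i] >> startBit) & (RADIX_BUCKETS - 1)]++;

	/* Phase 2 (PrefixScanKernel): exclusive scan turns counts into base offsets. */
	u32 sum = 0;
	for (int b = 0; b < RADIX_BUCKETS; b++)
	{
		u32 tmp = histogram[b];
		histogram[b] = sum;
		sum += tmp;
	}

	/* Phase 3 (SortAndScatterKernel): stable scatter; equal digits keep their
	   relative order, which is what makes the multi-pass sort correct. */
	for (int i = 0; i < n; i++)
	{
		u32 bucket = (src[i] >> startBit) & (RADIX_BUCKETS - 1);
		dst[histogram[bucket]++] = src[i];
	}
}

/* Full 32-bit sort: 32/RADIX_BITS = 8 passes, least significant digit first. */
void radix_sort32(u32* data, u32* tmp, int n)
{
	for (int bit = 0; bit < 32; bit += RADIX_BITS)
	{
		radix_pass(data, tmp, n, bit);
		memcpy(data, tmp, n * sizeof(u32));
	}
}

On the GPU the histogram table is laid out bucket-major, histogramOut[bucket*nWGs + wgIdx], so the single exclusive scan in PrefixScanKernel directly yields, for every (bucket, work group) pair, the global offset at which that work group may write its keys for that bucket.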
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h new file mode 100644 index 0000000000..8876c16aa6 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h @@ -0,0 +1,910 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* radixSort32KernelsCL= \ +"/*\n" +"Bullet Continuous Collision Detection and Physics Library\n" +"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Author Takahiro Harada\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define WG_SIZE 64\n" +"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n" +"#define BITS_PER_PASS 4\n" +"#define NUM_BUCKET (1<<BITS_PER_PASS)\n" +"typedef uchar u8;\n" +"// this isn't optimization for VLIW. But just reducing writes. 
\n" +"#define USE_2LEVEL_REDUCE 1\n" +"//#define CHECK_BOUNDARY 1\n" +"//#define NV_GPU 1\n" +"// Cypress\n" +"#define nPerWI 16\n" +"// Cayman\n" +"//#define nPerWI 20\n" +"#define m_n x\n" +"#define m_nWGs y\n" +"#define m_startBit z\n" +"#define m_nBlocksPerWG w\n" +"/*\n" +"typedef struct\n" +"{\n" +" int m_n;\n" +" int m_nWGs;\n" +" int m_startBit;\n" +" int m_nBlocksPerWG;\n" +"} ConstBuffer;\n" +"*/\n" +"typedef struct\n" +"{\n" +" unsigned int m_key;\n" +" unsigned int m_value;\n" +"} SortDataCL;\n" +"uint prefixScanVectorEx( uint4* data )\n" +"{\n" +" u32 sum = 0;\n" +" u32 tmp = data[0].x;\n" +" data[0].x = sum;\n" +" sum += tmp;\n" +" tmp = data[0].y;\n" +" data[0].y = sum;\n" +" sum += tmp;\n" +" tmp = data[0].z;\n" +" data[0].z = sum;\n" +" sum += tmp;\n" +" tmp = data[0].w;\n" +" data[0].w = sum;\n" +" sum += tmp;\n" +" return sum;\n" +"}\n" +"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n" +"{\n" +" { // Set data\n" +" sorterSharedMemory[lIdx] = 0;\n" +" sorterSharedMemory[lIdx+wgSize] = pData;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" { // Prefix sum\n" +" int idx = 2*lIdx + (wgSize+1);\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < 64 )\n" +" {\n" +" u32 u0, u1, u2;\n" +" u0 = sorterSharedMemory[idx-3];\n" +" u1 = sorterSharedMemory[idx-2];\n" +" u2 = sorterSharedMemory[idx-1];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" u0 = sorterSharedMemory[idx-12];\n" +" u1 = sorterSharedMemory[idx-8];\n" +" u2 = sorterSharedMemory[idx-4];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" u0 = sorterSharedMemory[idx-48];\n" +" u1 = sorterSharedMemory[idx-32];\n" +" u2 = sorterSharedMemory[idx-16];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" if( wgSize > 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n" +" GROUP_MEM_FENCE;\n" +" if( wgSize > 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" *totalSum = sorterSharedMemory[wgSize*2-1];\n" +" u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n" +" return addValue;\n" +"}\n" +"//__attribute__((reqd_work_group_size(128,1,1)))\n" +"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"//__attribute__((reqd_work_group_size(64,1,1)))\n" +"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = 
prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n" +"u32 bit8Scan(u32 v)\n" +"{\n" +" return (v<<8) + (v<<16) + (v<<24);\n" +"}\n" +"//===\n" +"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" +"{\n" +" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" for(int i=0; i<NUM_BUCKET; i++)\n" +" {\n" +" MY_HISTOGRAM(i) = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" u32 localKey;\n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" +" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n" +" // AMD: AtomInc performs better while NV prefers ++\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" {\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +" {\n" +" localKey = (gSrc[addr+i]>>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<GET_GROUP_SIZE; i++)\n" +" {\n" +" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" +" }\n" +" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )\n" +"{\n" +" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" for(int i=0; i<NUM_BUCKET; i++)\n" +" {\n" +" MY_HISTOGRAM(i) = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" u32 localKey;\n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" +" // Using registers didn't perform well. 
It seems like use localKeys to address requires a lot of alu ops\n" +" // AMD: AtomInc performs better while NV prefers ++\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" {\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +" {\n" +" localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<GET_GROUP_SIZE; i++)\n" +" {\n" +" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" +" }\n" +" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" +" }\n" +"}\n" +"#define nPerLane (nPerWI/4)\n" +"// NUM_BUCKET*nWGs < 128*nPerWI\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(128,1,1)))\n" +"void PrefixScanKernel( __global u32* wHistogram1, int4 cb )\n" +"{\n" +" __local u32 ldsTopScanData[128*2];\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" const int nWGs = cb.m_nWGs;\n" +" u32 data[nPerWI];\n" +" for(int i=0; i<nPerWI; i++)\n" +" {\n" +" data[i] = 0;\n" +" if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n" +" data[i] = wHistogram1[nPerWI*lIdx+i];\n" +" }\n" +" uint4 myData = make_uint4(0,0,0,0);\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" myData.x += data[nPerLane*0+i];\n" +" myData.y += data[nPerLane*1+i];\n" +" myData.z += data[nPerLane*2+i];\n" +" myData.w += data[nPerLane*3+i];\n" +" }\n" +" uint totalSum;\n" +" uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n" +"// for(int j=0; j<4; j++) // somehow it introduces a lot of branches\n" +" { int j = 0;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 1;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 2;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 3;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" data[nPerLane*0+i] += scanned.x;\n" +" data[nPerLane*1+i] += scanned.y;\n" +" data[nPerLane*2+i] += scanned.z;\n" +" data[nPerLane*3+i] += scanned.w;\n" +" }\n" +" for(int i=0; i<nPerWI; i++)\n" +" {\n" +" int index = nPerWI*lIdx+i;\n" +" if (index < NUM_BUCKET*nWGs)\n" +" wHistogram1[nPerWI*lIdx+i] = data[i];\n" +" }\n" +"}\n" +"// 4 scan, 4 exchange\n" +"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" +"{\n" +" for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n" +" {\n" +" u32 mask = (1<<bitIdx);\n" +" uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" +" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" +" u32 total;\n" +" prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" +" {\n" +" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" +" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, 
total, total );\n" +" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" +" GROUP_LDS_BARRIER;\n" +" ldsSortData[dstAddr.x] = sortData[0];\n" +" ldsSortData[dstAddr.y] = sortData[1];\n" +" ldsSortData[dstAddr.z] = sortData[2];\n" +" ldsSortData[dstAddr.w] = sortData[3];\n" +" GROUP_LDS_BARRIER;\n" +" sortData[0] = ldsSortData[localAddr.x];\n" +" sortData[1] = ldsSortData[localAddr.y];\n" +" sortData[2] = ldsSortData[localAddr.z];\n" +" sortData[3] = ldsSortData[localAddr.w];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"// 2 scan, 2 exchange\n" +"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" +"{\n" +" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" +" {\n" +" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +" GROUP_LDS_BARRIER;\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int n = cb.m_n;\n" +" const int nWGs = 
cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" u32 myHistogram = 0;\n" +" u32 sortData[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n" +"#else\n" +" sortData[i] = gSrc[ addr+i ];\n" +"#endif\n" +" sort4Bits(sortData, startBit, lIdx, ldsSortData);\n" +" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" keys[i] = (sortData[i]>>startBit) & 0xf;\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +"#if defined(NV_GPU)\n" +" SET_HISTOGRAM( setIdx, keys[i] )++;\n" +"#else\n" +" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" +"#endif\n" +" \n" +" GROUP_LDS_BARRIER;\n" +" \n" +" uint hIdx = NUM_BUCKET+lIdx;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<WG_SIZE/16; i++)\n" +" {\n" +" sum += SET_HISTOGRAM( i, lIdx );\n" +" }\n" +" myHistogram = sum;\n" +" localHistogram[hIdx] = sum;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" u32 u0, u1, u2;\n" +" u0 = localHistogram[hIdx-3];\n" +" u1 = localHistogram[hIdx-2];\n" +" u2 = localHistogram[hIdx-1];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" u0 = localHistogram[hIdx-12];\n" +" u1 = localHistogram[hIdx-8];\n" +" u2 = localHistogram[hIdx-4];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-2];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-4];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-8];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" {\n" +" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" +" {\n" +" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" +" int binIdx = keys[ie];\n" +" int groupOffset = localHistogramToCarry[binIdx];\n" +" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+ie < n )\n" +"#endif\n" +" gDst[ groupOffset + myIdx ] = sortData[ie];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogramToCarry[lIdx] += myHistogram;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +"}\n" +"// 2 scan, 2 exchange\n" +"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n" +"{\n" +" for(uint ibit=0; 
ibit<BITS_PER_PASS; ibit+=2)\n" +" {\n" +" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +" ldsSortVal[newOffset[0]] = sortVal[0];\n" +" ldsSortVal[newOffset[1]] = sortVal[1];\n" +" ldsSortVal[newOffset[2]] = sortVal[2];\n" +" ldsSortVal[newOffset[3]] = sortVal[3];\n" +" GROUP_LDS_BARRIER;\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +" sortVal[0] = ldsSortVal[dstAddr+0];\n" +" sortVal[1] = ldsSortVal[dstAddr+1];\n" +" sortVal[2] = ldsSortVal[dstAddr+2];\n" +" sortVal[3] = ldsSortVal[dstAddr+3];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" +"{\n" +" __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" int nBlocks = n/blockSize - 
nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" u32 myHistogram = 0;\n" +" int sortData[ELEMENTS_PER_WORK_ITEM];\n" +" int sortVal[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" {\n" +" sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n" +" sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n" +" }\n" +"#else\n" +" {\n" +" sortData[i] = gSrc[ addr+i ].m_key;\n" +" sortVal[i] = gSrc[ addr+i ].m_value;\n" +" }\n" +"#endif\n" +" sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n" +" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" keys[i] = (sortData[i]>>startBit) & 0xf;\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +"#if defined(NV_GPU)\n" +" SET_HISTOGRAM( setIdx, keys[i] )++;\n" +"#else\n" +" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" +"#endif\n" +" \n" +" GROUP_LDS_BARRIER;\n" +" \n" +" uint hIdx = NUM_BUCKET+lIdx;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<WG_SIZE/16; i++)\n" +" {\n" +" sum += SET_HISTOGRAM( i, lIdx );\n" +" }\n" +" myHistogram = sum;\n" +" localHistogram[hIdx] = sum;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" u32 u0, u1, u2;\n" +" u0 = localHistogram[hIdx-3];\n" +" u1 = localHistogram[hIdx-2];\n" +" u2 = localHistogram[hIdx-1];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" u0 = localHistogram[hIdx-12];\n" +" u1 = localHistogram[hIdx-8];\n" +" u2 = localHistogram[hIdx-4];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-2];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-4];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-8];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" {\n" +" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" +" {\n" +" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" +" int binIdx = keys[ie];\n" +" int groupOffset = localHistogramToCarry[binIdx];\n" +" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+ie < n )\n" +" {\n" +" if ((groupOffset + myIdx)<n)\n" +" {\n" +" if (sortData[ie]==sortVal[ie])\n" +" {\n" +" \n" +" SortDataCL tmp;\n" +" tmp.m_key = sortData[ie];\n" +" tmp.m_value = sortVal[ie];\n" +" if (tmp.m_key == tmp.m_value)\n" +" gDst[groupOffset + myIdx ] = tmp;\n" +" }\n" +" \n" +" }\n" +" }\n" +"#else\n" +" if ((groupOffset + myIdx)<n)\n" +" {\n" +" gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n" +" gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n" +" }\n" +"#endif\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx < 
NUM_BUCKET )\n" +" {\n" +" localHistogramToCarry[lIdx] += myHistogram;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" +"{\n" +" \n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 realLocalIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" int counter[NUM_BUCKET];\n" +" \n" +" if (realLocalIdx>0)\n" +" return;\n" +" \n" +" for (int c=0;c<NUM_BUCKET;c++)\n" +" counter[c]=0;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" \n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" +" {\n" +" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" +" {\n" +" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" \n" +" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" +" {\n" +" int i = addr2+j;\n" +" if( i < n )\n" +" {\n" +" int tableIdx;\n" +" tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" \n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 realLocalIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" int counter[NUM_BUCKET];\n" +" \n" +" if (realLocalIdx>0)\n" +" return;\n" +" \n" +" for (int c=0;c<NUM_BUCKET;c++)\n" +" counter[c]=0;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" \n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" +" {\n" +" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" +" {\n" +" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" \n" +" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" +" {\n" +" int i = addr2+j;\n" +" if( i < n )\n" +" {\n" +" int tableIdx;\n" +" tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +; |