diff options
Diffstat (limited to 'thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels')
12 files changed, 3198 insertions, 0 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl new file mode 100644 index 0000000000..f3b4a1e8a7 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl @@ -0,0 +1,106 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +typedef struct +{ + u32 m_key; + u32 m_value; +}SortData; + + + +typedef struct +{ + u32 m_nSrc; + u32 m_nDst; + u32 m_padding[2]; +} ConstBuffer; + + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nSrc ) + { + SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1); + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = (gIdx==0)? first: src[gIdx-1]; + SortData jData = (gIdx==nSrc)? end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { +// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++) + u32 k = jData.m_key; + { + dst[k] = gIdx; + } + } + } +} + + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX+1; + + if( gIdx < nSrc+1 ) + { + SortData first; first.m_key = 0; first.m_value = 0; + SortData end; end.m_key = nDst; end.m_value = nDst; + + SortData iData = src[gIdx-1]; + SortData jData = (gIdx==nSrc)? end: src[gIdx]; + + if( iData.m_key != jData.m_key ) + { + u32 k = iData.m_key; + { + dst[k] = gIdx; + } + } + } +} + +__attribute__((reqd_work_group_size(64,1,1))) +__kernel +void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, + unsigned int nSrc, unsigned int nDst) +{ + int gIdx = GET_GLOBAL_IDX; + + + if( gIdx < nDst ) + { + C[gIdx] = A[gIdx] - B[gIdx]; + } +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h new file mode 100644 index 0000000000..9c9e847138 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h @@ -0,0 +1,87 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* boundSearchKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"typedef struct\n" +"{\n" +" u32 m_key; \n" +" u32 m_value;\n" +"}SortData;\n" +"typedef struct\n" +"{\n" +" u32 m_nSrc;\n" +" u32 m_nDst;\n" +" u32 m_padding[2];\n" +"} ConstBuffer;\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nSrc )\n" +" {\n" +" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +" SortData iData = (gIdx==0)? first: src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n" +" u32 k = jData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX+1;\n" +" if( gIdx < nSrc+1 )\n" +" {\n" +" SortData first; first.m_key = 0; first.m_value = 0;\n" +" SortData end; end.m_key = nDst; end.m_value = nDst;\n" +" SortData iData = src[gIdx-1];\n" +" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" +" if( iData.m_key != jData.m_key )\n" +" {\n" +" u32 k = iData.m_key;\n" +" {\n" +" dst[k] = gIdx;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"__kernel\n" +"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n" +" unsigned int nSrc, unsigned int nDst)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nDst )\n" +" {\n" +" C[gIdx] = A[gIdx] - B[gIdx];\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl new file mode 100644 index 0000000000..2eee5752ec --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl @@ -0,0 +1,128 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + int m_n; + int m_padding[3]; +} ConstBuffer; + + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy1F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float4 a0 = src[gIdx]; + + dst[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy2F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 2*gIdx <= cb.m_n ) + { + float4 a0 = src[gIdx*2+0]; + float4 a1 = src[gIdx*2+1]; + + dst[ gIdx*2+0 ] = a0; + dst[ gIdx*2+1 ] = a1; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void Copy4F4Kernel(__global float4* dst, __global float4* src, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( 4*gIdx <= cb.m_n ) + { + int idx0 = gIdx*4+0; + int idx1 = gIdx*4+1; + int idx2 = gIdx*4+2; + int idx3 = gIdx*4+3; + + float4 a0 = src[idx0]; + float4 a1 = src[idx1]; + float4 a2 = src[idx2]; + float4 a3 = src[idx3]; + + dst[ idx0 ] = a0; + dst[ idx1 ] = a1; + dst[ idx2 ] = a2; + dst[ idx3 ] = a3; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF1Kernel(__global float* dstF1, __global float* srcF1, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float a0 = srcF1[gIdx]; + + dstF1[ gIdx ] = a0; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, + ConstBuffer cb) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < cb.m_n ) + { + float2 a0 = srcF2[gIdx]; + + dstF2[ gIdx ] = a0; + } +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h new file mode 100644 index 0000000000..e5670e3cd3 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h @@ -0,0 +1,132 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* copyKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"\n" +"typedef struct\n" +"{\n" +" int m_n;\n" +" int m_padding[3];\n" +"} ConstBuffer;\n" +"\n" +"\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx];\n" +"\n" +" dst[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 2*gIdx <= cb.m_n )\n" +" {\n" +" float4 a0 = src[gIdx*2+0];\n" +" float4 a1 = src[gIdx*2+1];\n" +"\n" +" dst[ gIdx*2+0 ] = a0;\n" +" dst[ gIdx*2+1 ] = a1;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( 4*gIdx <= cb.m_n )\n" +" {\n" +" int idx0 = gIdx*4+0;\n" +" int idx1 = gIdx*4+1;\n" +" int idx2 = gIdx*4+2;\n" +" int idx3 = gIdx*4+3;\n" +"\n" +" float4 a0 = src[idx0];\n" +" float4 a1 = src[idx1];\n" +" float4 a2 = src[idx2];\n" +" float4 a3 = src[idx3];\n" +"\n" +" dst[ idx0 ] = a0;\n" +" dst[ idx1 ] = a1;\n" +" dst[ idx2 ] = a2;\n" +" dst[ idx3 ] = a3;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float a0 = srcF1[gIdx];\n" +"\n" +" dstF1[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n" +" ConstBuffer cb)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +"\n" +" if( gIdx < cb.m_n )\n" +" {\n" +" float2 a0 = srcF2[gIdx];\n" +"\n" +" dstF2[ gIdx ] = a0;\n" +" }\n" +"}\n" +"\n" +"\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl new file mode 100644 index 0000000000..71c31075dd --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl @@ -0,0 +1,107 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +typedef struct +{ + union + { + int4 m_data; + uint4 m_unsignedData; + float m_floatData; + }; + int m_offset; + int m_n; + int m_padding[2]; +} ConstBuffer; + + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num_elements ) + { + dstFloat[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt[ offset+gIdx ] = value; + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt2[ gIdx + offset] = make_int2( value.x, value.y ); + } +} + +__kernel +__attribute__((reqd_work_group_size(64,1,1))) +void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < num ) + { + dstInt4[ offset+gIdx ] = value; + } +} + diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h new file mode 100644 index 0000000000..4f8b96e489 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h @@ -0,0 +1,91 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* fillKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"typedef struct\n" +"{\n" +" union\n" +" {\n" +" int4 m_data;\n" +" uint4 m_unsignedData;\n" +" float m_floatData;\n" +" };\n" +" int m_offset;\n" +" int m_n;\n" +" int m_padding[2];\n" +"} ConstBuffer;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num_elements )\n" +" {\n" +" dstFloat[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(64,1,1)))\n" +"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < num )\n" +" {\n" +" dstInt4[ offset+gIdx ] = value;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl new file mode 100644 index 0000000000..c9da79854a --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize) +{ + float4 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + data[bi] += data[ai]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + blocksum = data[ n-1 ]; + data[ n-1 ] = 0; + } + + GROUP_LDS_BARRIER; + + offset >>= 1; + for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize ) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + float4 temp = data[ai]; + data[ai] = data[bi]; + data[bi] += temp; + } + } + GROUP_LDS_BARRIER; + + return blocksum; +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb) +{ + __local float4 ldsData[WG_SIZE*2]; + + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + + ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; + ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0; + + float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); + + if( lIdx == 0 ) + sumBuffer[GET_GROUP_IDX] = sum; + + if( (2*gIdx) < cb.m_numElems ) + { + dst[2*gIdx] = ldsData[2*lIdx]; + } + if( (2*gIdx + 1) < cb.m_numElems ) + { + dst[2*gIdx + 1] = ldsData[2*lIdx + 1]; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb) +{ + const u32 blockSize = WG_SIZE*2; + + int myIdx = GET_GROUP_IDX+1; + int lIdx = GET_LOCAL_IDX; + + float4 iBlockSum = blockSum[myIdx]; + + int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); + for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) + { + dst[i] += iBlockSum; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void TopLevelScanKernel(__global float4* dst, uint4 cb) +{ + __local float4 ldsData[2048]; + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + int lSize = GET_GROUP_SIZE; + + for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) + { + ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0; + } + + GROUP_LDS_BARRIER; + + float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); + + for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) + { + dst[i] = ldsData[i]; + } + + if( gIdx == 0 ) + { + dst[cb.m_numBlocks] = sum; + } +} diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl new file mode 100644 index 0000000000..963cc1e48e --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl @@ -0,0 +1,154 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) + +// takahiro end +#define WG_SIZE 128 +#define m_numElems x +#define m_numBlocks y +#define m_numScanBlocks z + +/*typedef struct +{ + uint m_numElems; + uint m_numBlocks; + uint m_numScanBlocks; + uint m_padding[1]; +} ConstBuffer; +*/ + +u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize) +{ + u32 blocksum; + int offset = 1; + for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) + { + GROUP_LDS_BARRIER; + for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + data[bi] += data[ai]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + blocksum = data[ n-1 ]; + data[ n-1 ] = 0; + } + + GROUP_LDS_BARRIER; + + offset >>= 1; + for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 ) + { + GROUP_LDS_BARRIER; + for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize ) + { + int ai = offset*(2*iIdx+1)-1; + int bi = offset*(2*iIdx+2)-1; + u32 temp = data[ai]; + data[ai] = data[bi]; + data[bi] += temp; + } + } + GROUP_LDS_BARRIER; + + return blocksum; +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer, + uint4 cb) +{ + __local u32 ldsData[WG_SIZE*2]; + + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + + ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0; + ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0; + + u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE); + + if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum; + + if( (2*gIdx) < cb.m_numElems ) + { + dst[2*gIdx] = ldsData[2*lIdx]; + } + if( (2*gIdx + 1) < cb.m_numElems ) + { + dst[2*gIdx + 1] = ldsData[2*lIdx + 1]; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb) +{ + const u32 blockSize = WG_SIZE*2; + + int myIdx = GET_GROUP_IDX+1; + int lIdx = GET_LOCAL_IDX; + + u32 iBlockSum = blockSum[myIdx]; + + int endValue = min((myIdx+1)*(blockSize), cb.m_numElems); + for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE) + { + dst[i] += iBlockSum; + } +} + +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +__kernel +void TopLevelScanKernel(__global u32* dst, uint4 cb) +{ + __local u32 ldsData[2048]; + int gIdx = GET_GLOBAL_IDX; + int lIdx = GET_LOCAL_IDX; + int lSize = GET_GROUP_SIZE; + + for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize ) + { + ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0; + } + + GROUP_LDS_BARRIER; + + u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE); + + for(int i=lIdx; i<cb.m_numBlocks; i+=lSize ) + { + dst[i] = ldsData[i]; + } + + if( gIdx == 0 ) + { + dst[cb.m_numBlocks] = sum; + } +} diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h new file mode 100644 index 0000000000..27baab8331 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h @@ -0,0 +1,129 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* prefixScanKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"// takahiro end\n" +"#define WG_SIZE 128 \n" +"#define m_numElems x\n" +"#define m_numBlocks y\n" +"#define m_numScanBlocks z\n" +"/*typedef struct\n" +"{\n" +" uint m_numElems;\n" +" uint m_numBlocks;\n" +" uint m_numScanBlocks;\n" +" uint m_padding[1];\n" +"} ConstBuffer;\n" +"*/\n" +"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n" +"{\n" +" u32 blocksum;\n" +" int offset = 1;\n" +" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" data[bi] += data[ai];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" blocksum = data[ n-1 ];\n" +" data[ n-1 ] = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" offset >>= 1;\n" +" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" u32 temp = data[ai];\n" +" data[ai] = data[bi];\n" +" data[bi] += temp;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" return blocksum;\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n" +" uint4 cb)\n" +"{\n" +" __local u32 ldsData[WG_SIZE*2];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n" +" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" +" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n" +" if( (2*gIdx) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx] = ldsData[2*lIdx];\n" +" }\n" +" if( (2*gIdx + 1) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n" +"{\n" +" const u32 blockSize = WG_SIZE*2;\n" +" int myIdx = GET_GROUP_IDX+1;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" u32 iBlockSum = blockSum[myIdx];\n" +" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" +" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" +" {\n" +" dst[i] += iBlockSum;\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n" +"{\n" +" __local u32 ldsData[2048];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int lSize = GET_GROUP_SIZE;\n" +" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" +" {\n" +" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" +" {\n" +" dst[i] = ldsData[i];\n" +" }\n" +" if( gIdx == 0 )\n" +" {\n" +" dst[cb.m_numBlocks] = sum;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h new file mode 100644 index 0000000000..5b13254796 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h @@ -0,0 +1,129 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* prefixScanKernelsFloat4CL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"// takahiro end\n" +"#define WG_SIZE 128 \n" +"#define m_numElems x\n" +"#define m_numBlocks y\n" +"#define m_numScanBlocks z\n" +"/*typedef struct\n" +"{\n" +" uint m_numElems;\n" +" uint m_numBlocks;\n" +" uint m_numScanBlocks;\n" +" uint m_padding[1];\n" +"} ConstBuffer;\n" +"*/\n" +"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n" +"{\n" +" float4 blocksum;\n" +" int offset = 1;\n" +" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" data[bi] += data[ai];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" blocksum = data[ n-1 ];\n" +" data[ n-1 ] = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" offset >>= 1;\n" +" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n" +" {\n" +" GROUP_LDS_BARRIER;\n" +" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n" +" {\n" +" int ai = offset*(2*iIdx+1)-1;\n" +" int bi = offset*(2*iIdx+2)-1;\n" +" float4 temp = data[ai];\n" +" data[ai] = data[bi];\n" +" data[bi] += temp;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" return blocksum;\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n" +"{\n" +" __local float4 ldsData[WG_SIZE*2];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n" +" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n" +" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" if( lIdx == 0 ) \n" +" sumBuffer[GET_GROUP_IDX] = sum;\n" +" if( (2*gIdx) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx] = ldsData[2*lIdx];\n" +" }\n" +" if( (2*gIdx + 1) < cb.m_numElems )\n" +" {\n" +" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n" +"{\n" +" const u32 blockSize = WG_SIZE*2;\n" +" int myIdx = GET_GROUP_IDX+1;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" float4 iBlockSum = blockSum[myIdx];\n" +" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n" +" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n" +" {\n" +" dst[i] += iBlockSum;\n" +" }\n" +"}\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"__kernel\n" +"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n" +"{\n" +" __local float4 ldsData[2048];\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int lSize = GET_GROUP_SIZE;\n" +" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n" +" {\n" +" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n" +" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n" +" {\n" +" dst[i] = ldsData[i];\n" +" }\n" +" if( gIdx == 0 )\n" +" {\n" +" dst[cb.m_numBlocks] = sum;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl new file mode 100644 index 0000000000..7402e2f3b3 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl @@ -0,0 +1,1071 @@ +/* +Bullet Continuous Collision Detection and Physics Library +Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Author Takahiro Harada + + +//#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +typedef unsigned int u32; +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AtomAdd(x, value) atom_add(&(x), value) + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + + +#define make_uint4 (uint4) +#define make_uint2 (uint2) +#define make_int2 (int2) + +#define WG_SIZE 64 +#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE) +#define BITS_PER_PASS 4 +#define NUM_BUCKET (1<<BITS_PER_PASS) +typedef uchar u8; + +// this isn't optimization for VLIW. But just reducing writes. +#define USE_2LEVEL_REDUCE 1 + +//#define CHECK_BOUNDARY 1 + +//#define NV_GPU 1 + + +// Cypress +#define nPerWI 16 +// Cayman +//#define nPerWI 20 + +#define m_n x +#define m_nWGs y +#define m_startBit z +#define m_nBlocksPerWG w + +/* +typedef struct +{ + int m_n; + int m_nWGs; + int m_startBit; + int m_nBlocksPerWG; +} ConstBuffer; +*/ + +typedef struct +{ + unsigned int m_key; + unsigned int m_value; +} SortDataCL; + + +uint prefixScanVectorEx( uint4* data ) +{ + u32 sum = 0; + u32 tmp = data[0].x; + data[0].x = sum; + sum += tmp; + tmp = data[0].y; + data[0].y = sum; + sum += tmp; + tmp = data[0].z; + data[0].z = sum; + sum += tmp; + tmp = data[0].w; + data[0].w = sum; + sum += tmp; + return sum; +} + +u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ ) +{ + { // Set data + sorterSharedMemory[lIdx] = 0; + sorterSharedMemory[lIdx+wgSize] = pData; + } + + GROUP_LDS_BARRIER; + + { // Prefix sum + int idx = 2*lIdx + (wgSize+1); +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < 64 ) + { + u32 u0, u1, u2; + u0 = sorterSharedMemory[idx-3]; + u1 = sorterSharedMemory[idx-2]; + u2 = sorterSharedMemory[idx-1]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + + u0 = sorterSharedMemory[idx-12]; + u1 = sorterSharedMemory[idx-8]; + u2 = sorterSharedMemory[idx-4]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + + u0 = sorterSharedMemory[idx-48]; + u1 = sorterSharedMemory[idx-32]; + u2 = sorterSharedMemory[idx-16]; + AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); + GROUP_MEM_FENCE; + if( wgSize > 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#else + if( lIdx < 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-1]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-4]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-16]; + GROUP_MEM_FENCE; + sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; + GROUP_MEM_FENCE; + if( wgSize > 64 ) + { + sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; + GROUP_MEM_FENCE; + } + + sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; + GROUP_MEM_FENCE; + } +#endif + } + + GROUP_LDS_BARRIER; + + *totalSum = sorterSharedMemory[wgSize*2-1]; + u32 addValue = sorterSharedMemory[lIdx+wgSize-1]; + return addValue; +} + +//__attribute__((reqd_work_group_size(128,1,1))) +uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + + +//__attribute__((reqd_work_group_size(64,1,1))) +uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) +{ + u32 s4 = prefixScanVectorEx( &pData ); + u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 ); + return pData + make_uint4( rank, rank, rank, rank ); +} + +u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;} + +u32 bit8Scan(u32 v) +{ + return (v<<8) + (v<<16) + (v<<24); +} + +//=== + + + + +#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx] + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb ) +{ + __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + for(int i=0; i<NUM_BUCKET; i++) + { + MY_HISTOGRAM(i) = 0; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + u32 localKey; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + { +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + { + localKey = (gSrc[addr+i]>>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<GET_GROUP_SIZE; i++) + { + sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; + } + histogramOut[lIdx*nWGs+wgIdx] = sum; + } +} + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb ) +{ + __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + for(int i=0; i<NUM_BUCKET; i++) + { + MY_HISTOGRAM(i) = 0; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + u32 localKey; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD + // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops + // AMD: AtomInc performs better while NV prefers ++ + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + { +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + { + localKey = (gSrc[addr+i].m_key>>startBit) & 0xf; +#if defined(NV_GPU) + MY_HISTOGRAM( localKey )++; +#else + AtomInc( MY_HISTOGRAM( localKey ) ); +#endif + } + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<GET_GROUP_SIZE; i++) + { + sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE]; + } + histogramOut[lIdx*nWGs+wgIdx] = sum; + } +} + +#define nPerLane (nPerWI/4) + +// NUM_BUCKET*nWGs < 128*nPerWI +__kernel +__attribute__((reqd_work_group_size(128,1,1))) +void PrefixScanKernel( __global u32* wHistogram1, int4 cb ) +{ + __local u32 ldsTopScanData[128*2]; + + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + const int nWGs = cb.m_nWGs; + + u32 data[nPerWI]; + for(int i=0; i<nPerWI; i++) + { + data[i] = 0; + if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs ) + data[i] = wHistogram1[nPerWI*lIdx+i]; + } + + uint4 myData = make_uint4(0,0,0,0); + + for(int i=0; i<nPerLane; i++) + { + myData.x += data[nPerLane*0+i]; + myData.y += data[nPerLane*1+i]; + myData.z += data[nPerLane*2+i]; + myData.w += data[nPerLane*3+i]; + } + + uint totalSum; + uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData ); + +// for(int j=0; j<4; j++) // somehow it introduces a lot of branches + { int j = 0; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 1; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 2; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + { int j = 3; + u32 sum = 0; + for(int i=0; i<nPerLane; i++) + { + u32 tmp = data[nPerLane*j+i]; + data[nPerLane*j+i] = sum; + sum += tmp; + } + } + + for(int i=0; i<nPerLane; i++) + { + data[nPerLane*0+i] += scanned.x; + data[nPerLane*1+i] += scanned.y; + data[nPerLane*2+i] += scanned.z; + data[nPerLane*3+i] += scanned.w; + } + + for(int i=0; i<nPerWI; i++) + { + int index = nPerWI*lIdx+i; + if (index < NUM_BUCKET*nWGs) + wHistogram1[nPerWI*lIdx+i] = data[i]; + } +} + +// 4 scan, 4 exchange +void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) +{ + for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++) + { + u32 mask = (1<<bitIdx); + uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask ); + uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) ); + u32 total; + prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData ); + { + uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3); + uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total ); + dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) ); + + GROUP_LDS_BARRIER; + + ldsSortData[dstAddr.x] = sortData[0]; + ldsSortData[dstAddr.y] = sortData[1]; + ldsSortData[dstAddr.z] = sortData[2]; + ldsSortData[dstAddr.w] = sortData[3]; + + GROUP_LDS_BARRIER; + + sortData[0] = ldsSortData[localAddr.x]; + sortData[1] = ldsSortData[localAddr.y]; + sortData[2] = ldsSortData[localAddr.z]; + sortData[3] = ldsSortData[localAddr.w]; + + GROUP_LDS_BARRIER; + } + } +} + +// 2 scan, 2 exchange +void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) +{ + for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2) + { + uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + +#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key] + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + u32 myHistogram = 0; + + u32 sortData[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff; +#else + sortData[i] = gSrc[ addr+i ]; +#endif + + sort4Bits(sortData, startBit, lIdx, ldsSortData); + + u32 keys[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + keys[i] = (sortData[i]>>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + +#if defined(NV_GPU) + SET_HISTOGRAM( setIdx, keys[i] )++; +#else + AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) ); +#endif + + GROUP_LDS_BARRIER; + + uint hIdx = NUM_BUCKET+lIdx; + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<WG_SIZE/16; i++) + { + sum += SET_HISTOGRAM( i, lIdx ); + } + myHistogram = sum; + localHistogram[hIdx] = sum; + } + GROUP_LDS_BARRIER; + +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + + u32 u0, u1, u2; + u0 = localHistogram[hIdx-3]; + u1 = localHistogram[hIdx-2]; + u2 = localHistogram[hIdx-1]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + u0 = localHistogram[hIdx-12]; + u1 = localHistogram[hIdx-8]; + u2 = localHistogram[hIdx-4]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + } +#else + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-2]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-4]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-8]; + GROUP_MEM_FENCE; + } +#endif + GROUP_LDS_BARRIER; + } + + { + for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++) + { + int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; + int binIdx = keys[ie]; + int groupOffset = localHistogramToCarry[binIdx]; + int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; +#if defined(CHECK_BOUNDARY) + if( addr+ie < n ) +#endif + gDst[ groupOffset + myIdx ] = sortData[ie]; + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + localHistogramToCarry[lIdx] += myHistogram; + } + GROUP_LDS_BARRIER; + } +} + +// 2 scan, 2 exchange +void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal) +{ + for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2) + { + uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, + (sortData[1]>>(startBit+ibit)) & 0x3, + (sortData[2]>>(startBit+ibit)) & 0x3, + (sortData[3]>>(startBit+ibit)) & 0x3); + + u32 key4; + u32 sKeyPacked[4] = { 0, 0, 0, 0 }; + { + sKeyPacked[0] |= 1<<(8*b.x); + sKeyPacked[1] |= 1<<(8*b.y); + sKeyPacked[2] |= 1<<(8*b.z); + sKeyPacked[3] |= 1<<(8*b.w); + + key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; + } + + u32 rankPacked; + u32 sumPacked; + { + rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); + } + + GROUP_LDS_BARRIER; + + u32 newOffset[4] = { 0,0,0,0 }; + { + u32 sumScanned = bit8Scan( sumPacked ); + + u32 scannedKeys[4]; + scannedKeys[0] = 1<<(8*b.x); + scannedKeys[1] = 1<<(8*b.y); + scannedKeys[2] = 1<<(8*b.z); + scannedKeys[3] = 1<<(8*b.w); + { // 4 scans at once + u32 sum4 = 0; + for(int ie=0; ie<4; ie++) + { + u32 tmp = scannedKeys[ie]; + scannedKeys[ie] = sum4; + sum4 += tmp; + } + } + + { + u32 sumPlusRank = sumScanned + rankPacked; + { u32 ie = b.x; + scannedKeys[0] += sumPlusRank; + newOffset[0] = unpack4Key( scannedKeys[0], ie ); + } + { u32 ie = b.y; + scannedKeys[1] += sumPlusRank; + newOffset[1] = unpack4Key( scannedKeys[1], ie ); + } + { u32 ie = b.z; + scannedKeys[2] += sumPlusRank; + newOffset[2] = unpack4Key( scannedKeys[2], ie ); + } + { u32 ie = b.w; + scannedKeys[3] += sumPlusRank; + newOffset[3] = unpack4Key( scannedKeys[3], ie ); + } + } + } + + + GROUP_LDS_BARRIER; + + { + ldsSortData[newOffset[0]] = sortData[0]; + ldsSortData[newOffset[1]] = sortData[1]; + ldsSortData[newOffset[2]] = sortData[2]; + ldsSortData[newOffset[3]] = sortData[3]; + + ldsSortVal[newOffset[0]] = sortVal[0]; + ldsSortVal[newOffset[1]] = sortVal[1]; + ldsSortVal[newOffset[2]] = sortVal[2]; + ldsSortVal[newOffset[3]] = sortVal[3]; + + GROUP_LDS_BARRIER; + + u32 dstAddr = 4*lIdx; + sortData[0] = ldsSortData[dstAddr+0]; + sortData[1] = ldsSortData[dstAddr+1]; + sortData[2] = ldsSortData[dstAddr+2]; + sortData[3] = ldsSortData[dstAddr+3]; + + sortVal[0] = ldsSortVal[dstAddr+0]; + sortVal[1] = ldsSortVal[dstAddr+1]; + sortVal[2] = ldsSortVal[dstAddr+2]; + sortVal[3] = ldsSortVal[dstAddr+3]; + + GROUP_LDS_BARRIER; + } + } +} + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) +{ + __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; + __local u32 localHistogramToCarry[NUM_BUCKET]; + __local u32 localHistogram[NUM_BUCKET*2]; + + u32 gIdx = GET_GLOBAL_IDX; + u32 lIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int startBit = cb.m_startBit; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + if( lIdx < (NUM_BUCKET) ) + { + localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; + } + + GROUP_LDS_BARRIER; + + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; + + int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize) + { + + u32 myHistogram = 0; + + int sortData[ELEMENTS_PER_WORK_ITEM]; + int sortVal[ELEMENTS_PER_WORK_ITEM]; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + { + sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff; + sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff; + } +#else + { + sortData[i] = gSrc[ addr+i ].m_key; + sortVal[i] = gSrc[ addr+i ].m_value; + } +#endif + + sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal); + + u32 keys[ELEMENTS_PER_WORK_ITEM]; + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) + keys[i] = (sortData[i]>>startBit) & 0xf; + + { // create histogram + u32 setIdx = lIdx/16; + if( lIdx < NUM_BUCKET ) + { + localHistogram[lIdx] = 0; + } + ldsSortData[lIdx] = 0; + GROUP_LDS_BARRIER; + + for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++) +#if defined(CHECK_BOUNDARY) + if( addr+i < n ) +#endif + +#if defined(NV_GPU) + SET_HISTOGRAM( setIdx, keys[i] )++; +#else + AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) ); +#endif + + GROUP_LDS_BARRIER; + + uint hIdx = NUM_BUCKET+lIdx; + if( lIdx < NUM_BUCKET ) + { + u32 sum = 0; + for(int i=0; i<WG_SIZE/16; i++) + { + sum += SET_HISTOGRAM( i, lIdx ); + } + myHistogram = sum; + localHistogram[hIdx] = sum; + } + GROUP_LDS_BARRIER; + +#if defined(USE_2LEVEL_REDUCE) + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + + u32 u0, u1, u2; + u0 = localHistogram[hIdx-3]; + u1 = localHistogram[hIdx-2]; + u2 = localHistogram[hIdx-1]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + u0 = localHistogram[hIdx-12]; + u1 = localHistogram[hIdx-8]; + u2 = localHistogram[hIdx-4]; + AtomAdd( localHistogram[hIdx], u0 + u1 + u2 ); + GROUP_MEM_FENCE; + } +#else + if( lIdx < NUM_BUCKET ) + { + localHistogram[hIdx] = localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-1]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-2]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-4]; + GROUP_MEM_FENCE; + localHistogram[hIdx] += localHistogram[hIdx-8]; + GROUP_MEM_FENCE; + } +#endif + GROUP_LDS_BARRIER; + } + + { + for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++) + { + int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie; + int binIdx = keys[ie]; + int groupOffset = localHistogramToCarry[binIdx]; + int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx]; +#if defined(CHECK_BOUNDARY) + if( addr+ie < n ) + { + if ((groupOffset + myIdx)<n) + { + if (sortData[ie]==sortVal[ie]) + { + + SortDataCL tmp; + tmp.m_key = sortData[ie]; + tmp.m_value = sortVal[ie]; + if (tmp.m_key == tmp.m_value) + gDst[groupOffset + myIdx ] = tmp; + } + + } + } +#else + if ((groupOffset + myIdx)<n) + { + gDst[ groupOffset + myIdx ].m_key = sortData[ie]; + gDst[ groupOffset + myIdx ].m_value = sortVal[ie]; + } +#endif + } + } + + GROUP_LDS_BARRIER; + + if( lIdx < NUM_BUCKET ) + { + localHistogramToCarry[lIdx] += myHistogram; + } + GROUP_LDS_BARRIER; + } +} + + + + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) +{ + + u32 gIdx = GET_GLOBAL_IDX; + u32 realLocalIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + int counter[NUM_BUCKET]; + + if (realLocalIdx>0) + return; + + for (int c=0;c<NUM_BUCKET;c++) + counter[c]=0; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx=0;lIdx<WG_SIZE;lIdx++) + { + int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + int i = addr2+j; + if( i < n ) + { + int tableIdx; + tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) +{ + + u32 gIdx = GET_GLOBAL_IDX; + u32 realLocalIdx = GET_LOCAL_IDX; + u32 wgIdx = GET_GROUP_IDX; + u32 wgSize = GET_GROUP_SIZE; + const int startBit = cb.m_startBit; + const int n = cb.m_n; + const int nWGs = cb.m_nWGs; + const int nBlocksPerWG = cb.m_nBlocksPerWG; + + int counter[NUM_BUCKET]; + + if (realLocalIdx>0) + return; + + for (int c=0;c<NUM_BUCKET;c++) + counter[c]=0; + + const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; + + int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx; + + for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++) + { + for (int lIdx=0;lIdx<WG_SIZE;lIdx++) + { + int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; + + for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++) + { + int i = addr2+j; + if( i < n ) + { + int tableIdx; + tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1 + gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; + counter[tableIdx] ++; + } + } + } + } + +}
\ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h new file mode 100644 index 0000000000..8876c16aa6 --- /dev/null +++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h @@ -0,0 +1,910 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* radixSort32KernelsCL= \ +"/*\n" +"Bullet Continuous Collision Detection and Physics Library\n" +"Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Author Takahiro Harada\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"typedef unsigned int u32;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_uint4 (uint4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define WG_SIZE 64\n" +"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n" +"#define BITS_PER_PASS 4\n" +"#define NUM_BUCKET (1<<BITS_PER_PASS)\n" +"typedef uchar u8;\n" +"// this isn't optimization for VLIW. But just reducing writes. \n" +"#define USE_2LEVEL_REDUCE 1\n" +"//#define CHECK_BOUNDARY 1\n" +"//#define NV_GPU 1\n" +"// Cypress\n" +"#define nPerWI 16\n" +"// Cayman\n" +"//#define nPerWI 20\n" +"#define m_n x\n" +"#define m_nWGs y\n" +"#define m_startBit z\n" +"#define m_nBlocksPerWG w\n" +"/*\n" +"typedef struct\n" +"{\n" +" int m_n;\n" +" int m_nWGs;\n" +" int m_startBit;\n" +" int m_nBlocksPerWG;\n" +"} ConstBuffer;\n" +"*/\n" +"typedef struct\n" +"{\n" +" unsigned int m_key;\n" +" unsigned int m_value;\n" +"} SortDataCL;\n" +"uint prefixScanVectorEx( uint4* data )\n" +"{\n" +" u32 sum = 0;\n" +" u32 tmp = data[0].x;\n" +" data[0].x = sum;\n" +" sum += tmp;\n" +" tmp = data[0].y;\n" +" data[0].y = sum;\n" +" sum += tmp;\n" +" tmp = data[0].z;\n" +" data[0].z = sum;\n" +" sum += tmp;\n" +" tmp = data[0].w;\n" +" data[0].w = sum;\n" +" sum += tmp;\n" +" return sum;\n" +"}\n" +"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n" +"{\n" +" { // Set data\n" +" sorterSharedMemory[lIdx] = 0;\n" +" sorterSharedMemory[lIdx+wgSize] = pData;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" { // Prefix sum\n" +" int idx = 2*lIdx + (wgSize+1);\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < 64 )\n" +" {\n" +" u32 u0, u1, u2;\n" +" u0 = sorterSharedMemory[idx-3];\n" +" u1 = sorterSharedMemory[idx-2];\n" +" u2 = sorterSharedMemory[idx-1];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" u0 = sorterSharedMemory[idx-12];\n" +" u1 = sorterSharedMemory[idx-8];\n" +" u2 = sorterSharedMemory[idx-4];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" u0 = sorterSharedMemory[idx-48];\n" +" u1 = sorterSharedMemory[idx-32];\n" +" u2 = sorterSharedMemory[idx-16];\n" +" AtomAdd( sorterSharedMemory[idx], u0+u1+u2 ); \n" +" GROUP_MEM_FENCE;\n" +" if( wgSize > 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n" +" GROUP_MEM_FENCE;\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n" +" GROUP_MEM_FENCE;\n" +" if( wgSize > 64 )\n" +" {\n" +" sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +" sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" *totalSum = sorterSharedMemory[wgSize*2-1];\n" +" u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n" +" return addValue;\n" +"}\n" +"//__attribute__((reqd_work_group_size(128,1,1)))\n" +"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"//__attribute__((reqd_work_group_size(64,1,1)))\n" +"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" +"{\n" +" u32 s4 = prefixScanVectorEx( &pData );\n" +" u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n" +" return pData + make_uint4( rank, rank, rank, rank );\n" +"}\n" +"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n" +"u32 bit8Scan(u32 v)\n" +"{\n" +" return (v<<8) + (v<<16) + (v<<24);\n" +"}\n" +"//===\n" +"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" +"{\n" +" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" for(int i=0; i<NUM_BUCKET; i++)\n" +" {\n" +" MY_HISTOGRAM(i) = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" u32 localKey;\n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" +" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n" +" // AMD: AtomInc performs better while NV prefers ++\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" {\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +" {\n" +" localKey = (gSrc[addr+i]>>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<GET_GROUP_SIZE; i++)\n" +" {\n" +" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" +" }\n" +" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )\n" +"{\n" +" __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" for(int i=0; i<NUM_BUCKET; i++)\n" +" {\n" +" MY_HISTOGRAM(i) = 0;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" u32 localKey;\n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n" +" // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n" +" // AMD: AtomInc performs better while NV prefers ++\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" {\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +" {\n" +" localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n" +"#if defined(NV_GPU)\n" +" MY_HISTOGRAM( localKey )++;\n" +"#else\n" +" AtomInc( MY_HISTOGRAM( localKey ) );\n" +"#endif\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<GET_GROUP_SIZE; i++)\n" +" {\n" +" sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n" +" }\n" +" histogramOut[lIdx*nWGs+wgIdx] = sum;\n" +" }\n" +"}\n" +"#define nPerLane (nPerWI/4)\n" +"// NUM_BUCKET*nWGs < 128*nPerWI\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(128,1,1)))\n" +"void PrefixScanKernel( __global u32* wHistogram1, int4 cb )\n" +"{\n" +" __local u32 ldsTopScanData[128*2];\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" const int nWGs = cb.m_nWGs;\n" +" u32 data[nPerWI];\n" +" for(int i=0; i<nPerWI; i++)\n" +" {\n" +" data[i] = 0;\n" +" if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n" +" data[i] = wHistogram1[nPerWI*lIdx+i];\n" +" }\n" +" uint4 myData = make_uint4(0,0,0,0);\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" myData.x += data[nPerLane*0+i];\n" +" myData.y += data[nPerLane*1+i];\n" +" myData.z += data[nPerLane*2+i];\n" +" myData.w += data[nPerLane*3+i];\n" +" }\n" +" uint totalSum;\n" +" uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n" +"// for(int j=0; j<4; j++) // somehow it introduces a lot of branches\n" +" { int j = 0;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 1;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 2;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" { int j = 3;\n" +" u32 sum = 0;\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" u32 tmp = data[nPerLane*j+i];\n" +" data[nPerLane*j+i] = sum;\n" +" sum += tmp;\n" +" }\n" +" }\n" +" for(int i=0; i<nPerLane; i++)\n" +" {\n" +" data[nPerLane*0+i] += scanned.x;\n" +" data[nPerLane*1+i] += scanned.y;\n" +" data[nPerLane*2+i] += scanned.z;\n" +" data[nPerLane*3+i] += scanned.w;\n" +" }\n" +" for(int i=0; i<nPerWI; i++)\n" +" {\n" +" int index = nPerWI*lIdx+i;\n" +" if (index < NUM_BUCKET*nWGs)\n" +" wHistogram1[nPerWI*lIdx+i] = data[i];\n" +" }\n" +"}\n" +"// 4 scan, 4 exchange\n" +"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" +"{\n" +" for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n" +" {\n" +" u32 mask = (1<<bitIdx);\n" +" uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" +" uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" +" u32 total;\n" +" prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" +" {\n" +" uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" +" uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n" +" dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" +" GROUP_LDS_BARRIER;\n" +" ldsSortData[dstAddr.x] = sortData[0];\n" +" ldsSortData[dstAddr.y] = sortData[1];\n" +" ldsSortData[dstAddr.z] = sortData[2];\n" +" ldsSortData[dstAddr.w] = sortData[3];\n" +" GROUP_LDS_BARRIER;\n" +" sortData[0] = ldsSortData[localAddr.x];\n" +" sortData[1] = ldsSortData[localAddr.y];\n" +" sortData[2] = ldsSortData[localAddr.z];\n" +" sortData[3] = ldsSortData[localAddr.w];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"// 2 scan, 2 exchange\n" +"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" +"{\n" +" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" +" {\n" +" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +" GROUP_LDS_BARRIER;\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" u32 myHistogram = 0;\n" +" u32 sortData[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n" +"#else\n" +" sortData[i] = gSrc[ addr+i ];\n" +"#endif\n" +" sort4Bits(sortData, startBit, lIdx, ldsSortData);\n" +" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" keys[i] = (sortData[i]>>startBit) & 0xf;\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +"#if defined(NV_GPU)\n" +" SET_HISTOGRAM( setIdx, keys[i] )++;\n" +"#else\n" +" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" +"#endif\n" +" \n" +" GROUP_LDS_BARRIER;\n" +" \n" +" uint hIdx = NUM_BUCKET+lIdx;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<WG_SIZE/16; i++)\n" +" {\n" +" sum += SET_HISTOGRAM( i, lIdx );\n" +" }\n" +" myHistogram = sum;\n" +" localHistogram[hIdx] = sum;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" u32 u0, u1, u2;\n" +" u0 = localHistogram[hIdx-3];\n" +" u1 = localHistogram[hIdx-2];\n" +" u2 = localHistogram[hIdx-1];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" u0 = localHistogram[hIdx-12];\n" +" u1 = localHistogram[hIdx-8];\n" +" u2 = localHistogram[hIdx-4];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-2];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-4];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-8];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" {\n" +" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" +" {\n" +" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" +" int binIdx = keys[ie];\n" +" int groupOffset = localHistogramToCarry[binIdx];\n" +" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+ie < n )\n" +"#endif\n" +" gDst[ groupOffset + myIdx ] = sortData[ie];\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogramToCarry[lIdx] += myHistogram;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +"}\n" +"// 2 scan, 2 exchange\n" +"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n" +"{\n" +" for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n" +" {\n" +" uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n" +" (sortData[1]>>(startBit+ibit)) & 0x3, \n" +" (sortData[2]>>(startBit+ibit)) & 0x3, \n" +" (sortData[3]>>(startBit+ibit)) & 0x3);\n" +" u32 key4;\n" +" u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" +" {\n" +" sKeyPacked[0] |= 1<<(8*b.x);\n" +" sKeyPacked[1] |= 1<<(8*b.y);\n" +" sKeyPacked[2] |= 1<<(8*b.z);\n" +" sKeyPacked[3] |= 1<<(8*b.w);\n" +" key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" +" }\n" +" u32 rankPacked;\n" +" u32 sumPacked;\n" +" {\n" +" rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" u32 newOffset[4] = { 0,0,0,0 };\n" +" {\n" +" u32 sumScanned = bit8Scan( sumPacked );\n" +" u32 scannedKeys[4];\n" +" scannedKeys[0] = 1<<(8*b.x);\n" +" scannedKeys[1] = 1<<(8*b.y);\n" +" scannedKeys[2] = 1<<(8*b.z);\n" +" scannedKeys[3] = 1<<(8*b.w);\n" +" { // 4 scans at once\n" +" u32 sum4 = 0;\n" +" for(int ie=0; ie<4; ie++)\n" +" {\n" +" u32 tmp = scannedKeys[ie];\n" +" scannedKeys[ie] = sum4;\n" +" sum4 += tmp;\n" +" }\n" +" }\n" +" {\n" +" u32 sumPlusRank = sumScanned + rankPacked;\n" +" { u32 ie = b.x;\n" +" scannedKeys[0] += sumPlusRank;\n" +" newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" +" }\n" +" { u32 ie = b.y;\n" +" scannedKeys[1] += sumPlusRank;\n" +" newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" +" }\n" +" { u32 ie = b.z;\n" +" scannedKeys[2] += sumPlusRank;\n" +" newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" +" }\n" +" { u32 ie = b.w;\n" +" scannedKeys[3] += sumPlusRank;\n" +" newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" +" }\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" {\n" +" ldsSortData[newOffset[0]] = sortData[0];\n" +" ldsSortData[newOffset[1]] = sortData[1];\n" +" ldsSortData[newOffset[2]] = sortData[2];\n" +" ldsSortData[newOffset[3]] = sortData[3];\n" +" ldsSortVal[newOffset[0]] = sortVal[0];\n" +" ldsSortVal[newOffset[1]] = sortVal[1];\n" +" ldsSortVal[newOffset[2]] = sortVal[2];\n" +" ldsSortVal[newOffset[3]] = sortVal[3];\n" +" GROUP_LDS_BARRIER;\n" +" u32 dstAddr = 4*lIdx;\n" +" sortData[0] = ldsSortData[dstAddr+0];\n" +" sortData[1] = ldsSortData[dstAddr+1];\n" +" sortData[2] = ldsSortData[dstAddr+2];\n" +" sortData[3] = ldsSortData[dstAddr+3];\n" +" sortVal[0] = ldsSortVal[dstAddr+0];\n" +" sortVal[1] = ldsSortVal[dstAddr+1];\n" +" sortVal[2] = ldsSortVal[dstAddr+2];\n" +" sortVal[3] = ldsSortVal[dstAddr+3];\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" +"{\n" +" __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" +" __local u32 localHistogramToCarry[NUM_BUCKET];\n" +" __local u32 localHistogram[NUM_BUCKET*2];\n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 lIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int startBit = cb.m_startBit;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" if( lIdx < (NUM_BUCKET) )\n" +" {\n" +" localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" +" int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n" +" {\n" +" u32 myHistogram = 0;\n" +" int sortData[ELEMENTS_PER_WORK_ITEM];\n" +" int sortVal[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" {\n" +" sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n" +" sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n" +" }\n" +"#else\n" +" {\n" +" sortData[i] = gSrc[ addr+i ].m_key;\n" +" sortVal[i] = gSrc[ addr+i ].m_value;\n" +" }\n" +"#endif\n" +" sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n" +" u32 keys[ELEMENTS_PER_WORK_ITEM];\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +" keys[i] = (sortData[i]>>startBit) & 0xf;\n" +" { // create histogram\n" +" u32 setIdx = lIdx/16;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[lIdx] = 0;\n" +" }\n" +" ldsSortData[lIdx] = 0;\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+i < n )\n" +"#endif\n" +"#if defined(NV_GPU)\n" +" SET_HISTOGRAM( setIdx, keys[i] )++;\n" +"#else\n" +" AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n" +"#endif\n" +" \n" +" GROUP_LDS_BARRIER;\n" +" \n" +" uint hIdx = NUM_BUCKET+lIdx;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" u32 sum = 0;\n" +" for(int i=0; i<WG_SIZE/16; i++)\n" +" {\n" +" sum += SET_HISTOGRAM( i, lIdx );\n" +" }\n" +" myHistogram = sum;\n" +" localHistogram[hIdx] = sum;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +"#if defined(USE_2LEVEL_REDUCE)\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" u32 u0, u1, u2;\n" +" u0 = localHistogram[hIdx-3];\n" +" u1 = localHistogram[hIdx-2];\n" +" u2 = localHistogram[hIdx-1];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" u0 = localHistogram[hIdx-12];\n" +" u1 = localHistogram[hIdx-8];\n" +" u2 = localHistogram[hIdx-4];\n" +" AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#else\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogram[hIdx] = localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-1];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-2];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-4];\n" +" GROUP_MEM_FENCE;\n" +" localHistogram[hIdx] += localHistogram[hIdx-8];\n" +" GROUP_MEM_FENCE;\n" +" }\n" +"#endif\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" {\n" +" for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n" +" {\n" +" int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n" +" int binIdx = keys[ie];\n" +" int groupOffset = localHistogramToCarry[binIdx];\n" +" int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n" +"#if defined(CHECK_BOUNDARY)\n" +" if( addr+ie < n )\n" +" {\n" +" if ((groupOffset + myIdx)<n)\n" +" {\n" +" if (sortData[ie]==sortVal[ie])\n" +" {\n" +" \n" +" SortDataCL tmp;\n" +" tmp.m_key = sortData[ie];\n" +" tmp.m_value = sortVal[ie];\n" +" if (tmp.m_key == tmp.m_value)\n" +" gDst[groupOffset + myIdx ] = tmp;\n" +" }\n" +" \n" +" }\n" +" }\n" +"#else\n" +" if ((groupOffset + myIdx)<n)\n" +" {\n" +" gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n" +" gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n" +" }\n" +"#endif\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx < NUM_BUCKET )\n" +" {\n" +" localHistogramToCarry[lIdx] += myHistogram;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" +"{\n" +" \n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 realLocalIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" int counter[NUM_BUCKET];\n" +" \n" +" if (realLocalIdx>0)\n" +" return;\n" +" \n" +" for (int c=0;c<NUM_BUCKET;c++)\n" +" counter[c]=0;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" \n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" +" {\n" +" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" +" {\n" +" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" \n" +" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" +" {\n" +" int i = addr2+j;\n" +" if( i < n )\n" +" {\n" +" int tableIdx;\n" +" tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" +"{\n" +" \n" +" u32 gIdx = GET_GLOBAL_IDX;\n" +" u32 realLocalIdx = GET_LOCAL_IDX;\n" +" u32 wgIdx = GET_GROUP_IDX;\n" +" u32 wgSize = GET_GROUP_SIZE;\n" +" const int startBit = cb.m_startBit;\n" +" const int n = cb.m_n;\n" +" const int nWGs = cb.m_nWGs;\n" +" const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" +" int counter[NUM_BUCKET];\n" +" \n" +" if (realLocalIdx>0)\n" +" return;\n" +" \n" +" for (int c=0;c<NUM_BUCKET;c++)\n" +" counter[c]=0;\n" +" const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" +" \n" +" int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n" +" for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n" +" {\n" +" for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n" +" {\n" +" int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" +" \n" +" for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n" +" {\n" +" int i = addr2+j;\n" +" if( i < n )\n" +" {\n" +" int tableIdx;\n" +" tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n" +" gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" +" counter[tableIdx] ++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" \n" +"}\n" +; |