From 3d7f1555865a981b7144becfc58d3f3f34362f5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Wed, 9 Mar 2022 21:15:53 +0100 Subject: Remove unused Bullet module and thirdparty code It has been disabled in `master` since one year (#45852) and our plan is for Bullet, and possibly other thirdparty physics engines, to be implemented via GDExtension so that they can be selected by the users who need them. --- .../kernels/BoundSearchKernels.cl | 106 -- .../kernels/BoundSearchKernelsCL.h | 86 -- .../ParallelPrimitives/kernels/CopyKernels.cl | 128 --- .../ParallelPrimitives/kernels/CopyKernelsCL.h | 131 --- .../ParallelPrimitives/kernels/FillKernels.cl | 107 -- .../ParallelPrimitives/kernels/FillKernelsCL.h | 90 -- .../kernels/PrefixScanFloat4Kernels.cl | 154 --- .../kernels/PrefixScanKernels.cl | 154 --- .../kernels/PrefixScanKernelsCL.h | 128 --- .../kernels/PrefixScanKernelsFloat4CL.h | 128 --- .../kernels/RadixSort32Kernels.cl | 1071 -------------------- .../kernels/RadixSort32KernelsCL.h | 909 ----------------- 12 files changed, 3192 deletions(-) delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernels.cl delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl delete mode 100644 thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h (limited to 'thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels') diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl deleted file mode 100644 index f3b4a1e8a7..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernels.cl +++ /dev/null @@ -1,106 +0,0 @@ -/* -Copyright (c) 2012 Advanced Micro Devices, Inc. - -This software is provided 'as-is', without any express or implied warranty. -In no event will the authors be held liable for any damages arising from the use of this software. -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it freely, -subject to the following restrictions: - -1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. -3. This notice may not be removed or altered from any source distribution. -*/ -//Originally written by Takahiro Harada - - -typedef unsigned int u32; -#define GET_GROUP_IDX get_group_id(0) -#define GET_LOCAL_IDX get_local_id(0) -#define GET_GLOBAL_IDX get_global_id(0) -#define GET_GROUP_SIZE get_local_size(0) -#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) - -typedef struct -{ - u32 m_key; - u32 m_value; -}SortData; - - - -typedef struct -{ - u32 m_nSrc; - u32 m_nDst; - u32 m_padding[2]; -} ConstBuffer; - - - -__attribute__((reqd_work_group_size(64,1,1))) -__kernel -void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, - unsigned int nSrc, unsigned int nDst) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < nSrc ) - { - SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1); - SortData end; end.m_key = nDst; end.m_value = nDst; - - SortData iData = (gIdx==0)? first: src[gIdx-1]; - SortData jData = (gIdx==nSrc)? end: src[gIdx]; - - if( iData.m_key != jData.m_key ) - { -// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++) - u32 k = jData.m_key; - { - dst[k] = gIdx; - } - } - } -} - - -__attribute__((reqd_work_group_size(64,1,1))) -__kernel -void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, - unsigned int nSrc, unsigned int nDst) -{ - int gIdx = GET_GLOBAL_IDX+1; - - if( gIdx < nSrc+1 ) - { - SortData first; first.m_key = 0; first.m_value = 0; - SortData end; end.m_key = nDst; end.m_value = nDst; - - SortData iData = src[gIdx-1]; - SortData jData = (gIdx==nSrc)? end: src[gIdx]; - - if( iData.m_key != jData.m_key ) - { - u32 k = iData.m_key; - { - dst[k] = gIdx; - } - } - } -} - -__attribute__((reqd_work_group_size(64,1,1))) -__kernel -void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, - unsigned int nSrc, unsigned int nDst) -{ - int gIdx = GET_GLOBAL_IDX; - - - if( gIdx < nDst ) - { - C[gIdx] = A[gIdx] - B[gIdx]; - } -} - diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h deleted file mode 100644 index 1758dd41e3..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h +++ /dev/null @@ -1,86 +0,0 @@ -//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* boundSearchKernelsCL = - "/*\n" - "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" - "This software is provided 'as-is', without any express or implied warranty.\n" - "In no event will the authors be held liable for any damages arising from the use of this software.\n" - "Permission is granted to anyone to use this software for any purpose, \n" - "including commercial applications, and to alter it and redistribute it freely, \n" - "subject to the following restrictions:\n" - "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" - "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" - "3. This notice may not be removed or altered from any source distribution.\n" - "*/\n" - "//Originally written by Takahiro Harada\n" - "typedef unsigned int u32;\n" - "#define GET_GROUP_IDX get_group_id(0)\n" - "#define GET_LOCAL_IDX get_local_id(0)\n" - "#define GET_GLOBAL_IDX get_global_id(0)\n" - "#define GET_GROUP_SIZE get_local_size(0)\n" - "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" - "typedef struct\n" - "{\n" - " u32 m_key; \n" - " u32 m_value;\n" - "}SortData;\n" - "typedef struct\n" - "{\n" - " u32 m_nSrc;\n" - " u32 m_nDst;\n" - " u32 m_padding[2];\n" - "} ConstBuffer;\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "__kernel\n" - "void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n" - " unsigned int nSrc, unsigned int nDst)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - " if( gIdx < nSrc )\n" - " {\n" - " SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n" - " SortData end; end.m_key = nDst; end.m_value = nDst;\n" - " SortData iData = (gIdx==0)? first: src[gIdx-1];\n" - " SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" - " if( iData.m_key != jData.m_key )\n" - " {\n" - "// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n" - " u32 k = jData.m_key;\n" - " {\n" - " dst[k] = gIdx;\n" - " }\n" - " }\n" - " }\n" - "}\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "__kernel\n" - "void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n" - " unsigned int nSrc, unsigned int nDst)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX+1;\n" - " if( gIdx < nSrc+1 )\n" - " {\n" - " SortData first; first.m_key = 0; first.m_value = 0;\n" - " SortData end; end.m_key = nDst; end.m_value = nDst;\n" - " SortData iData = src[gIdx-1];\n" - " SortData jData = (gIdx==nSrc)? end: src[gIdx];\n" - " if( iData.m_key != jData.m_key )\n" - " {\n" - " u32 k = iData.m_key;\n" - " {\n" - " dst[k] = gIdx;\n" - " }\n" - " }\n" - " }\n" - "}\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "__kernel\n" - "void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n" - " unsigned int nSrc, unsigned int nDst)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - " \n" - " if( gIdx < nDst )\n" - " {\n" - " C[gIdx] = A[gIdx] - B[gIdx];\n" - " }\n" - "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl deleted file mode 100644 index 2eee5752ec..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernels.cl +++ /dev/null @@ -1,128 +0,0 @@ -/* -Copyright (c) 2012 Advanced Micro Devices, Inc. - -This software is provided 'as-is', without any express or implied warranty. -In no event will the authors be held liable for any damages arising from the use of this software. -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it freely, -subject to the following restrictions: - -1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. -3. This notice may not be removed or altered from any source distribution. -*/ -//Originally written by Takahiro Harada - -#pragma OPENCL EXTENSION cl_amd_printf : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - -typedef unsigned int u32; -#define GET_GROUP_IDX get_group_id(0) -#define GET_LOCAL_IDX get_local_id(0) -#define GET_GLOBAL_IDX get_global_id(0) -#define GET_GROUP_SIZE get_local_size(0) -#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) -#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) -#define AtomInc(x) atom_inc(&(x)) -#define AtomInc1(x, out) out = atom_inc(&(x)) - -#define make_uint4 (uint4) -#define make_uint2 (uint2) -#define make_int2 (int2) - -typedef struct -{ - int m_n; - int m_padding[3]; -} ConstBuffer; - - - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void Copy1F4Kernel(__global float4* dst, __global float4* src, - ConstBuffer cb) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < cb.m_n ) - { - float4 a0 = src[gIdx]; - - dst[ gIdx ] = a0; - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void Copy2F4Kernel(__global float4* dst, __global float4* src, - ConstBuffer cb) -{ - int gIdx = GET_GLOBAL_IDX; - - if( 2*gIdx <= cb.m_n ) - { - float4 a0 = src[gIdx*2+0]; - float4 a1 = src[gIdx*2+1]; - - dst[ gIdx*2+0 ] = a0; - dst[ gIdx*2+1 ] = a1; - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void Copy4F4Kernel(__global float4* dst, __global float4* src, - ConstBuffer cb) -{ - int gIdx = GET_GLOBAL_IDX; - - if( 4*gIdx <= cb.m_n ) - { - int idx0 = gIdx*4+0; - int idx1 = gIdx*4+1; - int idx2 = gIdx*4+2; - int idx3 = gIdx*4+3; - - float4 a0 = src[idx0]; - float4 a1 = src[idx1]; - float4 a2 = src[idx2]; - float4 a3 = src[idx3]; - - dst[ idx0 ] = a0; - dst[ idx1 ] = a1; - dst[ idx2 ] = a2; - dst[ idx3 ] = a3; - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void CopyF1Kernel(__global float* dstF1, __global float* srcF1, - ConstBuffer cb) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < cb.m_n ) - { - float a0 = srcF1[gIdx]; - - dstF1[ gIdx ] = a0; - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, - ConstBuffer cb) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < cb.m_n ) - { - float2 a0 = srcF2[gIdx]; - - dstF2[ gIdx ] = a0; - } -} - diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h deleted file mode 100644 index 33c9279462..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h +++ /dev/null @@ -1,131 +0,0 @@ -//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* copyKernelsCL = - "/*\n" - "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" - "\n" - "This software is provided 'as-is', without any express or implied warranty.\n" - "In no event will the authors be held liable for any damages arising from the use of this software.\n" - "Permission is granted to anyone to use this software for any purpose, \n" - "including commercial applications, and to alter it and redistribute it freely, \n" - "subject to the following restrictions:\n" - "\n" - "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" - "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" - "3. This notice may not be removed or altered from any source distribution.\n" - "*/\n" - "//Originally written by Takahiro Harada\n" - "\n" - "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" - "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" - "\n" - "typedef unsigned int u32;\n" - "#define GET_GROUP_IDX get_group_id(0)\n" - "#define GET_LOCAL_IDX get_local_id(0)\n" - "#define GET_GLOBAL_IDX get_global_id(0)\n" - "#define GET_GROUP_SIZE get_local_size(0)\n" - "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" - "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" - "#define AtomInc(x) atom_inc(&(x))\n" - "#define AtomInc1(x, out) out = atom_inc(&(x))\n" - "\n" - "#define make_uint4 (uint4)\n" - "#define make_uint2 (uint2)\n" - "#define make_int2 (int2)\n" - "\n" - "typedef struct\n" - "{\n" - " int m_n;\n" - " int m_padding[3];\n" - "} ConstBuffer;\n" - "\n" - "\n" - "\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void Copy1F4Kernel(__global float4* dst, __global float4* src, \n" - " ConstBuffer cb)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - "\n" - " if( gIdx < cb.m_n )\n" - " {\n" - " float4 a0 = src[gIdx];\n" - "\n" - " dst[ gIdx ] = a0;\n" - " }\n" - "}\n" - "\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void Copy2F4Kernel(__global float4* dst, __global float4* src, \n" - " ConstBuffer cb)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - "\n" - " if( 2*gIdx <= cb.m_n )\n" - " {\n" - " float4 a0 = src[gIdx*2+0];\n" - " float4 a1 = src[gIdx*2+1];\n" - "\n" - " dst[ gIdx*2+0 ] = a0;\n" - " dst[ gIdx*2+1 ] = a1;\n" - " }\n" - "}\n" - "\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void Copy4F4Kernel(__global float4* dst, __global float4* src, \n" - " ConstBuffer cb)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - "\n" - " if( 4*gIdx <= cb.m_n )\n" - " {\n" - " int idx0 = gIdx*4+0;\n" - " int idx1 = gIdx*4+1;\n" - " int idx2 = gIdx*4+2;\n" - " int idx3 = gIdx*4+3;\n" - "\n" - " float4 a0 = src[idx0];\n" - " float4 a1 = src[idx1];\n" - " float4 a2 = src[idx2];\n" - " float4 a3 = src[idx3];\n" - "\n" - " dst[ idx0 ] = a0;\n" - " dst[ idx1 ] = a1;\n" - " dst[ idx2 ] = a2;\n" - " dst[ idx3 ] = a3;\n" - " }\n" - "}\n" - "\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n" - " ConstBuffer cb)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - "\n" - " if( gIdx < cb.m_n )\n" - " {\n" - " float a0 = srcF1[gIdx];\n" - "\n" - " dstF1[ gIdx ] = a0;\n" - " }\n" - "}\n" - "\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n" - " ConstBuffer cb)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - "\n" - " if( gIdx < cb.m_n )\n" - " {\n" - " float2 a0 = srcF2[gIdx];\n" - "\n" - " dstF2[ gIdx ] = a0;\n" - " }\n" - "}\n" - "\n" - "\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl deleted file mode 100644 index 71c31075dd..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernels.cl +++ /dev/null @@ -1,107 +0,0 @@ -/* -Copyright (c) 2012 Advanced Micro Devices, Inc. - -This software is provided 'as-is', without any express or implied warranty. -In no event will the authors be held liable for any damages arising from the use of this software. -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it freely, -subject to the following restrictions: - -1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. -3. This notice may not be removed or altered from any source distribution. -*/ -//Originally written by Takahiro Harada - - -#pragma OPENCL EXTENSION cl_amd_printf : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - -typedef unsigned int u32; -#define GET_GROUP_IDX get_group_id(0) -#define GET_LOCAL_IDX get_local_id(0) -#define GET_GLOBAL_IDX get_global_id(0) -#define GET_GROUP_SIZE get_local_size(0) -#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) -#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) -#define AtomInc(x) atom_inc(&(x)) -#define AtomInc1(x, out) out = atom_inc(&(x)) - -#define make_uint4 (uint4) -#define make_uint2 (uint2) -#define make_int2 (int2) - -typedef struct -{ - union - { - int4 m_data; - uint4 m_unsignedData; - float m_floatData; - }; - int m_offset; - int m_n; - int m_padding[2]; -} ConstBuffer; - - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < num_elements ) - { - dstInt[ offset+gIdx ] = value; - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < num_elements ) - { - dstFloat[ offset+gIdx ] = value; - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < num ) - { - dstInt[ offset+gIdx ] = value; - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < num ) - { - dstInt2[ gIdx + offset] = make_int2( value.x, value.y ); - } -} - -__kernel -__attribute__((reqd_work_group_size(64,1,1))) -void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset) -{ - int gIdx = GET_GLOBAL_IDX; - - if( gIdx < num ) - { - dstInt4[ offset+gIdx ] = value; - } -} - diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h deleted file mode 100644 index 983e652270..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h +++ /dev/null @@ -1,90 +0,0 @@ -//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* fillKernelsCL = - "/*\n" - "Copyright (c) 2012 Advanced Micro Devices, Inc. \n" - "This software is provided 'as-is', without any express or implied warranty.\n" - "In no event will the authors be held liable for any damages arising from the use of this software.\n" - "Permission is granted to anyone to use this software for any purpose, \n" - "including commercial applications, and to alter it and redistribute it freely, \n" - "subject to the following restrictions:\n" - "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" - "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" - "3. This notice may not be removed or altered from any source distribution.\n" - "*/\n" - "//Originally written by Takahiro Harada\n" - "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" - "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" - "typedef unsigned int u32;\n" - "#define GET_GROUP_IDX get_group_id(0)\n" - "#define GET_LOCAL_IDX get_local_id(0)\n" - "#define GET_GLOBAL_IDX get_global_id(0)\n" - "#define GET_GROUP_SIZE get_local_size(0)\n" - "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" - "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" - "#define AtomInc(x) atom_inc(&(x))\n" - "#define AtomInc1(x, out) out = atom_inc(&(x))\n" - "#define make_uint4 (uint4)\n" - "#define make_uint2 (uint2)\n" - "#define make_int2 (int2)\n" - "typedef struct\n" - "{\n" - " union\n" - " {\n" - " int4 m_data;\n" - " uint4 m_unsignedData;\n" - " float m_floatData;\n" - " };\n" - " int m_offset;\n" - " int m_n;\n" - " int m_padding[2];\n" - "} ConstBuffer;\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - " if( gIdx < num_elements )\n" - " {\n" - " dstInt[ offset+gIdx ] = value;\n" - " }\n" - "}\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - " if( gIdx < num_elements )\n" - " {\n" - " dstFloat[ offset+gIdx ] = value;\n" - " }\n" - "}\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - " if( gIdx < num )\n" - " {\n" - " dstInt[ offset+gIdx ] = value;\n" - " }\n" - "}\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - " if( gIdx < num )\n" - " {\n" - " dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n" - " }\n" - "}\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(64,1,1)))\n" - "void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n" - "{\n" - " int gIdx = GET_GLOBAL_IDX;\n" - " if( gIdx < num )\n" - " {\n" - " dstInt4[ offset+gIdx ] = value;\n" - " }\n" - "}\n"; diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl deleted file mode 100644 index c9da79854a..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanFloat4Kernels.cl +++ /dev/null @@ -1,154 +0,0 @@ -/* -Copyright (c) 2012 Advanced Micro Devices, Inc. - -This software is provided 'as-is', without any express or implied warranty. -In no event will the authors be held liable for any damages arising from the use of this software. -Permission is granted to anyone to use this software for any purpose, -including commercial applications, and to alter it and redistribute it freely, -subject to the following restrictions: - -1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. -2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. -3. This notice may not be removed or altered from any source distribution. -*/ -//Originally written by Takahiro Harada - - -typedef unsigned int u32; -#define GET_GROUP_IDX get_group_id(0) -#define GET_LOCAL_IDX get_local_id(0) -#define GET_GLOBAL_IDX get_global_id(0) -#define GET_GROUP_SIZE get_local_size(0) -#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) - -// takahiro end -#define WG_SIZE 128 -#define m_numElems x -#define m_numBlocks y -#define m_numScanBlocks z - -/*typedef struct -{ - uint m_numElems; - uint m_numBlocks; - uint m_numScanBlocks; - uint m_padding[1]; -} ConstBuffer; -*/ - -float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize) -{ - float4 blocksum; - int offset = 1; - for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1) - { - GROUP_LDS_BARRIER; - for(int iIdx=lIdx; iIdx>= 1; - for(int nActive=1; nActive>=1 ) - { - GROUP_LDS_BARRIER; - for( int iIdx = lIdx; iIdx>1; nActive>0; nActive>>=1, offset<<=1) - { - GROUP_LDS_BARRIER; - for(int iIdx=lIdx; iIdx>= 1; - for(int nActive=1; nActive>=1 ) - { - GROUP_LDS_BARRIER; - for( int iIdx = lIdx; iIdx>1; nActive>0; nActive>>=1, offset<<=1)\n" - " {\n" - " GROUP_LDS_BARRIER;\n" - " for(int iIdx=lIdx; iIdx>= 1;\n" - " for(int nActive=1; nActive>=1 )\n" - " {\n" - " GROUP_LDS_BARRIER;\n" - " for( int iIdx = lIdx; iIdx>1; nActive>0; nActive>>=1, offset<<=1)\n" - " {\n" - " GROUP_LDS_BARRIER;\n" - " for(int iIdx=lIdx; iIdx>= 1;\n" - " for(int nActive=1; nActive>=1 )\n" - " {\n" - " GROUP_LDS_BARRIER;\n" - " for( int iIdx = lIdx; iIdx 64 ) - { - sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; - GROUP_MEM_FENCE; - } - - sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; - GROUP_MEM_FENCE; - } -#else - if( lIdx < 64 ) - { - sorterSharedMemory[idx] += sorterSharedMemory[idx-1]; - GROUP_MEM_FENCE; - sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; - GROUP_MEM_FENCE; - sorterSharedMemory[idx] += sorterSharedMemory[idx-4]; - GROUP_MEM_FENCE; - sorterSharedMemory[idx] += sorterSharedMemory[idx-8]; - GROUP_MEM_FENCE; - sorterSharedMemory[idx] += sorterSharedMemory[idx-16]; - GROUP_MEM_FENCE; - sorterSharedMemory[idx] += sorterSharedMemory[idx-32]; - GROUP_MEM_FENCE; - if( wgSize > 64 ) - { - sorterSharedMemory[idx] += sorterSharedMemory[idx-64]; - GROUP_MEM_FENCE; - } - - sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2]; - GROUP_MEM_FENCE; - } -#endif - } - - GROUP_LDS_BARRIER; - - *totalSum = sorterSharedMemory[wgSize*2-1]; - u32 addValue = sorterSharedMemory[lIdx+wgSize-1]; - return addValue; -} - -//__attribute__((reqd_work_group_size(128,1,1))) -uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) -{ - u32 s4 = prefixScanVectorEx( &pData ); - u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 ); - return pData + make_uint4( rank, rank, rank, rank ); -} - - -//__attribute__((reqd_work_group_size(64,1,1))) -uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory ) -{ - u32 s4 = prefixScanVectorEx( &pData ); - u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 ); - return pData + make_uint4( rank, rank, rank, rank ); -} - -u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;} - -u32 bit8Scan(u32 v) -{ - return (v<<8) + (v<<16) + (v<<24); -} - -//=== - - - - -#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx] - - -__kernel -__attribute__((reqd_work_group_size(WG_SIZE,1,1))) -void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb ) -{ - __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE]; - - u32 gIdx = GET_GLOBAL_IDX; - u32 lIdx = GET_LOCAL_IDX; - u32 wgIdx = GET_GROUP_IDX; - u32 wgSize = GET_GROUP_SIZE; - const int startBit = cb.m_startBit; - const int n = cb.m_n; - const int nWGs = cb.m_nWGs; - const int nBlocksPerWG = cb.m_nBlocksPerWG; - - for(int i=0; i>startBit) & 0xf; -#if defined(NV_GPU) - MY_HISTOGRAM( localKey )++; -#else - AtomInc( MY_HISTOGRAM( localKey ) ); -#endif - } - } - } - - GROUP_LDS_BARRIER; - - if( lIdx < NUM_BUCKET ) - { - u32 sum = 0; - for(int i=0; i>startBit) & 0xf; -#if defined(NV_GPU) - MY_HISTOGRAM( localKey )++; -#else - AtomInc( MY_HISTOGRAM( localKey ) ); -#endif - } - } - } - - GROUP_LDS_BARRIER; - - if( lIdx < NUM_BUCKET ) - { - u32 sum = 0; - for(int i=0; i>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask ); - uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) ); - u32 total; - prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData ); - { - uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3); - uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total ); - dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) ); - - GROUP_LDS_BARRIER; - - ldsSortData[dstAddr.x] = sortData[0]; - ldsSortData[dstAddr.y] = sortData[1]; - ldsSortData[dstAddr.z] = sortData[2]; - ldsSortData[dstAddr.w] = sortData[3]; - - GROUP_LDS_BARRIER; - - sortData[0] = ldsSortData[localAddr.x]; - sortData[1] = ldsSortData[localAddr.y]; - sortData[2] = ldsSortData[localAddr.z]; - sortData[3] = ldsSortData[localAddr.w]; - - GROUP_LDS_BARRIER; - } - } -} - -// 2 scan, 2 exchange -void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData) -{ - for(uint ibit=0; ibit>(startBit+ibit)) & 0x3, - (sortData[1]>>(startBit+ibit)) & 0x3, - (sortData[2]>>(startBit+ibit)) & 0x3, - (sortData[3]>>(startBit+ibit)) & 0x3); - - u32 key4; - u32 sKeyPacked[4] = { 0, 0, 0, 0 }; - { - sKeyPacked[0] |= 1<<(8*b.x); - sKeyPacked[1] |= 1<<(8*b.y); - sKeyPacked[2] |= 1<<(8*b.z); - sKeyPacked[3] |= 1<<(8*b.w); - - key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; - } - - u32 rankPacked; - u32 sumPacked; - { - rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); - } - - GROUP_LDS_BARRIER; - - u32 newOffset[4] = { 0,0,0,0 }; - { - u32 sumScanned = bit8Scan( sumPacked ); - - u32 scannedKeys[4]; - scannedKeys[0] = 1<<(8*b.x); - scannedKeys[1] = 1<<(8*b.y); - scannedKeys[2] = 1<<(8*b.z); - scannedKeys[3] = 1<<(8*b.w); - { // 4 scans at once - u32 sum4 = 0; - for(int ie=0; ie<4; ie++) - { - u32 tmp = scannedKeys[ie]; - scannedKeys[ie] = sum4; - sum4 += tmp; - } - } - - { - u32 sumPlusRank = sumScanned + rankPacked; - { u32 ie = b.x; - scannedKeys[0] += sumPlusRank; - newOffset[0] = unpack4Key( scannedKeys[0], ie ); - } - { u32 ie = b.y; - scannedKeys[1] += sumPlusRank; - newOffset[1] = unpack4Key( scannedKeys[1], ie ); - } - { u32 ie = b.z; - scannedKeys[2] += sumPlusRank; - newOffset[2] = unpack4Key( scannedKeys[2], ie ); - } - { u32 ie = b.w; - scannedKeys[3] += sumPlusRank; - newOffset[3] = unpack4Key( scannedKeys[3], ie ); - } - } - } - - - GROUP_LDS_BARRIER; - - { - ldsSortData[newOffset[0]] = sortData[0]; - ldsSortData[newOffset[1]] = sortData[1]; - ldsSortData[newOffset[2]] = sortData[2]; - ldsSortData[newOffset[3]] = sortData[3]; - - GROUP_LDS_BARRIER; - - u32 dstAddr = 4*lIdx; - sortData[0] = ldsSortData[dstAddr+0]; - sortData[1] = ldsSortData[dstAddr+1]; - sortData[2] = ldsSortData[dstAddr+2]; - sortData[3] = ldsSortData[dstAddr+3]; - - GROUP_LDS_BARRIER; - } - } -} - -#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key] - -__kernel -__attribute__((reqd_work_group_size(WG_SIZE,1,1))) -void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) -{ - __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; - __local u32 localHistogramToCarry[NUM_BUCKET]; - __local u32 localHistogram[NUM_BUCKET*2]; - - u32 gIdx = GET_GLOBAL_IDX; - u32 lIdx = GET_LOCAL_IDX; - u32 wgIdx = GET_GROUP_IDX; - u32 wgSize = GET_GROUP_SIZE; - - const int n = cb.m_n; - const int nWGs = cb.m_nWGs; - const int startBit = cb.m_startBit; - const int nBlocksPerWG = cb.m_nBlocksPerWG; - - if( lIdx < (NUM_BUCKET) ) - { - localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; - } - - GROUP_LDS_BARRIER; - - const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; - - int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; - - int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; - - for(int iblock=0; iblock>startBit) & 0xf; - - { // create histogram - u32 setIdx = lIdx/16; - if( lIdx < NUM_BUCKET ) - { - localHistogram[lIdx] = 0; - } - ldsSortData[lIdx] = 0; - GROUP_LDS_BARRIER; - - for(int i=0; i>(startBit+ibit)) & 0x3, - (sortData[1]>>(startBit+ibit)) & 0x3, - (sortData[2]>>(startBit+ibit)) & 0x3, - (sortData[3]>>(startBit+ibit)) & 0x3); - - u32 key4; - u32 sKeyPacked[4] = { 0, 0, 0, 0 }; - { - sKeyPacked[0] |= 1<<(8*b.x); - sKeyPacked[1] |= 1<<(8*b.y); - sKeyPacked[2] |= 1<<(8*b.z); - sKeyPacked[3] |= 1<<(8*b.w); - - key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3]; - } - - u32 rankPacked; - u32 sumPacked; - { - rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE ); - } - - GROUP_LDS_BARRIER; - - u32 newOffset[4] = { 0,0,0,0 }; - { - u32 sumScanned = bit8Scan( sumPacked ); - - u32 scannedKeys[4]; - scannedKeys[0] = 1<<(8*b.x); - scannedKeys[1] = 1<<(8*b.y); - scannedKeys[2] = 1<<(8*b.z); - scannedKeys[3] = 1<<(8*b.w); - { // 4 scans at once - u32 sum4 = 0; - for(int ie=0; ie<4; ie++) - { - u32 tmp = scannedKeys[ie]; - scannedKeys[ie] = sum4; - sum4 += tmp; - } - } - - { - u32 sumPlusRank = sumScanned + rankPacked; - { u32 ie = b.x; - scannedKeys[0] += sumPlusRank; - newOffset[0] = unpack4Key( scannedKeys[0], ie ); - } - { u32 ie = b.y; - scannedKeys[1] += sumPlusRank; - newOffset[1] = unpack4Key( scannedKeys[1], ie ); - } - { u32 ie = b.z; - scannedKeys[2] += sumPlusRank; - newOffset[2] = unpack4Key( scannedKeys[2], ie ); - } - { u32 ie = b.w; - scannedKeys[3] += sumPlusRank; - newOffset[3] = unpack4Key( scannedKeys[3], ie ); - } - } - } - - - GROUP_LDS_BARRIER; - - { - ldsSortData[newOffset[0]] = sortData[0]; - ldsSortData[newOffset[1]] = sortData[1]; - ldsSortData[newOffset[2]] = sortData[2]; - ldsSortData[newOffset[3]] = sortData[3]; - - ldsSortVal[newOffset[0]] = sortVal[0]; - ldsSortVal[newOffset[1]] = sortVal[1]; - ldsSortVal[newOffset[2]] = sortVal[2]; - ldsSortVal[newOffset[3]] = sortVal[3]; - - GROUP_LDS_BARRIER; - - u32 dstAddr = 4*lIdx; - sortData[0] = ldsSortData[dstAddr+0]; - sortData[1] = ldsSortData[dstAddr+1]; - sortData[2] = ldsSortData[dstAddr+2]; - sortData[3] = ldsSortData[dstAddr+3]; - - sortVal[0] = ldsSortVal[dstAddr+0]; - sortVal[1] = ldsSortVal[dstAddr+1]; - sortVal[2] = ldsSortVal[dstAddr+2]; - sortVal[3] = ldsSortVal[dstAddr+3]; - - GROUP_LDS_BARRIER; - } - } -} - - - - -__kernel -__attribute__((reqd_work_group_size(WG_SIZE,1,1))) -void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb) -{ - __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; - __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16]; - __local u32 localHistogramToCarry[NUM_BUCKET]; - __local u32 localHistogram[NUM_BUCKET*2]; - - u32 gIdx = GET_GLOBAL_IDX; - u32 lIdx = GET_LOCAL_IDX; - u32 wgIdx = GET_GROUP_IDX; - u32 wgSize = GET_GROUP_SIZE; - - const int n = cb.m_n; - const int nWGs = cb.m_nWGs; - const int startBit = cb.m_startBit; - const int nBlocksPerWG = cb.m_nBlocksPerWG; - - if( lIdx < (NUM_BUCKET) ) - { - localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx]; - } - - GROUP_LDS_BARRIER; - - - const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE; - - int nBlocks = n/blockSize - nBlocksPerWG*wgIdx; - - int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx; - - for(int iblock=0; iblock>startBit) & 0xf; - - { // create histogram - u32 setIdx = lIdx/16; - if( lIdx < NUM_BUCKET ) - { - localHistogram[lIdx] = 0; - } - ldsSortData[lIdx] = 0; - GROUP_LDS_BARRIER; - - for(int i=0; i0) - return; - - for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1 - gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; - counter[tableIdx] ++; - } - } - } - } - -} - - -__kernel -__attribute__((reqd_work_group_size(WG_SIZE,1,1))) -void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb ) -{ - - u32 gIdx = GET_GLOBAL_IDX; - u32 realLocalIdx = GET_LOCAL_IDX; - u32 wgIdx = GET_GROUP_IDX; - u32 wgSize = GET_GROUP_SIZE; - const int startBit = cb.m_startBit; - const int n = cb.m_n; - const int nWGs = cb.m_nWGs; - const int nBlocksPerWG = cb.m_nBlocksPerWG; - - int counter[NUM_BUCKET]; - - if (realLocalIdx>0) - return; - - for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1 - gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i]; - counter[tableIdx] ++; - } - } - } - } - -} \ No newline at end of file diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h deleted file mode 100644 index fb4bdda303..0000000000 --- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h +++ /dev/null @@ -1,909 +0,0 @@ -//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project -static const char* radixSort32KernelsCL = - "/*\n" - "Bullet Continuous Collision Detection and Physics Library\n" - "Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org\n" - "This software is provided 'as-is', without any express or implied warranty.\n" - "In no event will the authors be held liable for any damages arising from the use of this software.\n" - "Permission is granted to anyone to use this software for any purpose, \n" - "including commercial applications, and to alter it and redistribute it freely, \n" - "subject to the following restrictions:\n" - "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" - "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" - "3. This notice may not be removed or altered from any source distribution.\n" - "*/\n" - "//Author Takahiro Harada\n" - "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" - "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" - "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" - "typedef unsigned int u32;\n" - "#define GET_GROUP_IDX get_group_id(0)\n" - "#define GET_LOCAL_IDX get_local_id(0)\n" - "#define GET_GLOBAL_IDX get_global_id(0)\n" - "#define GET_GROUP_SIZE get_local_size(0)\n" - "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" - "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" - "#define AtomInc(x) atom_inc(&(x))\n" - "#define AtomInc1(x, out) out = atom_inc(&(x))\n" - "#define AtomAdd(x, value) atom_add(&(x), value)\n" - "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" - "#define make_uint4 (uint4)\n" - "#define make_uint2 (uint2)\n" - "#define make_int2 (int2)\n" - "#define WG_SIZE 64\n" - "#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n" - "#define BITS_PER_PASS 4\n" - "#define NUM_BUCKET (1< 64 )\n" - " {\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" - " GROUP_MEM_FENCE;\n" - " }\n" - " sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" - " GROUP_MEM_FENCE;\n" - " }\n" - "#else\n" - " if( lIdx < 64 )\n" - " {\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n" - " GROUP_MEM_FENCE;\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-2]; \n" - " GROUP_MEM_FENCE;\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n" - " GROUP_MEM_FENCE;\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n" - " GROUP_MEM_FENCE;\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n" - " GROUP_MEM_FENCE;\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n" - " GROUP_MEM_FENCE;\n" - " if( wgSize > 64 )\n" - " {\n" - " sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n" - " GROUP_MEM_FENCE;\n" - " }\n" - " sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n" - " GROUP_MEM_FENCE;\n" - " }\n" - "#endif\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " *totalSum = sorterSharedMemory[wgSize*2-1];\n" - " u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n" - " return addValue;\n" - "}\n" - "//__attribute__((reqd_work_group_size(128,1,1)))\n" - "uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" - "{\n" - " u32 s4 = prefixScanVectorEx( &pData );\n" - " u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n" - " return pData + make_uint4( rank, rank, rank, rank );\n" - "}\n" - "//__attribute__((reqd_work_group_size(64,1,1)))\n" - "uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n" - "{\n" - " u32 s4 = prefixScanVectorEx( &pData );\n" - " u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n" - " return pData + make_uint4( rank, rank, rank, rank );\n" - "}\n" - "u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n" - "u32 bit8Scan(u32 v)\n" - "{\n" - " return (v<<8) + (v<<16) + (v<<24);\n" - "}\n" - "//===\n" - "#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" - "void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n" - "{\n" - " __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n" - " u32 gIdx = GET_GLOBAL_IDX;\n" - " u32 lIdx = GET_LOCAL_IDX;\n" - " u32 wgIdx = GET_GROUP_IDX;\n" - " u32 wgSize = GET_GROUP_SIZE;\n" - " const int startBit = cb.m_startBit;\n" - " const int n = cb.m_n;\n" - " const int nWGs = cb.m_nWGs;\n" - " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" - " for(int i=0; i>startBit) & 0xf;\n" - "#if defined(NV_GPU)\n" - " MY_HISTOGRAM( localKey )++;\n" - "#else\n" - " AtomInc( MY_HISTOGRAM( localKey ) );\n" - "#endif\n" - " }\n" - " }\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " \n" - " if( lIdx < NUM_BUCKET )\n" - " {\n" - " u32 sum = 0;\n" - " for(int i=0; i>startBit) & 0xf;\n" - "#if defined(NV_GPU)\n" - " MY_HISTOGRAM( localKey )++;\n" - "#else\n" - " AtomInc( MY_HISTOGRAM( localKey ) );\n" - "#endif\n" - " }\n" - " }\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " \n" - " if( lIdx < NUM_BUCKET )\n" - " {\n" - " u32 sum = 0;\n" - " for(int i=0; i>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n" - " uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n" - " u32 total;\n" - " prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n" - " {\n" - " uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n" - " uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n" - " dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n" - " GROUP_LDS_BARRIER;\n" - " ldsSortData[dstAddr.x] = sortData[0];\n" - " ldsSortData[dstAddr.y] = sortData[1];\n" - " ldsSortData[dstAddr.z] = sortData[2];\n" - " ldsSortData[dstAddr.w] = sortData[3];\n" - " GROUP_LDS_BARRIER;\n" - " sortData[0] = ldsSortData[localAddr.x];\n" - " sortData[1] = ldsSortData[localAddr.y];\n" - " sortData[2] = ldsSortData[localAddr.z];\n" - " sortData[3] = ldsSortData[localAddr.w];\n" - " GROUP_LDS_BARRIER;\n" - " }\n" - " }\n" - "}\n" - "// 2 scan, 2 exchange\n" - "void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n" - "{\n" - " for(uint ibit=0; ibit>(startBit+ibit)) & 0x3, \n" - " (sortData[1]>>(startBit+ibit)) & 0x3, \n" - " (sortData[2]>>(startBit+ibit)) & 0x3, \n" - " (sortData[3]>>(startBit+ibit)) & 0x3);\n" - " u32 key4;\n" - " u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" - " {\n" - " sKeyPacked[0] |= 1<<(8*b.x);\n" - " sKeyPacked[1] |= 1<<(8*b.y);\n" - " sKeyPacked[2] |= 1<<(8*b.z);\n" - " sKeyPacked[3] |= 1<<(8*b.w);\n" - " key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" - " }\n" - " u32 rankPacked;\n" - " u32 sumPacked;\n" - " {\n" - " rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " u32 newOffset[4] = { 0,0,0,0 };\n" - " {\n" - " u32 sumScanned = bit8Scan( sumPacked );\n" - " u32 scannedKeys[4];\n" - " scannedKeys[0] = 1<<(8*b.x);\n" - " scannedKeys[1] = 1<<(8*b.y);\n" - " scannedKeys[2] = 1<<(8*b.z);\n" - " scannedKeys[3] = 1<<(8*b.w);\n" - " { // 4 scans at once\n" - " u32 sum4 = 0;\n" - " for(int ie=0; ie<4; ie++)\n" - " {\n" - " u32 tmp = scannedKeys[ie];\n" - " scannedKeys[ie] = sum4;\n" - " sum4 += tmp;\n" - " }\n" - " }\n" - " {\n" - " u32 sumPlusRank = sumScanned + rankPacked;\n" - " { u32 ie = b.x;\n" - " scannedKeys[0] += sumPlusRank;\n" - " newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" - " }\n" - " { u32 ie = b.y;\n" - " scannedKeys[1] += sumPlusRank;\n" - " newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" - " }\n" - " { u32 ie = b.z;\n" - " scannedKeys[2] += sumPlusRank;\n" - " newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" - " }\n" - " { u32 ie = b.w;\n" - " scannedKeys[3] += sumPlusRank;\n" - " newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" - " }\n" - " }\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " {\n" - " ldsSortData[newOffset[0]] = sortData[0];\n" - " ldsSortData[newOffset[1]] = sortData[1];\n" - " ldsSortData[newOffset[2]] = sortData[2];\n" - " ldsSortData[newOffset[3]] = sortData[3];\n" - " GROUP_LDS_BARRIER;\n" - " u32 dstAddr = 4*lIdx;\n" - " sortData[0] = ldsSortData[dstAddr+0];\n" - " sortData[1] = ldsSortData[dstAddr+1];\n" - " sortData[2] = ldsSortData[dstAddr+2];\n" - " sortData[3] = ldsSortData[dstAddr+3];\n" - " GROUP_LDS_BARRIER;\n" - " }\n" - " }\n" - "}\n" - "#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" - "void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" - "{\n" - " __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" - " __local u32 localHistogramToCarry[NUM_BUCKET];\n" - " __local u32 localHistogram[NUM_BUCKET*2];\n" - " u32 gIdx = GET_GLOBAL_IDX;\n" - " u32 lIdx = GET_LOCAL_IDX;\n" - " u32 wgIdx = GET_GROUP_IDX;\n" - " u32 wgSize = GET_GROUP_SIZE;\n" - " const int n = cb.m_n;\n" - " const int nWGs = cb.m_nWGs;\n" - " const int startBit = cb.m_startBit;\n" - " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" - " if( lIdx < (NUM_BUCKET) )\n" - " {\n" - " localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" - " int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" - " int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" - " for(int iblock=0; iblock>startBit) & 0xf;\n" - " { // create histogram\n" - " u32 setIdx = lIdx/16;\n" - " if( lIdx < NUM_BUCKET )\n" - " {\n" - " localHistogram[lIdx] = 0;\n" - " }\n" - " ldsSortData[lIdx] = 0;\n" - " GROUP_LDS_BARRIER;\n" - " for(int i=0; i>(startBit+ibit)) & 0x3, \n" - " (sortData[1]>>(startBit+ibit)) & 0x3, \n" - " (sortData[2]>>(startBit+ibit)) & 0x3, \n" - " (sortData[3]>>(startBit+ibit)) & 0x3);\n" - " u32 key4;\n" - " u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n" - " {\n" - " sKeyPacked[0] |= 1<<(8*b.x);\n" - " sKeyPacked[1] |= 1<<(8*b.y);\n" - " sKeyPacked[2] |= 1<<(8*b.z);\n" - " sKeyPacked[3] |= 1<<(8*b.w);\n" - " key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n" - " }\n" - " u32 rankPacked;\n" - " u32 sumPacked;\n" - " {\n" - " rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " u32 newOffset[4] = { 0,0,0,0 };\n" - " {\n" - " u32 sumScanned = bit8Scan( sumPacked );\n" - " u32 scannedKeys[4];\n" - " scannedKeys[0] = 1<<(8*b.x);\n" - " scannedKeys[1] = 1<<(8*b.y);\n" - " scannedKeys[2] = 1<<(8*b.z);\n" - " scannedKeys[3] = 1<<(8*b.w);\n" - " { // 4 scans at once\n" - " u32 sum4 = 0;\n" - " for(int ie=0; ie<4; ie++)\n" - " {\n" - " u32 tmp = scannedKeys[ie];\n" - " scannedKeys[ie] = sum4;\n" - " sum4 += tmp;\n" - " }\n" - " }\n" - " {\n" - " u32 sumPlusRank = sumScanned + rankPacked;\n" - " { u32 ie = b.x;\n" - " scannedKeys[0] += sumPlusRank;\n" - " newOffset[0] = unpack4Key( scannedKeys[0], ie );\n" - " }\n" - " { u32 ie = b.y;\n" - " scannedKeys[1] += sumPlusRank;\n" - " newOffset[1] = unpack4Key( scannedKeys[1], ie );\n" - " }\n" - " { u32 ie = b.z;\n" - " scannedKeys[2] += sumPlusRank;\n" - " newOffset[2] = unpack4Key( scannedKeys[2], ie );\n" - " }\n" - " { u32 ie = b.w;\n" - " scannedKeys[3] += sumPlusRank;\n" - " newOffset[3] = unpack4Key( scannedKeys[3], ie );\n" - " }\n" - " }\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " {\n" - " ldsSortData[newOffset[0]] = sortData[0];\n" - " ldsSortData[newOffset[1]] = sortData[1];\n" - " ldsSortData[newOffset[2]] = sortData[2];\n" - " ldsSortData[newOffset[3]] = sortData[3];\n" - " ldsSortVal[newOffset[0]] = sortVal[0];\n" - " ldsSortVal[newOffset[1]] = sortVal[1];\n" - " ldsSortVal[newOffset[2]] = sortVal[2];\n" - " ldsSortVal[newOffset[3]] = sortVal[3];\n" - " GROUP_LDS_BARRIER;\n" - " u32 dstAddr = 4*lIdx;\n" - " sortData[0] = ldsSortData[dstAddr+0];\n" - " sortData[1] = ldsSortData[dstAddr+1];\n" - " sortData[2] = ldsSortData[dstAddr+2];\n" - " sortData[3] = ldsSortData[dstAddr+3];\n" - " sortVal[0] = ldsSortVal[dstAddr+0];\n" - " sortVal[1] = ldsSortVal[dstAddr+1];\n" - " sortVal[2] = ldsSortVal[dstAddr+2];\n" - " sortVal[3] = ldsSortVal[dstAddr+3];\n" - " GROUP_LDS_BARRIER;\n" - " }\n" - " }\n" - "}\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" - "void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n" - "{\n" - " __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" - " __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n" - " __local u32 localHistogramToCarry[NUM_BUCKET];\n" - " __local u32 localHistogram[NUM_BUCKET*2];\n" - " u32 gIdx = GET_GLOBAL_IDX;\n" - " u32 lIdx = GET_LOCAL_IDX;\n" - " u32 wgIdx = GET_GROUP_IDX;\n" - " u32 wgSize = GET_GROUP_SIZE;\n" - " const int n = cb.m_n;\n" - " const int nWGs = cb.m_nWGs;\n" - " const int startBit = cb.m_startBit;\n" - " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" - " if( lIdx < (NUM_BUCKET) )\n" - " {\n" - " localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n" - " }\n" - " GROUP_LDS_BARRIER;\n" - " \n" - " const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n" - " int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n" - " int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n" - " for(int iblock=0; iblock>startBit) & 0xf;\n" - " { // create histogram\n" - " u32 setIdx = lIdx/16;\n" - " if( lIdx < NUM_BUCKET )\n" - " {\n" - " localHistogram[lIdx] = 0;\n" - " }\n" - " ldsSortData[lIdx] = 0;\n" - " GROUP_LDS_BARRIER;\n" - " for(int i=0; i0)\n" - " return;\n" - " \n" - " for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1\n" - " gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" - " counter[tableIdx] ++;\n" - " }\n" - " }\n" - " }\n" - " }\n" - " \n" - "}\n" - "__kernel\n" - "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" - "void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )\n" - "{\n" - " \n" - " u32 gIdx = GET_GLOBAL_IDX;\n" - " u32 realLocalIdx = GET_LOCAL_IDX;\n" - " u32 wgIdx = GET_GROUP_IDX;\n" - " u32 wgSize = GET_GROUP_SIZE;\n" - " const int startBit = cb.m_startBit;\n" - " const int n = cb.m_n;\n" - " const int nWGs = cb.m_nWGs;\n" - " const int nBlocksPerWG = cb.m_nBlocksPerWG;\n" - " int counter[NUM_BUCKET];\n" - " \n" - " if (realLocalIdx>0)\n" - " return;\n" - " \n" - " for (int c=0;c>startBit) & 0xf;//0xf = NUM_TABLES-1\n" - " gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n" - " counter[tableIdx] ++;\n" - " }\n" - " }\n" - " }\n" - " }\n" - " \n" - "}\n"; -- cgit v1.2.3