// path: root/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
// blob: 975bd80e5307447596652b1c460ab2580d9e22d0 (plain)

#ifndef B3_RADIXSORT32_H
#define B3_RADIXSORT32_H

#include "b3OpenCLArray.h"

/// Pair of unsigned int fields (key and value). Each field lives in its own
/// anonymous union, so it can be accessed under either of two names.
struct b3SortData
{
	// Key field; `x` names the same storage as `m_key`.
	union { unsigned int m_key; unsigned int x; };

	// Value field; `y` names the same storage as `m_value`.
	union { unsigned int m_value; unsigned int y; };
};
#include "b3BufferInfoCL.h"

/// Sorter declared on top of b3OpenCLArray buffers and OpenCL kernels.
/// Only the interface is declared here; the constructor, destructor and the
/// execute()/executeHost() bodies are defined in the implementation file.
class b3RadixSort32CL
{
private:
	// Scratch buffers of plain unsigned ints.
	b3OpenCLArray<unsigned int>* m_workBuffer1;
	b3OpenCLArray<unsigned int>* m_workBuffer2;

	// Scratch buffers of key/value pairs.
	b3OpenCLArray<b3SortData>* m_workBuffer3;
	b3OpenCLArray<b3SortData>* m_workBuffer4;

	// Additional unsigned int scratch buffers.
	// NOTE(review): presumably the keys-only counterparts of m_workBuffer3/4 -- confirm in the .cpp.
	b3OpenCLArray<unsigned int>* m_workBuffer3a;
	b3OpenCLArray<unsigned int>* m_workBuffer4a;

	// Command queue stored by this object (a queue is also a constructor argument).
	cl_command_queue m_commandQueue;

	// Kernel handles; two of them exist in a "SortData" and a plain variant.
	cl_kernel m_streamCountSortDataKernel;
	cl_kernel m_streamCountKernel;
	cl_kernel m_prefixScanKernel;
	cl_kernel m_sortAndScatterSortDataKernel;
	cl_kernel m_sortAndScatterKernel;

	// NOTE(review): presumably set when the OpenCL device is a CPU -- confirm in the .cpp.
	bool m_deviceCPU;

	// Helper objects; their classes are only forward-declared at this point.
	class b3PrefixScanCL* m_scan;
	class b3FillCL* m_fill;

public:
	/// Four-int parameter record.
	/// NOTE(review): presumably handed to the kernels as their constant arguments -- confirm.
	struct b3ConstData
	{
		int m_n;
		int m_nWGs;
		int m_startBit;
		int m_nBlocksPerWG;
	};

	/// Compile-time tuning constants.
	enum
	{
		DATA_ALIGNMENT = 256,
		WG_SIZE = 64,
		BLOCK_SIZE = 256,
		ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),  // 256 / 64 == 4
		BITS_PER_PASS = 4,
		NUM_BUCKET = (1 << BITS_PER_PASS),  // 1 << 4 == 16
		//	if you change this, change nPerWI in kernel as well
		NUM_WGS = 20 * 6,  //	cypress
		//		NUM_WGS = 24*6,	//	cayman
		//		NUM_WGS = 32*4,	//	nv
	};

	/// Construct from an OpenCL context, device and command queue.
	/// @param initialCapacity optional capacity hint, defaults to 0.
	b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);

	virtual ~b3RadixSort32CL();

	/// Variant with separate key and value arrays for input and output.
	/// @param n        element count (NOTE(review): presumably the number of entries to sort -- confirm).
	/// @param sortBits defaults to 32.
	void execute(b3OpenCLArray<unsigned int>& keysIn,
				 b3OpenCLArray<unsigned int>& keysOut,
				 b3OpenCLArray<unsigned int>& valuesIn,
				 b3OpenCLArray<unsigned int>& valuesOut,
				 int n,
				 int sortBits = 32);

	///keys only
	void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);

	/// Variant taking a single in/out array of key/value pairs.
	void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);

	/// "Host" variants for an OpenCL array and for a b3AlignedObjectArray.
	/// NOTE(review): presumably these sort on the CPU rather than via kernels -- confirm in the .cpp.
	void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
	void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif //B3_RADIXSORT32_H