// Source path: root/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
// Source blob: 69caf182d7f270913b2311aaeacc868d1440bd15 (plain)

#ifndef B3_RADIXSORT32_H
#define B3_RADIXSORT32_H

#include "b3OpenCLArray.h"

/// Key/value record with two unsigned int fields, used by the
/// b3SortData overloads of b3RadixSort32CL::execute / executeHost.
///
/// Each field is reachable under two names through an anonymous union:
/// the first as m_key or x, the second as m_value or y.
struct b3SortData
{
	// First field: m_key and x share the same storage.
	union
	{
		unsigned int m_key;
		unsigned int x;
	};

	// Second field: m_value and y share the same storage.
	union
	{
		unsigned int m_value;
		unsigned int y;
	};
};
#include "b3BufferInfoCL.h"

/// Sorter for unsigned int keys (alone, paired with separate value arrays, or
/// packed as b3SortData records) held in b3OpenCLArray buffers; a radix sort,
/// going by the class name and the BITS_PER_PASS / NUM_BUCKET constants.
///
/// Only the declaration lives in this header; construction, destruction and the
/// sorting passes are defined in the implementation file.
class b3RadixSort32CL
{
private:
	// Work buffers held by pointer. How they are allocated, sized and released
	// is decided in the implementation file, not visible from this header.
	// NOTE(review): raw pointer members plus a user-declared destructor --
	// confirm that instances are never copied.
	b3OpenCLArray<unsigned int>* m_workBuffer1;
	b3OpenCLArray<unsigned int>* m_workBuffer2;

	// b3SortData-typed work buffers.
	b3OpenCLArray<b3SortData>* m_workBuffer3;
	b3OpenCLArray<b3SortData>* m_workBuffer4;

	// unsigned-int-typed counterparts of the two buffers above (by naming).
	b3OpenCLArray<unsigned int>* m_workBuffer3a;
	b3OpenCLArray<unsigned int>* m_workBuffer4a;

	// Command queue stored by the sorter; presumably the one passed to the
	// constructor -- TODO confirm.
	cl_command_queue m_commandQueue;

	// Kernel handles. Judging by the names there is one stream-count and one
	// sort-and-scatter kernel per element type (b3SortData vs. plain keys),
	// plus a shared prefix-scan kernel.
	cl_kernel m_streamCountSortDataKernel;
	cl_kernel m_streamCountKernel;

	cl_kernel m_prefixScanKernel;
	cl_kernel m_sortAndScatterSortDataKernel;
	cl_kernel m_sortAndScatterKernel;

	// Presumably set when the target device is a CPU -- TODO confirm in the constructor.
	bool m_deviceCPU;

	// Helper objects of types that are only forward-declared here.
	class b3PrefixScanCL* m_scan;
	class b3FillCL* m_fill;

public:
	/// Four-int parameter record; presumably handed to the kernels once per
	/// pass -- TODO confirm against the implementation.
	struct b3ConstData
	{
		int m_n;
		int m_nWGs;
		int m_startBit;
		int m_nBlocksPerWG;
	};

	/// Compile-time tuning constants.
	enum
	{
		DATA_ALIGNMENT = 256,
		WG_SIZE = 64,
		BLOCK_SIZE = 256,
		ELEMENTS_PER_WORK_ITEM = BLOCK_SIZE / WG_SIZE,  // 256 / 64 == 4
		BITS_PER_PASS = 4,
		NUM_BUCKET = 1 << BITS_PER_PASS,  // 16 buckets for 4 bits per pass
		//	if you change this, change nPerWI in kernel as well
		NUM_WGS = 20 * 6  //	cypress
		//	NUM_WGS = 24*6,	//	cayman
		//	NUM_WGS = 32*4,	//	nv
	};

	/// @param ctx              OpenCL context.
	/// @param device           OpenCL device.
	/// @param queue            OpenCL command queue.
	/// @param initialCapacity  Defaults to 0; presumably an initial size hint
	///                         for the work buffers -- TODO confirm.
	b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);

	virtual ~b3RadixSort32CL();

	/// Keys and values supplied as separate in/out arrays.
	/// @param n         Presumably the number of elements to sort -- TODO confirm.
	/// @param sortBits  Defaults to 32; presumably the number of key bits to sort on.
	void execute(
		b3OpenCLArray<unsigned int>& keysIn,
		b3OpenCLArray<unsigned int>& keysOut,
		b3OpenCLArray<unsigned int>& valuesIn,
		b3OpenCLArray<unsigned int>& valuesOut,
		int n,
		int sortBits = 32);

	///keys only
	void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);

	/// Key/value records in a single in/out array.
	void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);

	/// Host-named variants taking either an OpenCL array or a plain aligned array;
	/// presumably these sort on the CPU -- TODO confirm.
	void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
	void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif  //B3_RADIXSORT32_H