20 files changed, 2449 insertions, 2603 deletions
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp
index a4980f71e1..c0e11bfb26 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp
@@ -19,149 +19,139 @@ subject to the following restrictions:
 #define KERNEL1 "SearchSortDataUpperKernel"
 #define KERNEL2 "SubtractKernel"
 
-
 #include "b3BoundSearchCL.h"
 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "b3LauncherCL.h"
 #include "kernels/BoundSearchKernelsCL.h"
 
 b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
-	:m_context(ctx),
-	m_device(device),
-	m_queue(queue)
+	: m_context(ctx),
+	  m_device(device),
+	  m_queue(queue)
 {
-
 	const char* additionalMacros = "";
 	//const char* srcFileNameForCaching="";
 
 	cl_int pErrNum;
 	const char* kernelSource = boundSearchKernelsCL;
 
-	cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
+	cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, BOUNDSEARCH_PATH);
 	b3Assert(boundSearchProg);
 
-	m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
-	b3Assert(m_lowerSortDataKernel );
+	m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg, additionalMacros);
+	b3Assert(m_lowerSortDataKernel);
 
-	m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
+	m_upperSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg, additionalMacros);
 	b3Assert(m_upperSortDataKernel);
 
 	m_subtractKernel = 0;
 
-	if( maxSize )
+	if (maxSize)
 	{
-		m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
+		m_subtractKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg, additionalMacros);
 		b3Assert(m_subtractKernel);
 	}
 
 	//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
-	
-	m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize );
-	m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize );
 
-	m_filler = new b3FillCL(ctx,device,queue);
+	m_lower = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
+	m_upper = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
+
+	m_filler = new b3FillCL(ctx, device, queue);
 }
 
 b3BoundSearchCL::~b3BoundSearchCL()
 {
-	
 	delete m_lower;
 	delete m_upper;
 	delete m_filler;
-			
+
 	clReleaseKernel(m_lowerSortDataKernel);
 	clReleaseKernel(m_upperSortDataKernel);
 	clReleaseKernel(m_subtractKernel);
-	
-
 }
 
-
-void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option )
+void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option)
 {
 	b3Int4 constBuffer;
 	constBuffer.x = nSrc;
 	constBuffer.y = nDst;
 
-	if( option == BOUND_LOWER )
+	if (option == BOUND_LOWER)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) };
-
-		b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( nSrc );
-        launcher.setConst( nDst );
-        
-		launcher.launch1D( nSrc, 64 );
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_lowerSortDataKernel, "m_lowerSortDataKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(nSrc);
+		launcher.setConst(nDst);
+
+		launcher.launch1D(nSrc, 64);
 	}
-	else if( option == BOUND_UPPER )
+	else if (option == BOUND_UPPER)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
 
-		b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-        launcher.setConst( nSrc );
-        launcher.setConst( nDst );
+		b3LauncherCL launcher(m_queue, m_upperSortDataKernel, "m_upperSortDataKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(nSrc);
+		launcher.setConst(nDst);
 
-		launcher.launch1D( nSrc, 64 );
+		launcher.launch1D(nSrc, 64);
 	}
-	else if( option == COUNT )
+	else if (option == COUNT)
 	{
-		b3Assert( m_lower );
-		b3Assert( m_upper );
-		b3Assert( m_lower->capacity() <= (int)nDst );
-		b3Assert( m_upper->capacity() <= (int)nDst );
+		b3Assert(m_lower);
+		b3Assert(m_upper);
+		b3Assert(m_lower->capacity() <= (int)nDst);
+		b3Assert(m_upper->capacity() <= (int)nDst);
 
 		int zero = 0;
-		m_filler->execute( *m_lower, zero, nDst );
-		m_filler->execute( *m_upper, zero, nDst );
+		m_filler->execute(*m_lower, zero, nDst);
+		m_filler->execute(*m_upper, zero, nDst);
 
-		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
-		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
+		execute(src, nSrc, *m_lower, nDst, BOUND_LOWER);
+		execute(src, nSrc, *m_upper, nDst, BOUND_UPPER);
 
 		{
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_upper->getBufferCL(), true), b3BufferInfoCL(m_lower->getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
 
-			b3LauncherCL  launcher( m_queue, m_subtractKernel ,"m_subtractKernel");
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-            launcher.setConst( nSrc );
-            launcher.setConst( nDst );
+			b3LauncherCL launcher(m_queue, m_subtractKernel, "m_subtractKernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(nSrc);
+			launcher.setConst(nDst);
 
-			launcher.launch1D( nDst, 64 );
+			launcher.launch1D(nDst, 64);
 		}
 	}
 	else
 	{
-		b3Assert( 0 );
+		b3Assert(0);
 	}
-
 }
 
-
-void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, 
-	b3AlignedObjectArray<unsigned int>& dst,  int nDst, Option option )
+void b3BoundSearchCL::executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc,
+								  b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option)
 {
+	for (int i = 0; i < nSrc - 1; i++)
+		b3Assert(src[i].m_key <= src[i + 1].m_key);
 
-
-	for(int i=0; i<nSrc-1; i++) 
-		b3Assert( src[i].m_key <= src[i+1].m_key );
-
-	b3SortData minData,zeroData,maxData;
+	b3SortData minData, zeroData, maxData;
 	minData.m_key = -1;
 	minData.m_value = -1;
-	zeroData.m_key=0;
-	zeroData.m_value=0;
+	zeroData.m_key = 0;
+	zeroData.m_value = 0;
 	maxData.m_key = nDst;
 	maxData.m_value = nDst;
 
-	if( option == BOUND_LOWER )
+	if (option == BOUND_LOWER)
 	{
-		for(int i=0; i<nSrc; i++)
+		for (int i = 0; i < nSrc; i++)
 		{
-			b3SortData& iData = (i==0)? minData: src[i-1];
-			b3SortData& jData = (i==nSrc)? maxData: src[i];
+			b3SortData& iData = (i == 0) ? minData : src[i - 1];
+			b3SortData& jData = (i == nSrc) ? maxData : src[i];
 
-			if( iData.m_key != jData.m_key )
+			if (iData.m_key != jData.m_key)
 			{
 				int k = jData.m_key;
 				{
@@ -170,14 +160,14 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
 			}
 		}
 	}
-	else if( option == BOUND_UPPER )
+	else if (option == BOUND_UPPER)
 	{
-		for(int i=1; i<nSrc+1; i++)
+		for (int i = 1; i < nSrc + 1; i++)
 		{
-			b3SortData& iData = src[i-1];
-			b3SortData& jData = (i==nSrc)? maxData: src[i];
+			b3SortData& iData = src[i - 1];
+			b3SortData& jData = (i == nSrc) ? maxData : src[i];
 
-			if( iData.m_key != jData.m_key )
+			if (iData.m_key != jData.m_key)
 			{
 				int k = iData.m_key;
 				{
@@ -186,28 +176,28 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
 			}
 		}
 	}
-	else if( option == COUNT )
+	else if (option == COUNT)
 	{
 		b3AlignedObjectArray<unsigned int> lower;
-		lower.resize(nDst );
+		lower.resize(nDst);
 		b3AlignedObjectArray<unsigned int> upper;
-		upper.resize(nDst );
+		upper.resize(nDst);
 
-		for(int i=0; i<nDst; i++) 
-		{ 
-			lower[i] = upper[i] = 0; 
+		for (int i = 0; i < nDst; i++)
+		{
+			lower[i] = upper[i] = 0;
 		}
 
-		executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
-		executeHost( src, nSrc, upper, nDst, BOUND_UPPER );
+		executeHost(src, nSrc, lower, nDst, BOUND_LOWER);
+		executeHost(src, nSrc, upper, nDst, BOUND_UPPER);
 
-		for( int i=0; i<nDst; i++) 
-		{ 
-			dst[i] = upper[i] - lower[i]; 
+		for (int i = 0; i < nDst; i++)
+		{
+			dst[i] = upper[i] - lower[i];
 		}
 	}
 	else
 	{
-		b3Assert( 0 );
+		b3Assert(0);
 	}
 }
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h
index 7e2940965c..0d633e3d23 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h
@@ -26,42 +26,39 @@ subject to the following restrictions:
 
 #include "b3OpenCLArray.h"
 #include "b3FillCL.h"
-#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
+#include "b3RadixSort32CL.h"  //for b3SortData (perhaps move it?)
 class b3BoundSearchCL
 {
-	public:
+public:
+	enum Option
+	{
+		BOUND_LOWER,
+		BOUND_UPPER,
+		COUNT,
+	};
 
-		enum Option
-		{
-			BOUND_LOWER,
-			BOUND_UPPER,
-			COUNT,
-		};
+	cl_context m_context;
+	cl_device_id m_device;
+	cl_command_queue m_queue;
 
-		cl_context m_context;
-		cl_device_id m_device;
-		cl_command_queue m_queue;
+	cl_kernel m_lowerSortDataKernel;
+	cl_kernel m_upperSortDataKernel;
+	cl_kernel m_subtractKernel;
 
-		
-		cl_kernel m_lowerSortDataKernel;
-		cl_kernel m_upperSortDataKernel;
-		cl_kernel m_subtractKernel;
-		
-		b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
-		b3OpenCLArray<unsigned int>* m_lower;
-		b3OpenCLArray<unsigned int>* m_upper;
-		
-		b3FillCL* m_filler;
-		
-		b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
+	b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
+	b3OpenCLArray<unsigned int>* m_lower;
+	b3OpenCLArray<unsigned int>* m_upper;
 
-		virtual ~b3BoundSearchCL();
+	b3FillCL* m_filler;
 
-		//	src has to be src[i].m_key <= src[i+1].m_key
-		void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
+	b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
 
-		void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
-};
+	virtual ~b3BoundSearchCL();
+
+	//	src has to be src[i].m_key <= src[i+1].m_key
+	void execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
 
+	void executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
+};
 
-#endif //B3_BOUNDSEARCH_H
+#endif  //B3_BOUNDSEARCH_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h
index 52f219ae3f..35fc467b20 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h
@@ -4,16 +4,15 @@
 
 #include "b3OpenCLArray.h"
 
-
 struct b3BufferInfoCL
 {
 	//b3BufferInfoCL(){}
 
-//	template<typename T>
-	b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
+	//	template<typename T>
+	b3BufferInfoCL(cl_mem buff, bool isReadOnly = false) : m_clBuffer(buff), m_isReadOnly(isReadOnly) {}
 
 	cl_mem m_clBuffer;
 	bool m_isReadOnly;
 };
 
-#endif //B3_BUFFER_INFO_CL_H
+#endif  //B3_BUFFER_INFO_CL_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp
index f05c2648f1..bd25bb2101 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp
@@ -8,29 +8,26 @@
 #include "kernels/FillKernelsCL.h"
 
 b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
-:m_commandQueue(queue)
+	: m_commandQueue(queue)
 {
 	const char* kernelSource = fillKernelsCL;
 	cl_int pErrNum;
 	const char* additionalMacros = "";
 
-	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
+	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, FILL_CL_PROGRAM_PATH);
 	b3Assert(fillProg);
 
-	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
+	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillIntKernel);
 
-	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
+	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillIntKernel);
 
-	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
+	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillFloatKernel);
 
-	
-
-	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
+	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillKernelInt2);
-	
 }
 
 b3FillCL::~b3FillCL()
@@ -39,88 +36,84 @@ b3FillCL::~b3FillCL()
 	clReleaseKernel(m_fillIntKernel);
 	clReleaseKernel(m_fillUnsignedIntKernel);
 	clReleaseKernel(m_fillFloatKernel);
-
 }
 
 void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
 {
-	b3Assert( n>0 );
+	b3Assert(n > 0);
 
 	{
-		b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" );
-		launcher.setBuffer( src.getBufferCL());
-		launcher.setConst( n );
-		launcher.setConst( value );
-		launcher.setConst( offset);
+		b3LauncherCL launcher(m_commandQueue, m_fillFloatKernel, "m_fillFloatKernel");
+		launcher.setBuffer(src.getBufferCL());
+		launcher.setConst(n);
+		launcher.setConst(value);
+		launcher.setConst(offset);
 
-		launcher.launch1D( n );
+		launcher.launch1D(n);
 	}
 }
 
 void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
 {
-	b3Assert( n>0 );
-	
+	b3Assert(n > 0);
 
 	{
-		b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel");
+		b3LauncherCL launcher(m_commandQueue, m_fillIntKernel, "m_fillIntKernel");
 		launcher.setBuffer(src.getBufferCL());
-		launcher.setConst( n);
-		launcher.setConst( value);
-		launcher.setConst( offset);
-		launcher.launch1D( n );
+		launcher.setConst(n);
+		launcher.setConst(value);
+		launcher.setConst(offset);
+		launcher.launch1D(n);
 	}
 }
 
-
 void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
 {
-	b3Assert( n>0 );
+	b3Assert(n > 0);
 
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};
 
-		b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( n );
-        launcher.setConst(value);
+		b3LauncherCL launcher(m_commandQueue, m_fillUnsignedIntKernel, "m_fillUnsignedIntKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(n);
+		launcher.setConst(value);
 		launcher.setConst(offset);
 
-		launcher.launch1D( n );
+		launcher.launch1D(n);
 	}
 }
 
-void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
+void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset)
 {
-	for (int i=0;i<n;i++)
+	for (int i = 0; i < n; i++)
 	{
-		src[i+offset]=value;
+		src[i + offset] = value;
 	}
 }
 
-void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset)
+void b3FillCL::executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset)
 {
-	for (int i=0;i<n;i++)
+	for (int i = 0; i < n; i++)
 	{
-		src[i+offset]=value;
+		src[i + offset] = value;
 	}
 }
 
-void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
+void b3FillCL::execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset)
 {
-	b3Assert( n>0 );
-	
+	b3Assert(n > 0);
 
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};
 
-		b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2, "m_fillKernelInt2");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 		launcher.setConst(n);
 		launcher.setConst(value);
 		launcher.setConst(offset);
 
 		//( constBuffer );
-		launcher.launch1D( n );
+		launcher.launch1D(n);
 	}
 }
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h
index 1609676b9d..c92c3e5119 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h
@@ -7,57 +7,46 @@
 #include "Bullet3Common/shared/b3Int2.h"
 #include "Bullet3Common/shared/b3Int4.h"
 
-
 class b3FillCL
 {
-	
-	cl_command_queue	m_commandQueue;
-	
-	cl_kernel			m_fillKernelInt2;
-	cl_kernel			m_fillIntKernel;
-	cl_kernel			m_fillUnsignedIntKernel;
-	cl_kernel			m_fillFloatKernel;
-
-	public:
-		
-		struct b3ConstData
-		{
-			union
-			{
-				b3Int4 m_data;
-				b3UnsignedInt4 m_UnsignedData;
-			};
-			int m_offset;
-			int m_n;
-			int m_padding[2];
+	cl_command_queue m_commandQueue;
+
+	cl_kernel m_fillKernelInt2;
+	cl_kernel m_fillIntKernel;
+	cl_kernel m_fillUnsignedIntKernel;
+	cl_kernel m_fillFloatKernel;
+
+public:
+	struct b3ConstData
+	{
+		union {
+			b3Int4 m_data;
+			b3UnsignedInt4 m_UnsignedData;
 		};
+		int m_offset;
+		int m_n;
+		int m_padding[2];
+	};
 
 protected:
-
 public:
+	b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
 
-		b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
+	virtual ~b3FillCL();
 
-		virtual ~b3FillCL();
+	void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
 
-		void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
-	
-		void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
+	void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
 
-		void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
+	void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
 
-		void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
+	void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
 
-		void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset);
+	void executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset);
 
-		void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
+	void executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset);
 
 	//	void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
-
 };
-		
-		
-		
-	
 
-#endif //B3_FILL_CL_H
+#endif  //B3_FILL_CL_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp
index 94590d11ca..c97d02eb45 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp
@@ -1,13 +1,13 @@
 #include "b3LauncherCL.h"
 
 bool gDebugLauncherCL = false;
-    
+
 b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name)
-:m_commandQueue(queue),
-m_kernel(kernel),
-m_idx(0),
-m_enableSerialization(false),
-m_name(name)
+	: m_commandQueue(queue),
+	  m_kernel(kernel),
+	  m_idx(0),
+	  m_enableSerialization(false),
+	  m_name(name)
 {
 	if (gDebugLauncherCL)
 	{
@@ -15,59 +15,58 @@ m_name(name)
 		printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name);
 	}
 
-      m_serializationSizeInBytes = sizeof(int);
+	m_serializationSizeInBytes = sizeof(int);
 }
-    
+
 b3LauncherCL::~b3LauncherCL()
-  {
-      for (int i=0;i<m_arrays.size();i++)
-      {
-		  delete (m_arrays[i]);
-      }
-
-	  m_arrays.clear();
-	  if (gDebugLauncherCL)
-	  {
+{
+	for (int i = 0; i < m_arrays.size(); i++)
+	{
+		delete (m_arrays[i]);
+	}
+
+	m_arrays.clear();
+	if (gDebugLauncherCL)
+	{
 		static int counter = 0;
-		printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name);
-	  }
-  }
+		printf("[%d] Finished launching OpenCL kernel %s\n", counter++, m_name);
+	}
+}
 
-void b3LauncherCL::setBuffer( cl_mem clBuffer)
+void b3LauncherCL::setBuffer(cl_mem clBuffer)
 {
-		if (m_enableSerialization)
-		{
-			b3KernelArgData kernelArg;
-			kernelArg.m_argIndex = m_idx;
-			kernelArg.m_isBuffer = 1;
-			kernelArg.m_clBuffer = clBuffer;
-		
-			cl_mem_info param_name = CL_MEM_SIZE;
-			size_t param_value;
-			size_t sizeInBytes = sizeof(size_t);
-			size_t actualSizeInBytes;
-			cl_int err;
-			err = clGetMemObjectInfo (	kernelArg.m_clBuffer,
-									  param_name,
-									  sizeInBytes,
-									  &param_value,
-									  &actualSizeInBytes);
-			
-			b3Assert( err == CL_SUCCESS );
-			kernelArg.m_argSizeInBytes = param_value;
-			
-			m_kernelArguments.push_back(kernelArg);
-			m_serializationSizeInBytes+= sizeof(b3KernelArgData);
-			m_serializationSizeInBytes+=param_value;
-            }
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
-		b3Assert( status == CL_SUCCESS );
-}
+	if (m_enableSerialization)
+	{
+		b3KernelArgData kernelArg;
+		kernelArg.m_argIndex = m_idx;
+		kernelArg.m_isBuffer = 1;
+		kernelArg.m_clBuffer = clBuffer;
 
+		cl_mem_info param_name = CL_MEM_SIZE;
+		size_t param_value;
+		size_t sizeInBytes = sizeof(size_t);
+		size_t actualSizeInBytes;
+		cl_int err;
+		err = clGetMemObjectInfo(kernelArg.m_clBuffer,
+								 param_name,
+								 sizeInBytes,
+								 &param_value,
+								 &actualSizeInBytes);
+
+		b3Assert(err == CL_SUCCESS);
+		kernelArg.m_argSizeInBytes = param_value;
+
+		m_kernelArguments.push_back(kernelArg);
+		m_serializationSizeInBytes += sizeof(b3KernelArgData);
+		m_serializationSizeInBytes += param_value;
+	}
+	cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
+	b3Assert(status == CL_SUCCESS);
+}
 
-void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
+void b3LauncherCL::setBuffers(b3BufferInfoCL* buffInfo, int n)
 {
-	for(int i=0; i<n; i++)
+	for (int i = 0; i < n; i++)
 	{
 		if (m_enableSerialization)
 		{
@@ -75,106 +74,103 @@ void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
 			kernelArg.m_argIndex = m_idx;
 			kernelArg.m_isBuffer = 1;
 			kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;
-		
+
 			cl_mem_info param_name = CL_MEM_SIZE;
 			size_t param_value;
 			size_t sizeInBytes = sizeof(size_t);
 			size_t actualSizeInBytes;
 			cl_int err;
-			err = clGetMemObjectInfo (	kernelArg.m_clBuffer,
-									  param_name,
-									  sizeInBytes,
-									  &param_value,
-									  &actualSizeInBytes);
-			
-			b3Assert( err == CL_SUCCESS );
+			err = clGetMemObjectInfo(kernelArg.m_clBuffer,
+									 param_name,
+									 sizeInBytes,
+									 &param_value,
+									 &actualSizeInBytes);
+
+			b3Assert(err == CL_SUCCESS);
 			kernelArg.m_argSizeInBytes = param_value;
-			
+
 			m_kernelArguments.push_back(kernelArg);
-			m_serializationSizeInBytes+= sizeof(b3KernelArgData);
-			m_serializationSizeInBytes+=param_value;
-            }
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
-		b3Assert( status == CL_SUCCESS );
-        }
+			m_serializationSizeInBytes += sizeof(b3KernelArgData);
+			m_serializationSizeInBytes += param_value;
+		}
+		cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
+		b3Assert(status == CL_SUCCESS);
+	}
 }
 
 struct b3KernelArgDataUnaligned
 {
-    int m_isBuffer;
-    int m_argIndex;
-    int m_argSizeInBytes;
+	int m_isBuffer;
+	int m_argIndex;
+	int m_argSizeInBytes;
 	int m_unusedPadding;
-    union
-    {
-        cl_mem m_clBuffer;
-        unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
-    };
-    
+	union {
+		cl_mem m_clBuffer;
+		unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
+	};
 };
 #include <string.h>
 
-
-
 int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
 {
-    int index=0;
-    
-    int numArguments = *(int*) &buf[index];
-    index+=sizeof(int);
-    
-    for (int i=0;i<numArguments;i++)
-    {
-        b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
-
-        index+=sizeof(b3KernelArgData);
-        if (arg->m_isBuffer)
-        {
-            b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
-            clData->resize(arg->m_argSizeInBytes);
-            
-            clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
-            
-            arg->m_clBuffer = clData->getBufferCL();
-            
-            m_arrays.push_back(clData);
-            
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
-		b3Assert( status == CL_SUCCESS );
-            index+=arg->m_argSizeInBytes;
-        } else 
-        {
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
-		b3Assert( status == CL_SUCCESS );
-        }
+	int index = 0;
+
+	int numArguments = *(int*)&buf[index];
+	index += sizeof(int);
+
+	for (int i = 0; i < numArguments; i++)
+	{
+		b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
+
+		index += sizeof(b3KernelArgData);
+		if (arg->m_isBuffer)
+		{
+			b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx, m_commandQueue, arg->m_argSizeInBytes);
+			clData->resize(arg->m_argSizeInBytes);
+
+			clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
+
+			arg->m_clBuffer = clData->getBufferCL();
+
+			m_arrays.push_back(clData);
+
+			cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
+			b3Assert(status == CL_SUCCESS);
+			index += arg->m_argSizeInBytes;
+		}
+		else
+		{
+			cl_int status = clSetKernelArg(m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
+			b3Assert(status == CL_SUCCESS);
+		}
 		b3KernelArgData b;
-		memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned));
-	m_kernelArguments.push_back(b);
-    }
-m_serializationSizeInBytes = index;
-    return index;
+		memcpy(&b, arg, sizeof(b3KernelArgDataUnaligned));
+		m_kernelArguments.push_back(b);
+	}
+	m_serializationSizeInBytes = index;
+	return index;
 }
 
 int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
-  {
-	 int index=0;
-      
-      int numArguments = *(int*) &goldBuffer[index];
-      index+=sizeof(int);
+{
+	int index = 0;
+
+	int numArguments = *(int*)&goldBuffer[index];
+	index += sizeof(int);
 
 	if (numArguments != m_kernelArguments.size())
 	{
-		printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
+		printf("failed validation: expected %d arguments, found %d\n", numArguments, m_kernelArguments.size());
 		return -1;
 	}
-      
-      for (int ii=0;ii<numArguments;ii++)
-      {
-          b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
+
+	for (int ii = 0; ii < numArguments; ii++)
+	{
+		b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
 
 		if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
 		{
-			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
+			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n", ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
 			return -2;
 		}
 
@@ -184,125 +180,117 @@ int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapac
 
 			if (expected != found)
 			{
-				printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
+				printf("failed validation: argument %d isBuffer expected: %d, found %d\n", ii, expected, found);
 				return -3;
 			}
 		}
-		index+=sizeof(b3KernelArgData);
+		index += sizeof(b3KernelArgData);
 
 		if (argGold->m_isBuffer)
-          {
-
-			unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
+		{
+			unsigned char* memBuf = (unsigned char*)malloc(m_kernelArguments[ii].m_argSizeInBytes);
 			unsigned char* goldBuf = &goldBuffer[index];
-			for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++)
+			for (int j = 0; j < m_kernelArguments[j].m_argSizeInBytes; j++)
 			{
 				memBuf[j] = 0xaa;
 			}
 
 			cl_int status = 0;
-			status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
-                                           memBuf, 0,0,0 );
-              b3Assert( status==CL_SUCCESS );
-              clFinish(m_commandQueue);
+			status = clEnqueueReadBuffer(m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
+										 memBuf, 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);
+			clFinish(m_commandQueue);
 
-			for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
+			for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
 			{
 				int expected = goldBuf[b];
 				int found = memBuf[b];
 				if (expected != found)
 				{
 					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
-						ii, b, expected, found);
+						   ii, b, expected, found);
 					return -4;
 				}
 			}
 
-              
-              index+=argGold->m_argSizeInBytes;
-          } else 
-          {
-			
+			index += argGold->m_argSizeInBytes;
+		}
+		else
+		{
 			//compare content
-			for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
+			for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
 			{
 				int expected = argGold->m_argData[b];
-				int found =m_kernelArguments[ii].m_argData[b];
+				int found = m_kernelArguments[ii].m_argData[b];
 				if (expected != found)
 				{
 					printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
-						ii, b, expected, found);
+						   ii, b, expected, found);
 					return -5;
 				}
 			}
-
-          }
-      }
-      return index;
-
+		}
+	}
+	return index;
 }
 
 int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
 {
-//initialize to known values
-for (int i=0;i<destBufferCapacity;i++)
-	destBuffer[i] = 0xec;
-
-    assert(destBufferCapacity>=m_serializationSizeInBytes);
-    
-    //todo: use the b3Serializer for this to allow for 32/64bit, endianness etc        
-    int numArguments = m_kernelArguments.size();
-    int curBufferSize = 0;
-    int* dest = (int*)&destBuffer[curBufferSize];
-    *dest = numArguments;
-    curBufferSize += sizeof(int);
-    
-    
-    
-    for (int i=0;i<this->m_kernelArguments.size();i++)
-    {
-        b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize];
-        *arg = m_kernelArguments[i];
-        curBufferSize+=sizeof(b3KernelArgData);
-        if (arg->m_isBuffer==1)
-        {
-            //copy the OpenCL buffer content
-            cl_int status = 0;
-            status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
-                                         &destBuffer[curBufferSize], 0,0,0 );
-            b3Assert( status==CL_SUCCESS );
-            clFinish(m_commandQueue);
-            curBufferSize+=arg->m_argSizeInBytes;
-        }
-        
-    }
-    return curBufferSize;
+	//initialize to known values
+	for (int i = 0; i < destBufferCapacity; i++)
+		destBuffer[i] = 0xec;
+
+	assert(destBufferCapacity >= m_serializationSizeInBytes);
+
+	//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
+	int numArguments = m_kernelArguments.size();
+	int curBufferSize = 0;
+	int* dest = (int*)&destBuffer[curBufferSize];
+	*dest = numArguments;
+	curBufferSize += sizeof(int);
+
+	for (int i = 0; i < this->m_kernelArguments.size(); i++)
+	{
+		b3KernelArgData* arg = (b3KernelArgData*)&destBuffer[curBufferSize];
+		*arg = m_kernelArguments[i];
+		curBufferSize += sizeof(b3KernelArgData);
+		if (arg->m_isBuffer == 1)
+		{
+			//copy the OpenCL buffer content
+			cl_int status = 0;
+			status = clEnqueueReadBuffer(m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
+										 &destBuffer[curBufferSize], 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);
+			clFinish(m_commandQueue);
+			curBufferSize += arg->m_argSizeInBytes;
+		}
+	}
+	return curBufferSize;
 }
 
 void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems)
 {
 	int num = numWorkItems;
 	int buffSize = getSerializationBufferSize();
-	unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
-	for (int i=0;i<buffSize+1;i++)
+	unsigned char* buf = new unsigned char[buffSize + sizeof(int)];
+	for (int i = 0; i < buffSize + 1; i++)
 	{
 		unsigned char* ptr = (unsigned char*)&buf[i];
 		*ptr = 0xff;
 	}
-//	int actualWrite = serializeArguments(buf,buffSize);
-              
-//	unsigned char* cptr = (unsigned char*)&buf[buffSize];
-//            printf("buf[buffSize] = %d\n",*cptr);
-              
-	assert(buf[buffSize]==0xff);//check for buffer overrun
+	//	int actualWrite = serializeArguments(buf,buffSize);
+
+	//	unsigned char* cptr = (unsigned char*)&buf[buffSize];
+	//            printf("buf[buffSize] = %d\n",*cptr);
+
+	assert(buf[buffSize] == 0xff);  //check for buffer overrun
 	int* ptr = (int*)&buf[buffSize];
-              
+
 	*ptr = num;
-              
-	FILE* f = fopen(fileName,"wb");
-	fwrite(buf,buffSize+sizeof(int),1,f);
+
+	FILE* f = fopen(fileName, "wb");
+	fwrite(buf, buffSize + sizeof(int), 1, f);
 	fclose(f);
 
 	delete[] buf;
-}		
-
+}
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h
index 1b267b31ef..18e9c1db2b 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h
@@ -9,60 +9,57 @@
 
 #define B3_DEBUG_SERIALIZE_CL
 
-
 #ifdef _WIN32
-#pragma warning(disable :4996)
+#pragma warning(disable : 4996)
 #endif
 #define B3_CL_MAX_ARG_SIZE 16
-B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3KernelArgData
 {
-    int m_isBuffer;
-    int m_argIndex;
-    int m_argSizeInBytes;
+	int m_isBuffer;
+	int m_argIndex;
+	int m_argSizeInBytes;
 	int m_unusedPadding;
-    union
-    {
-        cl_mem m_clBuffer;
-        unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
-    };
-    
+	union {
+		cl_mem m_clBuffer;
+		unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
+	};
 };
 
 class b3LauncherCL
 {
-
 	cl_command_queue m_commandQueue;
 	cl_kernel m_kernel;
 	int m_idx;
-	
-    b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
-    int m_serializationSizeInBytes;
-	bool	m_enableSerialization;
+
+	b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
+	int m_serializationSizeInBytes;
+	bool m_enableSerialization;
 
 	const char* m_name;
-	public:
-
-     b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays;
-    
-		b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
-    
-    virtual ~b3LauncherCL();
-    
-		void setBuffer( cl_mem clBuffer);
-
-		void setBuffers( b3BufferInfoCL* buffInfo, int n );
-    
-    int getSerializationBufferSize() const 
-    {
-        return m_serializationSizeInBytes;
-    }
-    
-    int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
+
+public:
+	b3AlignedObjectArray<b3OpenCLArray<unsigned char>*> m_arrays;
+
+	b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
+
+	virtual ~b3LauncherCL();
+
+	void setBuffer(cl_mem clBuffer);
+
+	void setBuffers(b3BufferInfoCL* buffInfo, int n);
+
+	int getSerializationBufferSize() const
+	{
+		return m_serializationSizeInBytes;
+	}
+
+	int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
 
 	inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx);
 
-    int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
-    
+	int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
+
 	int getNumArguments() const
 	{
 		return m_kernelArguments.size();
@@ -75,61 +72,57 @@ class b3LauncherCL
 
 	void serializeToFile(const char* fileName, int numWorkItems);
 
-	template<typename T>
-		inline void setConst( const T& consts )
-		{
-			int sz=sizeof(T);
-			b3Assert(sz<=B3_CL_MAX_ARG_SIZE);
-
-			if (m_enableSerialization)
-			{
-				b3KernelArgData kernelArg;
-				kernelArg.m_argIndex = m_idx;
-				kernelArg.m_isBuffer = 0;
-				T* destArg = (T*)kernelArg.m_argData;
-				*destArg = consts;
-				kernelArg.m_argSizeInBytes = sizeof(T);
-				m_kernelArguments.push_back(kernelArg);
-				m_serializationSizeInBytes+=sizeof(b3KernelArgData);
-			}
-            
-			cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
-			b3Assert( status == CL_SUCCESS );
-		}
+	template <typename T>
+	inline void setConst(const T& consts)
+	{
+		int sz = sizeof(T);
+		b3Assert(sz <= B3_CL_MAX_ARG_SIZE);
 
-		inline void launch1D( int numThreads, int localSize = 64)
+		if (m_enableSerialization)
 		{
-			launch2D( numThreads, 1, localSize, 1 );
+			b3KernelArgData kernelArg;
+			kernelArg.m_argIndex = m_idx;
+			kernelArg.m_isBuffer = 0;
+			T* destArg = (T*)kernelArg.m_argData;
+			*destArg = consts;
+			kernelArg.m_argSizeInBytes = sizeof(T);
+			m_kernelArguments.push_back(kernelArg);
+			m_serializationSizeInBytes += sizeof(b3KernelArgData);
 		}
 
-		inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
-		{
-			size_t gRange[3] = {1,1,1};
-			size_t lRange[3] = {1,1,1};
-			lRange[0] = localSizeX;
-			lRange[1] = localSizeY;
-			gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
-			gRange[0] *= lRange[0];
-			gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
-			gRange[1] *= lRange[1];
-
-			cl_int status = clEnqueueNDRangeKernel( m_commandQueue, 
-				m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
-            if (status != CL_SUCCESS)
-            {
-                printf("Error: OpenCL status = %d\n",status);
-            }
-			b3Assert( status == CL_SUCCESS );
+		cl_int status = clSetKernelArg(m_kernel, m_idx++, sz, &consts);
+		b3Assert(status == CL_SUCCESS);
+	}
 
-		}
-	
-		void	enableSerialization(bool serialize)
+	inline void launch1D(int numThreads, int localSize = 64)
+	{
+		launch2D(numThreads, 1, localSize, 1);
+	}
+
+	inline void launch2D(int numThreadsX, int numThreadsY, int localSizeX, int localSizeY)
+	{
+		size_t gRange[3] = {1, 1, 1};
+		size_t lRange[3] = {1, 1, 1};
+		lRange[0] = localSizeX;
+		lRange[1] = localSizeY;
+		gRange[0] = b3Max((size_t)1, (numThreadsX / lRange[0]) + (!(numThreadsX % lRange[0]) ? 0 : 1));
+		gRange[0] *= lRange[0];
+		gRange[1] = b3Max((size_t)1, (numThreadsY / lRange[1]) + (!(numThreadsY % lRange[1]) ? 0 : 1));
+		gRange[1] *= lRange[1];
+
+		cl_int status = clEnqueueNDRangeKernel(m_commandQueue,
+											   m_kernel, 2, NULL, gRange, lRange, 0, 0, 0);
+		if (status != CL_SUCCESS)
 		{
-			m_enableSerialization = serialize;
+			printf("Error: OpenCL status = %d\n", status);
 		}
-		
-};
-
+		b3Assert(status == CL_SUCCESS);
+	}
 
+	void enableSerialization(bool serialize)
+	{
+		m_enableSerialization = serialize;
+	}
+};
 
-#endif //B3_LAUNCHER_CL_H
+#endif  //B3_LAUNCHER_CL_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h
index d70c30f53f..e837cceb66 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h
@@ -7,16 +7,16 @@
 template <typename T>
 class b3OpenCLArray
 {
-	size_t	m_size;
-	size_t	m_capacity;
-	cl_mem	m_clBuffer;
+	size_t m_size;
+	size_t m_capacity;
+	cl_mem m_clBuffer;
 
-	cl_context		 m_clContext;
+	cl_context m_clContext;
 	cl_command_queue m_commandQueue;
 
-	bool	m_ownsMemory;
+	bool m_ownsMemory;
 
-	bool	m_allowGrowingCapacity;
+	bool m_allowGrowingCapacity;
 
 	void deallocate()
 	{
@@ -25,22 +25,19 @@ class b3OpenCLArray
 			clReleaseMemObject(m_clBuffer);
 		}
 		m_clBuffer = 0;
-		m_capacity=0;
+		m_capacity = 0;
 	}
 
 	b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
 
-	B3_FORCE_INLINE	size_t	allocSize(size_t size)
-		{
-			return (size ? size*2 : 1);
-		}
+	B3_FORCE_INLINE size_t allocSize(size_t size)
+	{
+		return (size ? size * 2 : 1);
+	}
 
 public:
-
-	b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true)
-	:m_size(0),  m_capacity(0),m_clBuffer(0),
-	m_clContext(ctx),m_commandQueue(queue),
-	m_ownsMemory(true),m_allowGrowingCapacity(true)
+	b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity = 0, bool allowGrowingCapacity = true)
+		: m_size(0), m_capacity(0), m_clBuffer(0), m_clContext(ctx), m_commandQueue(queue), m_ownsMemory(true), m_allowGrowingCapacity(true)
 	{
 		if (initialCapacity)
 		{
@@ -60,34 +57,32 @@ public:
 		m_capacity = sizeInElements;
 	}
 
-// we could enable this assignment, but need to make sure to avoid accidental deep copies
-//	b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
-//	{
-//		copyFromArray(src);
-//		return *this;
-//	}
+	// we could enable this assignment, but need to make sure to avoid accidental deep copies
+	//	b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
+	//	{
+	//		copyFromArray(src);
+	//		return *this;
+	//	}
 
-
-	cl_mem	getBufferCL() const
+	cl_mem getBufferCL() const
 	{
 		return m_clBuffer;
 	}
 
-
 	virtual ~b3OpenCLArray()
 	{
 		deallocate();
-		m_size=0;
-		m_capacity=0;
+		m_size = 0;
+		m_capacity = 0;
 	}
 
-	B3_FORCE_INLINE	bool push_back(const T& _Val,bool waitForCompletion=true)
+	B3_FORCE_INLINE bool push_back(const T& _Val, bool waitForCompletion = true)
 	{
 		bool result = true;
 		size_t sz = size();
-		if( sz == capacity() )
+		if (sz == capacity())
 		{
-			result = reserve( allocSize(size()) );
+			result = reserve(allocSize(size()));
 		}
 		copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
 		m_size++;
@@ -96,23 +91,23 @@ public:
 
 	B3_FORCE_INLINE T forcedAt(size_t n) const
 	{
-		b3Assert(n>=0);
-		b3Assert(n<capacity());
+		b3Assert(n >= 0);
+		b3Assert(n < capacity());
 		T elem;
-		copyToHostPointer(&elem,1,n,true);
+		copyToHostPointer(&elem, 1, n, true);
 		return elem;
 	}
 
 	B3_FORCE_INLINE T at(size_t n) const
 	{
-		b3Assert(n>=0);
-		b3Assert(n<size());
+		b3Assert(n >= 0);
+		b3Assert(n < size());
 		T elem;
-		copyToHostPointer(&elem,1,n,true);
+		copyToHostPointer(&elem, 1, n, true);
 		return elem;
 	}
 
-	B3_FORCE_INLINE	bool resize(size_t newsize, bool copyOldContents=true)
+	B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents = true)
 	{
 		bool result = true;
 		size_t curSize = size();
@@ -120,11 +115,12 @@ public:
 		if (newsize < curSize)
 		{
 			//leave the OpenCL memory for now
-		} else
+		}
+		else
 		{
 			if (newsize > size())
 			{
-				result = reserve(newsize,copyOldContents);
+				result = reserve(newsize, copyOldContents);
 			}
 
 			//leave new data uninitialized (init in debug mode?)
@@ -134,7 +130,8 @@ public:
 		if (result)
 		{
 			m_size = newsize;
-		} else
+		}
+		else
 		{
 			m_size = 0;
 		}
@@ -146,25 +143,25 @@ public:
 		return m_size;
 	}
 
-	B3_FORCE_INLINE	size_t capacity() const
+	B3_FORCE_INLINE size_t capacity() const
 	{
 		return m_capacity;
 	}
 
-	B3_FORCE_INLINE	bool reserve(size_t _Count, bool copyOldContents=true)
+	B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents = true)
 	{
-		bool result=true;
+		bool result = true;
 		// determine new minimum length of allocated storage
 		if (capacity() < _Count)
-		{	// not enough room, reallocate
+		{  // not enough room, reallocate
 
 			if (m_allowGrowingCapacity)
 			{
 				cl_int ciErrNum;
 				//create a new OpenCL buffer
-				size_t memSizeInBytes = sizeof(T)*_Count;
+				size_t memSizeInBytes = sizeof(T) * _Count;
 				cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
-				if (ciErrNum!=CL_SUCCESS)
+				if (ciErrNum != CL_SUCCESS)
 				{
 					b3Error("OpenCL out-of-memory\n");
 					_Count = 0;
@@ -173,13 +170,13 @@ public:
 //#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
 #ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
 				unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
-				for (size_t i=0;i<memSizeInBytes;i++)
+				for (size_t i = 0; i < memSizeInBytes; i++)
 					src[i] = 0xbb;
-				ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
-				b3Assert(ciErrNum==CL_SUCCESS);
+				ciErrNum = clEnqueueWriteBuffer(m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0, 0, 0);
+				b3Assert(ciErrNum == CL_SUCCESS);
 				clFinish(m_commandQueue);
 				free(src);
-#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
+#endif  //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
 
 				if (result)
 				{
@@ -193,21 +190,21 @@ public:
 				m_clBuffer = buf;
 
 				m_capacity = _Count;
-			} else
+			}
+			else
 			{
 				//fail: assert and
 				b3Assert(0);
 				deallocate();
-				result=false;
+				result = false;
 			}
 		}
 		return result;
 	}
 
-
-	void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const
+	void copyToCL(cl_mem destination, size_t numElements, size_t firstElem = 0, size_t dstOffsetInElems = 0) const
 	{
-		if (numElements<=0)
+		if (numElements <= 0)
 			return;
 
 		b3Assert(m_clBuffer);
@@ -216,75 +213,74 @@ public:
 		//likely some error, destination is same as source
 		b3Assert(m_clBuffer != destination);
 
-		b3Assert((firstElem+numElements)<=m_size);
+		b3Assert((firstElem + numElements) <= m_size);
 
 		cl_int status = 0;
 
+		b3Assert(numElements > 0);
+		b3Assert(numElements <= m_size);
 
-		b3Assert(numElements>0);
-		b3Assert(numElements<=m_size);
-
-		size_t srcOffsetBytes = sizeof(T)*firstElem;
-		size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
+		size_t srcOffsetBytes = sizeof(T) * firstElem;
+		size_t dstOffsetInBytes = sizeof(T) * dstOffsetInElems;
 
-		status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
-			srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
+		status = clEnqueueCopyBuffer(m_commandQueue, m_clBuffer, destination,
+									 srcOffsetBytes, dstOffsetInBytes, sizeof(T) * numElements, 0, 0, 0);
 
-		b3Assert( status == CL_SUCCESS );
+		b3Assert(status == CL_SUCCESS);
 	}
 
-	void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
+	void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion = true)
 	{
 		size_t newSize = srcArray.size();
 
 		bool copyOldContents = false;
-		resize (newSize,copyOldContents);
+		resize(newSize, copyOldContents);
 		if (newSize)
-			copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
-
+			copyFromHostPointer(&srcArray[0], newSize, 0, waitForCompletion);
 	}
 
-	void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true)
+	void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem = 0, bool waitForCompletion = true)
 	{
-		b3Assert(numElems+destFirstElem <= capacity());
+		b3Assert(numElems + destFirstElem <= capacity());
 
-		if (numElems+destFirstElem)
+		if (numElems + destFirstElem)
 		{
 			cl_int status = 0;
-			size_t sizeInBytes=sizeof(T)*numElems;
-			status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
-			src, 0,0,0 );
-			b3Assert(status == CL_SUCCESS );
+			size_t sizeInBytes = sizeof(T) * numElems;
+			status = clEnqueueWriteBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * destFirstElem, sizeInBytes,
+										  src, 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);
 			if (waitForCompletion)
 				clFinish(m_commandQueue);
-		} else
+		}
+		else
 		{
 			b3Error("copyFromHostPointer invalid range\n");
 		}
 	}
 
-
-	void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
+	void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion = true) const
 	{
 		destArray.resize(this->size());
 		if (size())
-			copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
+			copyToHostPointer(&destArray[0], size(), 0, waitForCompletion);
 	}
 
-	void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const
+	void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem = 0, bool waitForCompletion = true) const
 	{
-		b3Assert(numElem+srcFirstElem <= capacity());
+		b3Assert(numElem + srcFirstElem <= capacity());
 
-		if(numElem+srcFirstElem <= capacity())
+		if (numElem + srcFirstElem <= capacity())
 		{
 			cl_int status = 0;
-			status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
-			destPtr, 0,0,0 );
-			b3Assert( status==CL_SUCCESS );
+			status = clEnqueueReadBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * srcFirstElem, sizeof(T) * numElem,
+										 destPtr, 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);
 
 			if (waitForCompletion)
 				clFinish(m_commandQueue);
-		} else
+		}
+		else
 		{
 			b3Error("copyToHostPointer invalid range\n");
 		}
@@ -296,11 +292,9 @@ public:
 		resize(newSize);
 		if (size())
 		{
-			src.copyToCL(m_clBuffer,size());
+			src.copyToCL(m_clBuffer, size());
 		}
 	}
-
 };
 
-
-#endif //B3_OPENCL_ARRAY_H
+#endif  //B3_OPENCL_ARRAY_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp
index 42cd197740..822b511633 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp
@@ -7,25 +7,24 @@
 #include "kernels/PrefixScanKernelsCL.h"
 
 b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
-:m_commandQueue(queue)
+	: m_commandQueue(queue)
 {
 	const char* scanKernelSource = prefixScanKernelsCL;
 	cl_int pErrNum;
-	char* additionalMacros=0;
+	char* additionalMacros = 0;
 
-	m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size);
-	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH);
+	m_workBuffer = new b3OpenCLArray<unsigned int>(ctx, queue, size);
+	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_PROG_PATH);
 	b3Assert(scanProg);
 
-	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_localScanKernel );
-	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_blockSumKernel );
-	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_propagationKernel );
+	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_localScanKernel);
+	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_blockSumKernel);
+	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_propagationKernel);
 }
 
-
 b3PrefixScanCL::~b3PrefixScanCL()
 {
 	delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanCL::~b3PrefixScanCL()
 	clReleaseKernel(m_propagationKernel);
 }
 
-template<class T>
+template <class T>
 T b3NextPowerOf2(T n)
 {
 	n -= 1;
-	for(int i=0; i<sizeof(T)*8; i++)
-		n = n | (n>>i);
-	return n+1;
+	for (int i = 0; i < sizeof(T) * 8; i++)
+		n = n | (n >> i);
+	return n + 1;
 }
 
 void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
 {
-	
-//	b3Assert( data->m_option == EXCLUSIVE );
-	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
+	//	b3Assert( data->m_option == EXCLUSIVE );
+	const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
 
 	dst.resize(src.size());
 	m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<uns
 	b3Int4 constBuffer;
 	constBuffer.x = n;
 	constBuffer.y = numBlocks;
-	constBuffer.z = (int)b3NextPowerOf2( numBlocks );
+	constBuffer.z = (int)b3NextPowerOf2(numBlocks);
 
 	b3OpenCLArray<unsigned int>* srcNative = &src;
 	b3OpenCLArray<unsigned int>* dstNative = &dst;
-	
+
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
 
-		b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst(  constBuffer );
-		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
+		b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
 	}
 
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
 
-		b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
+		b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
 	}
-	
 
-	if( numBlocks > 1 )
+	if (numBlocks > 1)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
-		b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
+		b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
 	}
 
-
-	if( sum )
+	if (sum)
 	{
 		clFinish(m_commandQueue);
-		dstNative->copyToHostPointer(sum,1,n-1,true);
+		dstNative->copyToHostPointer(sum, 1, n - 1, true);
 	}
-
 }
 
-
 void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
 {
 	unsigned int s = 0;
 	//if( data->m_option == EXCLUSIVE )
 	{
-		for(int i=0; i<n; i++)
+		for (int i = 0; i < n; i++)
 		{
 			dst[i] = s;
 			s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3Alig
 	}
 	*/
 
-	if( sum )
+	if (sum)
 	{
-		*sum = dst[n-1];
+		*sum = dst[n - 1];
 	}
 }
 \ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h
index a9a2e61b9e..346efa0c73 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h
@@ -13,9 +13,9 @@ class b3PrefixScanCL
 		BLOCK_SIZE = 128
 	};
 
-//	Option m_option;
+	//	Option m_option;
 
-	cl_command_queue	m_commandQueue;
+	cl_command_queue m_commandQueue;
 
 	cl_kernel m_localScanKernel;
 	cl_kernel m_blockSumKernel;
@@ -23,15 +23,13 @@ class b3PrefixScanCL
 
 	b3OpenCLArray<unsigned int>* m_workBuffer;
 
-
-	public:
-		
-	b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
+public:
+	b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
 
 	virtual ~b3PrefixScanCL();
 
 	void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
-	void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0);
+	void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum = 0);
 };
 
-#endif //B3_PREFIX_SCAN_CL_H
+#endif  //B3_PREFIX_SCAN_CL_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp
index 80560d793d..1cac97c988 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp
@@ -7,25 +7,24 @@
 #include "kernels/PrefixScanKernelsFloat4CL.h"
 
 b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
-:m_commandQueue(queue)
+	: m_commandQueue(queue)
 {
 	const char* scanKernelSource = prefixScanKernelsFloat4CL;
 	cl_int pErrNum;
-	char* additionalMacros=0;
+	char* additionalMacros = 0;
 
-	m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size);
-	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
+	m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx, queue, size);
+	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
 	b3Assert(scanProg);
 
-	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_localScanKernel );
-	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_blockSumKernel );
-	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_propagationKernel );
+	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_localScanKernel);
+	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_blockSumKernel);
+	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_propagationKernel);
 }
 
-
 b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
 {
 	delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
 	clReleaseKernel(m_propagationKernel);
 }
 
-template<class T>
+template <class T>
 T b3NextPowerOf2(T n)
 {
 	n -= 1;
-	for(int i=0; i<sizeof(T)*8; i++)
-		n = n | (n>>i);
-	return n+1;
+	for (int i = 0; i < sizeof(T) * 8; i++)
+		n = n | (n >> i);
+	return n + 1;
 }
 
 void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum)
 {
-	
-//	b3Assert( data->m_option == EXCLUSIVE );
-	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
+	//	b3Assert( data->m_option == EXCLUSIVE );
+	const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
 
 	dst.resize(src.size());
 	m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<
 	b3Int4 constBuffer;
 	constBuffer.x = n;
 	constBuffer.y = numBlocks;
-	constBuffer.z = (int)b3NextPowerOf2( numBlocks );
+	constBuffer.z = (int)b3NextPowerOf2(numBlocks);
 
 	b3OpenCLArray<b3Vector3>* srcNative = &src;
 	b3OpenCLArray<b3Vector3>* dstNative = &dst;
-	
+
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
 
-		b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst(  constBuffer );
-		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
+		b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
 	}
 
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
 
-		b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
+		b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
 	}
-	
 
-	if( numBlocks > 1 )
+	if (numBlocks > 1)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
-		b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
+		b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
 	}
 
-
-	if( sum )
+	if (sum)
 	{
 		clFinish(m_commandQueue);
-		dstNative->copyToHostPointer(sum,1,n-1,true);
+		dstNative->copyToHostPointer(sum, 1, n - 1, true);
 	}
-
 }
 
-
 void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum)
 {
-	b3Vector3 s=b3MakeVector3(0,0,0);
+	b3Vector3 s = b3MakeVector3(0, 0, 0);
 	//if( data->m_option == EXCLUSIVE )
 	{
-		for(int i=0; i<n; i++)
+		for (int i = 0; i < n; i++)
 		{
 			dst[i] = s;
 			s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3A
 	}
 	*/
 
-	if( sum )
+	if (sum)
 	{
-		*sum = dst[n-1];
+		*sum = dst[n - 1];
 	}
 }
 \ No newline at end of file
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h
index 2c8003c1bb..122b0bfd68 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h
@@ -14,9 +14,9 @@ class b3PrefixScanFloat4CL
 		BLOCK_SIZE = 128
 	};
 
-//	Option m_option;
+	//	Option m_option;
 
-	cl_command_queue	m_commandQueue;
+	cl_command_queue m_commandQueue;
 
 	cl_kernel m_localScanKernel;
 	cl_kernel m_blockSumKernel;
@@ -24,10 +24,8 @@ class b3PrefixScanFloat4CL
 
 	b3OpenCLArray<b3Vector3>* m_workBuffer;
 
-
-	public:
-		
-	b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
+public:
+	b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
 
 	virtual ~b3PrefixScanFloat4CL();
 
@@ -35,4 +33,4 @@ class b3PrefixScanFloat4CL
 	void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum);
 };
 
-#endif //B3_PREFIX_SCAN_CL_H
+#endif  //B3_PREFIX_SCAN_CL_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
index f11ae4bcdb..e86af6583f 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
@@ -10,21 +10,20 @@
 #include "kernels/RadixSort32KernelsCL.h"
 
 b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
-:m_commandQueue(queue)
+	: m_commandQueue(queue)
 {
 	b3OpenCLDeviceInfo info;
-	b3OpenCLUtils::getDeviceInfo(device,&info);
-	m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;
+	b3OpenCLUtils::getDeviceInfo(device, &info);
+	m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
 
-	m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue);
-	m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue);
-	m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue);
-	m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue);
-	m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue);
-	m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue);
+	m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
+	m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
+	m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
+	m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
+	m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
+	m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
 
-
-	if (initialCapacity>0)
+	if (initialCapacity > 0)
 	{
 		m_workBuffer1->resize(initialCapacity);
 		m_workBuffer3->resize(initialCapacity);
@@ -33,45 +32,40 @@ b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command
 		m_workBuffer4a->resize(initialCapacity);
 	}
 
-	m_scan = new b3PrefixScanCL(ctx,device,queue);
-	m_fill = new b3FillCL(ctx,device,queue);
-	
+	m_scan = new b3PrefixScanCL(ctx, device, queue);
+	m_fill = new b3FillCL(ctx, device, queue);
+
 	const char* additionalMacros = "";
 
 	cl_int pErrNum;
 	const char* kernelSource = radixSort32KernelsCL;
-	
-	cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
-	b3Assert(sortProg);
 
-	m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
-	b3Assert(m_streamCountSortDataKernel );
+	cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
+	b3Assert(sortProg);
 
+	m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
+	b3Assert(m_streamCountSortDataKernel);
 
-	
-	m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
+	m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
 	b3Assert(m_streamCountKernel);
 
-
-	
 	if (m_deviceCPU)
 	{
-		
-		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
+		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
 		b3Assert(m_sortAndScatterSortDataKernel);
-		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
+		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
 		b3Assert(m_sortAndScatterKernel);
-	} else
+	}
+	else
 	{
-		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
+		m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
 		b3Assert(m_sortAndScatterSortDataKernel);
-		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
+		m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
 		b3Assert(m_sortAndScatterKernel);
 	}
-		
-	m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
+
+	m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
 	b3Assert(m_prefixScanKernel);
-		
 }
 
 b3RadixSort32CL::~b3RadixSort32CL()
@@ -96,8 +90,7 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s
 {
 	int n = inout.size();
 	const int BITS_PER_PASS = 8;
-	const int NUM_TABLES = (1<<BITS_PER_PASS);
-
+	const int NUM_TABLES = (1 << BITS_PER_PASS);
 
 	int tables[NUM_TABLES];
 	int counter[NUM_TABLES];
@@ -107,34 +100,33 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s
 	workbuffer.resize(inout.size());
 	b3SortData* dst = &workbuffer[0];
 
-	int count=0;
-	for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
+	int count = 0;
+	for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
 	{
-		for(int i=0; i<NUM_TABLES; i++)
+		for (int i = 0; i < NUM_TABLES; i++)
 		{
 			tables[i] = 0;
 		}
 
-		for(int i=0; i<n; i++)
+		for (int i = 0; i < n; i++)
 		{
-			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
+			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
 			tables[tableIdx]++;
 		}
 //#define TEST
 #ifdef TEST
-		printf("histogram size=%d\n",NUM_TABLES);
-		for (int i=0;i<NUM_TABLES;i++)
+		printf("histogram size=%d\n", NUM_TABLES);
+		for (int i = 0; i < NUM_TABLES; i++)
 		{
-			if (tables[i]!=0)
+			if (tables[i] != 0)
 			{
-				printf("tables[%d]=%d]\n",i,tables[i]);
+				printf("tables[%d]=%d]\n", i, tables[i]);
 			}
-
 		}
-#endif //TEST
-		//	prefix scan
+#endif  //TEST \
+	//	prefix scan
 		int sum = 0;
-		for(int i=0; i<NUM_TABLES; i++)
+		for (int i = 0; i < NUM_TABLES; i++)
 		{
 			int iData = tables[i];
 			tables[i] = sum;
@@ -143,71 +135,65 @@ void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int s
 		}
 
 		//	distribute
-		for(int i=0; i<n; i++)
+		for (int i = 0; i < n; i++)
 		{
-			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
-			
+			int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
+
 			dst[tables[tableIdx] + counter[tableIdx]] = src[i];
-			counter[tableIdx] ++;
+			counter[tableIdx]++;
 		}
 
-		b3Swap( src, dst );
+		b3Swap(src, dst);
 		count++;
 	}
 
-	if (count&1)
+	if (count & 1)
 	{
-		b3Assert(0);//need to copy 
-
+		b3Assert(0);  //need to copy
 	}
 }
 
 void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
 {
-
 	b3AlignedObjectArray<b3SortData> inout;
 	keyValuesInOut.copyToHost(inout);
 
-	executeHost(inout,sortBits);
+	executeHost(inout, sortBits);
 
 	keyValuesInOut.copyFromHost(inout);
 }
 
-void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, 
-								b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
+void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
+							  b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
 {
-
 }
 
 //#define DEBUG_RADIXSORT
 //#define DEBUG_RADIXSORT2
 
-
 void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
 {
-	
 	int originalSize = keyValuesInOut.size();
 	int workingSize = originalSize;
-	
-			
+
 	int dataAlignment = DATA_ALIGNMENT;
 
 #ifdef DEBUG_RADIXSORT2
-    b3AlignedObjectArray<b3SortData>   test2;
-    keyValuesInOut.copyToHost(test2);
-    printf("numElem = %d\n",test2.size());
-    for (int i=0;i<test2.size();i++)
-    {
-        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
-        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
-    }
-#endif //DEBUG_RADIXSORT2
-    
+	b3AlignedObjectArray<b3SortData> test2;
+	keyValuesInOut.copyToHost(test2);
+	printf("numElem = %d\n", test2.size());
+	for (int i = 0; i < test2.size(); i++)
+	{
+		printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+		printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+	}
+#endif  //DEBUG_RADIXSORT2
+
 	b3OpenCLArray<b3SortData>* src = 0;
 
-	if (workingSize%dataAlignment)
+	if (workingSize % dataAlignment)
 	{
-		workingSize += dataAlignment-(workingSize%dataAlignment);
+		workingSize += dataAlignment - (workingSize % dataAlignment);
 		m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
 		m_workBuffer4->resize(workingSize);
 		b3SortData fillValue;
@@ -216,327 +202,301 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor
 
 #define USE_BTFILL
 #ifdef USE_BTFILL
-		m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize);
+		m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
 #else
 		//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
-		
-		for (int i=originalSize; i<workingSize;i++)
+
+		for (int i = originalSize; i < workingSize; i++)
 		{
-			m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
+			m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
 		}
-#endif//USE_BTFILL
+#endif  //USE_BTFILL
 
 		src = m_workBuffer4;
-	} else
+	}
+	else
 	{
 		src = &keyValuesInOut;
 		m_workBuffer4->resize(0);
 	}
-		
-	b3Assert( workingSize%DATA_ALIGNMENT == 0 );
-	int minCap = NUM_BUCKET*NUM_WGS;
 
+	b3Assert(workingSize % DATA_ALIGNMENT == 0);
+	int minCap = NUM_BUCKET * NUM_WGS;
 
 	int n = workingSize;
 
 	m_workBuffer1->resize(minCap);
 	m_workBuffer3->resize(workingSize);
-	
 
-//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
-	b3Assert( BITS_PER_PASS == 4 );
-	b3Assert( WG_SIZE == 64 );
-	b3Assert( (sortBits&0x3) == 0 );
+	//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
+	b3Assert(BITS_PER_PASS == 4);
+	b3Assert(WG_SIZE == 64);
+	b3Assert((sortBits & 0x3) == 0);
 
-	
-	
 	b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
 
 	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
 	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
 
-
 	int nWGs = NUM_WGS;
 	b3ConstData cdata;
 
 	{
-        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
-     	int nBlocks = (n+blockSize-1)/(blockSize);
+		int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
+		int nBlocks = (n + blockSize - 1) / (blockSize);
 		cdata.m_n = n;
 		cdata.m_nWGs = NUM_WGS;
 		cdata.m_startBit = 0;
-		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
-		if( nBlocks < NUM_WGS )
+		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
+		if (nBlocks < NUM_WGS)
 		{
 			cdata.m_nBlocksPerWG = 1;
 			nWGs = nBlocks;
 		}
 	}
 
-	int count=0;
-	for(int ib=0; ib<sortBits; ib+=4)
+	int count = 0;
+	for (int ib = 0; ib < sortBits; ib += 4)
 	{
 #ifdef DEBUG_RADIXSORT2
-        keyValuesInOut.copyToHost(test2);
-        printf("numElem = %d\n",test2.size());
-        for (int i=0;i<test2.size();i++)
-        {
-            if (test2[i].m_key != test2[i].m_value)
-            {
-                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
-                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
-            }
-        }
-#endif //DEBUG_RADIXSORT2
-        
+		keyValuesInOut.copyToHost(test2);
+		printf("numElem = %d\n", test2.size());
+		for (int i = 0; i < test2.size(); i++)
+		{
+			if (test2[i].m_key != test2[i].m_value)
+			{
+				printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+				printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+			}
+		}
+#endif  //DEBUG_RADIXSORT2
+
 		cdata.m_startBit = ib;
-		
+
 		if (src->size())
 		{
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
-			b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel");
-
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(  cdata );
-			
-			int num = NUM_WGS*WG_SIZE;
-			launcher.launch1D( num, WG_SIZE );
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
+			b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
+
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(cdata);
+
+			int num = NUM_WGS * WG_SIZE;
+			launcher.launch1D(num, WG_SIZE);
 		}
 
-        
-        
 #ifdef DEBUG_RADIXSORT
 		b3AlignedObjectArray<unsigned int> testHist;
 		srcHisto->copyToHost(testHist);
-		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
-		for (int i=0;i<testHist.size();i++)
+		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
+		for (int i = 0; i < testHist.size(); i++)
 		{
-			if (testHist[i]!=0)
-				printf("testHist[%d]=%d\n",i,testHist[i]);
+			if (testHist[i] != 0)
+				printf("testHist[%d]=%d\n", i, testHist[i]);
 		}
-#endif //DEBUG_RADIXSORT
-	
-	
+#endif  //DEBUG_RADIXSORT
 
 //fast prefix scan is not working properly on Mac OSX yet
 #ifdef __APPLE__
-	bool fastScan=false;
+		bool fastScan = false;
 #else
-	bool fastScan=!m_deviceCPU;//only use fast scan on GPU
+		bool fastScan = !m_deviceCPU;  //only use fast scan on GPU
 #endif
 
 		if (fastScan)
-		{//	prefix scan group histogram
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
-			b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(  cdata );
-			launcher.launch1D( 128, 128 );
+		{  //	prefix scan group histogram
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
+			b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(cdata);
+			launcher.launch1D(128, 128);
 			destHisto = srcHisto;
-		}else
+		}
+		else
 		{
 			//unsigned int sum; //for debugging
-            m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
+			m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
 		}
 
-
 #ifdef DEBUG_RADIXSORT
 		destHisto->copyToHost(testHist);
-		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
-		for (int i=0;i<testHist.size();i++)
+		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
+		for (int i = 0; i < testHist.size(); i++)
 		{
-			if (testHist[i]!=0)
-				printf("testHist[%d]=%d\n",i,testHist[i]);
+			if (testHist[i] != 0)
+				printf("testHist[%d]=%d\n", i, testHist[i]);
 		}
-        
-        for (int i=0;i<testHist.size();i+=NUM_WGS)
+
+		for (int i = 0; i < testHist.size(); i += NUM_WGS)
 		{
-				printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
+			printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
 		}
 
-#endif //DEBUG_RADIXSORT
+#endif  //DEBUG_RADIXSORT
 
 #define USE_GPU
 #ifdef USE_GPU
-        
+
 		if (src->size())
-		{//	local sort and distribute
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
-			b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" );
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(  cdata );
-			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
-            
+		{  //	local sort and distribute
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
+			b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(cdata);
+			launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
 		}
 #else
-        {
+		{
 #define NUM_TABLES 16
 //#define SEQUENTIAL
 #ifdef SEQUENTIAL
-            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-            int tables[NUM_TABLES];
-            int startBit = ib;
-            
-            destHisto->copyToHost(testHist);
-            b3AlignedObjectArray<b3SortData> srcHost;
-            b3AlignedObjectArray<b3SortData> dstHost;
-            dstHost.resize(src->size());
-            
-            src->copyToHost(srcHost);
-            
-            for (int i=0;i<NUM_TABLES;i++)
-            {
-                tables[i] = testHist[i*NUM_WGS];
-            }
-            
-            //	distribute
-            for(int i=0; i<n; i++)
-            {
-                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-                
-                dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
-                counter2[tableIdx] ++;
-            }
-            
-            
+			int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+			int tables[NUM_TABLES];
+			int startBit = ib;
+
+			destHisto->copyToHost(testHist);
+			b3AlignedObjectArray<b3SortData> srcHost;
+			b3AlignedObjectArray<b3SortData> dstHost;
+			dstHost.resize(src->size());
+
+			src->copyToHost(srcHost);
+
+			for (int i = 0; i < NUM_TABLES; i++)
+			{
+				tables[i] = testHist[i * NUM_WGS];
+			}
+
+			//	distribute
+			for (int i = 0; i < n; i++)
+			{
+				int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
+
+				dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+				counter2[tableIdx]++;
+			}
+
 #else
-          
-            int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-            
-            int tables[NUM_TABLES];
-             b3AlignedObjectArray<b3SortData> dstHostOK;
-            dstHostOK.resize(src->size());
-
-            destHisto->copyToHost(testHist);
-            b3AlignedObjectArray<b3SortData> srcHost;
-            src->copyToHost(srcHost);
-        
-            int blockSize = 256;
-            int nBlocksPerWG = cdata.m_nBlocksPerWG;
-            int startBit = ib;
-
-            {
-                for (int i=0;i<NUM_TABLES;i++)
-                {
-                    tables[i] = testHist[i*NUM_WGS];
-                }
-                
-                //	distribute
-                for(int i=0; i<n; i++)
-                {
-                    int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-                    
-                    dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
-                    counter2[tableIdx] ++;
-                }
-
-            
-            }
-            
-            
-            b3AlignedObjectArray<b3SortData> dstHost;
-            dstHost.resize(src->size());
-            
-            
-            int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-            
-            
-            
-            for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
-            {
-              int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-              int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
-                
-              for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
-              {
-                for (int lIdx = 0;lIdx < 64;lIdx++)
-                {
-                    int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
-                    
-                    //	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
-                    //	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
-                    //	AMD: AtomInc performs better while NV prefers ++
-                    for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
-                    {
-                        if( addr+j < n )
-                        {
-                          //  printf ("addr+j=%d\n", addr+j);
-                            
-                            int i = addr+j;
-                            
-                            int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-                            
-                            int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
-                            
-                            b3SortData ok = dstHostOK[destIndex];
-                                                    
-                            if (ok.m_key != srcHost[i].m_key)
-                            {
-                                printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
-                                printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
-                            }
-                            if (ok.m_value != srcHost[i].m_value)
-                            {
-                                
-                               printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
-                                printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
-
-                            }
-                   
-                            dstHost[destIndex] = srcHost[i];
-                            counter[tableIdx] ++;
-                            
-                        }
-                    }
-                }
-              }
-            }
-            
-         
-#endif //SEQUENTIAL
-            
-            dst->copyFromHost(dstHost);
-        }
-#endif//USE_GPU
-        
-        
-        
+
+			int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+			int tables[NUM_TABLES];
+			b3AlignedObjectArray<b3SortData> dstHostOK;
+			dstHostOK.resize(src->size());
+
+			destHisto->copyToHost(testHist);
+			b3AlignedObjectArray<b3SortData> srcHost;
+			src->copyToHost(srcHost);
+
+			int blockSize = 256;
+			int nBlocksPerWG = cdata.m_nBlocksPerWG;
+			int startBit = ib;
+
+			{
+				for (int i = 0; i < NUM_TABLES; i++)
+				{
+					tables[i] = testHist[i * NUM_WGS];
+				}
+
+				//	distribute
+				for (int i = 0; i < n; i++)
+				{
+					int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
+
+					dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
+					counter2[tableIdx]++;
+				}
+			}
+
+			b3AlignedObjectArray<b3SortData> dstHost;
+			dstHost.resize(src->size());
+
+			int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+			for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
+			{
+				int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+				int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
+
+				for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
+				{
+					for (int lIdx = 0; lIdx < 64; lIdx++)
+					{
+						int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
+
+						//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
+						//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
+						//	AMD: AtomInc performs better while NV prefers ++
+						for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
+						{
+							if (addr + j < n)
+							{
+								//  printf ("addr+j=%d\n", addr+j);
+
+								int i = addr + j;
+
+								int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
+
+								int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
+
+								b3SortData ok = dstHostOK[destIndex];
+
+								if (ok.m_key != srcHost[i].m_key)
+								{
+									printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
+									printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
+								}
+								if (ok.m_value != srcHost[i].m_value)
+								{
+									printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
+									printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
+								}
+
+								dstHost[destIndex] = srcHost[i];
+								counter[tableIdx]++;
+							}
+						}
+					}
+				}
+			}
+
+#endif  //SEQUENTIAL
+
+			dst->copyFromHost(dstHost);
+		}
+#endif  //USE_GPU
+
 #ifdef DEBUG_RADIXSORT
 		destHisto->copyToHost(testHist);
-		printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
-		for (int i=0;i<testHist.size();i++)
+		printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
+		for (int i = 0; i < testHist.size(); i++)
 		{
-			if (testHist[i]!=0)
-				printf("testHist[%d]=%d\n",i,testHist[i]);
+			if (testHist[i] != 0)
+				printf("testHist[%d]=%d\n", i, testHist[i]);
 		}
-#endif //DEBUG_RADIXSORT
-		b3Swap(src, dst );
-		b3Swap(srcHisto,destHisto);
+#endif  //DEBUG_RADIXSORT
+		b3Swap(src, dst);
+		b3Swap(srcHisto, destHisto);
 
 #ifdef DEBUG_RADIXSORT2
-        keyValuesInOut.copyToHost(test2);
-        printf("numElem = %d\n",test2.size());
-        for (int i=0;i<test2.size();i++)
-        {
-            if (test2[i].m_key != test2[i].m_value)
-            {
-                printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
-                printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
-            }
-        }
-#endif //DEBUG_RADIXSORT2
-        
-        count++;
-                
-        
+		keyValuesInOut.copyToHost(test2);
+		printf("numElem = %d\n", test2.size());
+		for (int i = 0; i < test2.size(); i++)
+		{
+			if (test2[i].m_key != test2[i].m_value)
+			{
+				printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+				printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+			}
+		}
+#endif  //DEBUG_RADIXSORT2
+
+		count++;
 	}
-	
-   
-    
-	if (count&1)
+
+	if (count & 1)
 	{
-		b3Assert(0);//need to copy from workbuffer to keyValuesInOut
+		b3Assert(0);  //need to copy from workbuffer to keyValuesInOut
 	}
 
 	if (m_workBuffer4->size())
@@ -545,153 +505,137 @@ void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sor
 		keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
 	}
 
-
 #ifdef DEBUG_RADIXSORT
-    keyValuesInOut.copyToHost(test2);
-   
-    printf("numElem = %d\n",test2.size());
-    for (int i=0;i<test2.size();i++)
-    {
-        printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
-        printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
-    }
-#endif    
-	
-}
-
-
-
-
+	keyValuesInOut.copyToHost(test2);
 
+	printf("numElem = %d\n", test2.size());
+	for (int i = 0; i < test2.size(); i++)
+	{
+		printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
+		printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
+	}
+#endif
+}
 
 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
 {
 	int originalSize = keysInOut.size();
 	int workingSize = originalSize;
-	
-			
+
 	int dataAlignment = DATA_ALIGNMENT;
 
 	b3OpenCLArray<unsigned int>* src = 0;
 
-	if (workingSize%dataAlignment)
+	if (workingSize % dataAlignment)
 	{
-		workingSize += dataAlignment-(workingSize%dataAlignment);
+		workingSize += dataAlignment - (workingSize % dataAlignment);
 		m_workBuffer4a->copyFromOpenCLArray(keysInOut);
 		m_workBuffer4a->resize(workingSize);
 		unsigned int fillValue = 0xffffffff;
-		
-		m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);
+
+		m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
 
 		src = m_workBuffer4a;
-	} else
+	}
+	else
 	{
 		src = &keysInOut;
 		m_workBuffer4a->resize(0);
 	}
-	
-	
-
-	b3Assert( workingSize%DATA_ALIGNMENT == 0 );
-	int minCap = NUM_BUCKET*NUM_WGS;
 
+	b3Assert(workingSize % DATA_ALIGNMENT == 0);
+	int minCap = NUM_BUCKET * NUM_WGS;
 
 	int n = workingSize;
 
-	
 	m_workBuffer1->resize(minCap);
 	m_workBuffer3->resize(workingSize);
 	m_workBuffer3a->resize(workingSize);
 
-//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
-	b3Assert( BITS_PER_PASS == 4 );
-	b3Assert( WG_SIZE == 64 );
-	b3Assert( (sortBits&0x3) == 0 );
+	//	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
+	b3Assert(BITS_PER_PASS == 4);
+	b3Assert(WG_SIZE == 64);
+	b3Assert((sortBits & 0x3) == 0);
 
-	
-	
 	b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
 
 	b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
 	b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
 
-
 	int nWGs = NUM_WGS;
 	b3ConstData cdata;
 
 	{
-        int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
-     	int nBlocks = (n+blockSize-1)/(blockSize);
+		int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
+		int nBlocks = (n + blockSize - 1) / (blockSize);
 		cdata.m_n = n;
 		cdata.m_nWGs = NUM_WGS;
 		cdata.m_startBit = 0;
-		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
-		if( nBlocks < NUM_WGS )
+		cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
+		if (nBlocks < NUM_WGS)
 		{
 			cdata.m_nBlocksPerWG = 1;
 			nWGs = nBlocks;
 		}
 	}
 
-	int count=0;
-	for(int ib=0; ib<sortBits; ib+=4)
+	int count = 0;
+	for (int ib = 0; ib < sortBits; ib += 4)
 	{
 		cdata.m_startBit = ib;
-		
+
 		if (src->size())
 		{
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
-			b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel");
-
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(  cdata );
-			
-			int num = NUM_WGS*WG_SIZE;
-			launcher.launch1D( num, WG_SIZE );
-		}
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
+			b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
+
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(cdata);
 
-        
+			int num = NUM_WGS * WG_SIZE;
+			launcher.launch1D(num, WG_SIZE);
+		}
 
 //fast prefix scan is not working properly on Mac OSX yet
 #ifdef __APPLE__
-	bool fastScan=false;	
+		bool fastScan = false;
 #else
-	bool fastScan=!m_deviceCPU;
+		bool fastScan = !m_deviceCPU;
 #endif
 
 		if (fastScan)
-		{//	prefix scan group histogram
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
-			b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(  cdata );
-			launcher.launch1D( 128, 128 );
+		{  //	prefix scan group histogram
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
+			b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(cdata);
+			launcher.launch1D(128, 128);
 			destHisto = srcHisto;
-		}else
+		}
+		else
 		{
 			//unsigned int sum; //for debugging
-            m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
+			m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
 		}
 
 		if (src->size())
-		{//	local sort and distribute
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
-			b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel");
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(  cdata );
-			launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
-            
+		{  //	local sort and distribute
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
+			b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(cdata);
+			launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
 		}
-        
-		b3Swap(src, dst );
-		b3Swap(srcHisto,destHisto);
 
-        count++;
+		b3Swap(src, dst);
+		b3Swap(srcHisto, destHisto);
+
+		count++;
 	}
-    
-	if (count&1)
+
+	if (count & 1)
 	{
-		b3Assert(0);//need to copy from workbuffer to keyValuesInOut
+		b3Assert(0);  //need to copy from workbuffer to keyValuesInOut
 	}
 
 	if (m_workBuffer4a->size())
@@ -699,12 +643,4 @@ void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBi
 		m_workBuffer4a->resize(originalSize);
 		keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
 	}
-	
 }
-
-
-
-
-
-
-
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
index 975bd80e53..69caf182d7 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
@@ -6,90 +6,79 @@
 
 struct b3SortData
 {
-	union
-	{
+	union {
 		unsigned int m_key;
 		unsigned int x;
 	};
 
-	union
-	{
+	union {
 		unsigned int m_value;
 		unsigned int y;
-		
 	};
 };
 #include "b3BufferInfoCL.h"
 
-class  b3RadixSort32CL
+class b3RadixSort32CL
 {
+	b3OpenCLArray<unsigned int>* m_workBuffer1;
+	b3OpenCLArray<unsigned int>* m_workBuffer2;
 
-		b3OpenCLArray<unsigned int>* m_workBuffer1;
-		b3OpenCLArray<unsigned int>* m_workBuffer2;
-		
-		b3OpenCLArray<b3SortData>*	m_workBuffer3;
-		b3OpenCLArray<b3SortData>*	m_workBuffer4;
-
-		b3OpenCLArray<unsigned int>* m_workBuffer3a;
-		b3OpenCLArray<unsigned int>* m_workBuffer4a;
+	b3OpenCLArray<b3SortData>* m_workBuffer3;
+	b3OpenCLArray<b3SortData>* m_workBuffer4;
 
-		cl_command_queue	m_commandQueue;
+	b3OpenCLArray<unsigned int>* m_workBuffer3a;
+	b3OpenCLArray<unsigned int>* m_workBuffer4a;
 
-		cl_kernel m_streamCountSortDataKernel;
-		cl_kernel m_streamCountKernel;
+	cl_command_queue m_commandQueue;
 
-		cl_kernel m_prefixScanKernel;
-		cl_kernel m_sortAndScatterSortDataKernel;
-		cl_kernel m_sortAndScatterKernel;
+	cl_kernel m_streamCountSortDataKernel;
+	cl_kernel m_streamCountKernel;
 
+	cl_kernel m_prefixScanKernel;
+	cl_kernel m_sortAndScatterSortDataKernel;
+	cl_kernel m_sortAndScatterKernel;
 
-		bool	m_deviceCPU;
+	bool m_deviceCPU;
 
-		class b3PrefixScanCL* m_scan;
-		class b3FillCL*	m_fill;
+	class b3PrefixScanCL* m_scan;
+	class b3FillCL* m_fill;
 
 public:
 	struct b3ConstData
-		{
-			int m_n;
-			int m_nWGs;
-			int m_startBit;
-			int m_nBlocksPerWG;
-		};
+	{
+		int m_n;
+		int m_nWGs;
+		int m_startBit;
+		int m_nBlocksPerWG;
+	};
 	enum
-		{
-			DATA_ALIGNMENT = 256,
-			WG_SIZE = 64,
-            BLOCK_SIZE = 256,
-			ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
-			BITS_PER_PASS = 4,
-			NUM_BUCKET=(1<<BITS_PER_PASS),
-			//	if you change this, change nPerWI in kernel as well
-			NUM_WGS = 20*6,	//	cypress
-//			NUM_WGS = 24*6,	//	cayman
-//			NUM_WGS = 32*4,	//	nv
-		};
-
+	{
+		DATA_ALIGNMENT = 256,
+		WG_SIZE = 64,
+		BLOCK_SIZE = 256,
+		ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
+		BITS_PER_PASS = 4,
+		NUM_BUCKET = (1 << BITS_PER_PASS),
+		//	if you change this, change nPerWI in kernel as well
+		NUM_WGS = 20 * 6,  //	cypress
+						   //			NUM_WGS = 24*6,	//	cayman
+						   //			NUM_WGS = 32*4,	//	nv
+	};
 
 private:
-		
-
 public:
+	b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
 
-		b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
+	virtual ~b3RadixSort32CL();
 
-		virtual ~b3RadixSort32CL();
+	void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
+				 b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
 
-		void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, 
-								b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
-
-		///keys only
-		void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits  = 32 );
-
-		void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits  = 32 );
-		void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
-		void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
+	///keys only
+	void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
 
+	void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
+	void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
+	void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
 };
-#endif //B3_RADIXSORT32_H
-
+#endif  //B3_RADIXSORT32_H
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
index 9c9e847138..1758dd41e3 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
@@ -1,87 +1,86 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* boundSearchKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"typedef struct\n"
-"{\n"
-"	u32 m_key; \n"
-"	u32 m_value;\n"
-"}SortData;\n"
-"typedef struct\n"
-"{\n"
-"	u32 m_nSrc;\n"
-"	u32 m_nDst;\n"
-"	u32 m_padding[2];\n"
-"} ConstBuffer;\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"__kernel\n"
-"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
-"					unsigned int nSrc, unsigned int nDst)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < nSrc )\n"
-"	{\n"
-"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
-"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
-"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
-"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
-"		if( iData.m_key != jData.m_key )\n"
-"		{\n"
-"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
-"			u32 k = jData.m_key;\n"
-"			{\n"
-"				dst[k] = gIdx;\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"__kernel\n"
-"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
-"					unsigned int nSrc, unsigned int nDst)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX+1;\n"
-"	if( gIdx < nSrc+1 )\n"
-"	{\n"
-"		SortData first; first.m_key = 0; first.m_value = 0;\n"
-"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
-"		SortData iData = src[gIdx-1];\n"
-"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
-"		if( iData.m_key != jData.m_key )\n"
-"		{\n"
-"			u32 k = iData.m_key;\n"
-"			{\n"
-"				dst[k] = gIdx;\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"__kernel\n"
-"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
-"					unsigned int nSrc, unsigned int nDst)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	\n"
-"	if( gIdx < nDst )\n"
-"	{\n"
-"		C[gIdx] = A[gIdx] - B[gIdx];\n"
-"	}\n"
-"}\n"
-;
+static const char* boundSearchKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"typedef struct\n"
+	"{\n"
+	"	u32 m_key; \n"
+	"	u32 m_value;\n"
+	"}SortData;\n"
+	"typedef struct\n"
+	"{\n"
+	"	u32 m_nSrc;\n"
+	"	u32 m_nDst;\n"
+	"	u32 m_padding[2];\n"
+	"} ConstBuffer;\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"__kernel\n"
+	"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
+	"					unsigned int nSrc, unsigned int nDst)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < nSrc )\n"
+	"	{\n"
+	"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
+	"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+	"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
+	"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+	"		if( iData.m_key != jData.m_key )\n"
+	"		{\n"
+	"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
+	"			u32 k = jData.m_key;\n"
+	"			{\n"
+	"				dst[k] = gIdx;\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"__kernel\n"
+	"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
+	"					unsigned int nSrc, unsigned int nDst)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX+1;\n"
+	"	if( gIdx < nSrc+1 )\n"
+	"	{\n"
+	"		SortData first; first.m_key = 0; first.m_value = 0;\n"
+	"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+	"		SortData iData = src[gIdx-1];\n"
+	"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+	"		if( iData.m_key != jData.m_key )\n"
+	"		{\n"
+	"			u32 k = iData.m_key;\n"
+	"			{\n"
+	"				dst[k] = gIdx;\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"__kernel\n"
+	"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
+	"					unsigned int nSrc, unsigned int nDst)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	\n"
+	"	if( gIdx < nDst )\n"
+	"	{\n"
+	"		C[gIdx] = A[gIdx] - B[gIdx];\n"
+	"	}\n"
+	"}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h
index e5670e3cd3..33c9279462 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h
@@ -1,132 +1,131 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* copyKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"\n"
-"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
-"\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
-"#define AtomInc(x) atom_inc(&(x))\n"
-"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
-"\n"
-"#define make_uint4 (uint4)\n"
-"#define make_uint2 (uint2)\n"
-"#define make_int2 (int2)\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	int m_n;\n"
-"	int m_padding[3];\n"
-"} ConstBuffer;\n"
-"\n"
-"\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( gIdx < cb.m_n )\n"
-"	{\n"
-"		float4 a0 = src[gIdx];\n"
-"\n"
-"		dst[ gIdx ] = a0;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( 2*gIdx <= cb.m_n )\n"
-"	{\n"
-"		float4 a0 = src[gIdx*2+0];\n"
-"		float4 a1 = src[gIdx*2+1];\n"
-"\n"
-"		dst[ gIdx*2+0 ] = a0;\n"
-"		dst[ gIdx*2+1 ] = a1;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( 4*gIdx <= cb.m_n )\n"
-"	{\n"
-"		int idx0 = gIdx*4+0;\n"
-"		int idx1 = gIdx*4+1;\n"
-"		int idx2 = gIdx*4+2;\n"
-"		int idx3 = gIdx*4+3;\n"
-"\n"
-"		float4 a0 = src[idx0];\n"
-"		float4 a1 = src[idx1];\n"
-"		float4 a2 = src[idx2];\n"
-"		float4 a3 = src[idx3];\n"
-"\n"
-"		dst[ idx0 ] = a0;\n"
-"		dst[ idx1 ] = a1;\n"
-"		dst[ idx2 ] = a2;\n"
-"		dst[ idx3 ] = a3;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( gIdx < cb.m_n )\n"
-"	{\n"
-"		float a0 = srcF1[gIdx];\n"
-"\n"
-"		dstF1[ gIdx ] = a0;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( gIdx < cb.m_n )\n"
-"	{\n"
-"		float2 a0 = srcF2[gIdx];\n"
-"\n"
-"		dstF2[ gIdx ] = a0;\n"
-"	}\n"
-"}\n"
-"\n"
-"\n"
-;
+static const char* copyKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"\n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"\n"
+	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+	"\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+	"#define AtomInc(x) atom_inc(&(x))\n"
+	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+	"\n"
+	"#define make_uint4 (uint4)\n"
+	"#define make_uint2 (uint2)\n"
+	"#define make_int2 (int2)\n"
+	"\n"
+	"typedef struct\n"
+	"{\n"
+	"	int m_n;\n"
+	"	int m_padding[3];\n"
+	"} ConstBuffer;\n"
+	"\n"
+	"\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( gIdx < cb.m_n )\n"
+	"	{\n"
+	"		float4 a0 = src[gIdx];\n"
+	"\n"
+	"		dst[ gIdx ] = a0;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( 2*gIdx <= cb.m_n )\n"
+	"	{\n"
+	"		float4 a0 = src[gIdx*2+0];\n"
+	"		float4 a1 = src[gIdx*2+1];\n"
+	"\n"
+	"		dst[ gIdx*2+0 ] = a0;\n"
+	"		dst[ gIdx*2+1 ] = a1;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( 4*gIdx <= cb.m_n )\n"
+	"	{\n"
+	"		int idx0 = gIdx*4+0;\n"
+	"		int idx1 = gIdx*4+1;\n"
+	"		int idx2 = gIdx*4+2;\n"
+	"		int idx3 = gIdx*4+3;\n"
+	"\n"
+	"		float4 a0 = src[idx0];\n"
+	"		float4 a1 = src[idx1];\n"
+	"		float4 a2 = src[idx2];\n"
+	"		float4 a3 = src[idx3];\n"
+	"\n"
+	"		dst[ idx0 ] = a0;\n"
+	"		dst[ idx1 ] = a1;\n"
+	"		dst[ idx2 ] = a2;\n"
+	"		dst[ idx3 ] = a3;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( gIdx < cb.m_n )\n"
+	"	{\n"
+	"		float a0 = srcF1[gIdx];\n"
+	"\n"
+	"		dstF1[ gIdx ] = a0;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( gIdx < cb.m_n )\n"
+	"	{\n"
+	"		float2 a0 = srcF2[gIdx];\n"
+	"\n"
+	"		dstF2[ gIdx ] = a0;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
index 4f8b96e489..983e652270 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
@@ -1,91 +1,90 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* fillKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
-"#define AtomInc(x) atom_inc(&(x))\n"
-"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
-"#define make_uint4 (uint4)\n"
-"#define make_uint2 (uint2)\n"
-"#define make_int2 (int2)\n"
-"typedef struct\n"
-"{\n"
-"	union\n"
-"	{\n"
-"		int4 m_data;\n"
-"		uint4 m_unsignedData;\n"
-"		float	m_floatData;\n"
-"	};\n"
-"	int m_offset;\n"
-"	int m_n;\n"
-"	int m_padding[2];\n"
-"} ConstBuffer;\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num_elements )\n"
-"	{\n"
-"		dstInt[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num_elements )\n"
-"	{\n"
-"		dstFloat[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num )\n"
-"	{\n"
-"		dstInt[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num )\n"
-"	{\n"
-"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num )\n"
-"	{\n"
-"		dstInt4[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-;
+static const char* fillKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+	"#define AtomInc(x) atom_inc(&(x))\n"
+	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+	"#define make_uint4 (uint4)\n"
+	"#define make_uint2 (uint2)\n"
+	"#define make_int2 (int2)\n"
+	"typedef struct\n"
+	"{\n"
+	"	union\n"
+	"	{\n"
+	"		int4 m_data;\n"
+	"		uint4 m_unsignedData;\n"
+	"		float	m_floatData;\n"
+	"	};\n"
+	"	int m_offset;\n"
+	"	int m_n;\n"
+	"	int m_padding[2];\n"
+	"} ConstBuffer;\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num_elements )\n"
+	"	{\n"
+	"		dstInt[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num_elements )\n"
+	"	{\n"
+	"		dstFloat[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num )\n"
+	"	{\n"
+	"		dstInt[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num )\n"
+	"	{\n"
+	"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num )\n"
+	"	{\n"
+	"		dstInt4[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
index 27baab8331..fc5e7b865c 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
@@ -1,129 +1,128 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* prefixScanKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"// takahiro end\n"
-"#define WG_SIZE 128 \n"
-"#define m_numElems x\n"
-"#define m_numBlocks y\n"
-"#define m_numScanBlocks z\n"
-"/*typedef struct\n"
-"{\n"
-"	uint m_numElems;\n"
-"	uint m_numBlocks;\n"
-"	uint m_numScanBlocks;\n"
-"	uint m_padding[1];\n"
-"} ConstBuffer;\n"
-"*/\n"
-"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
-"{\n"
-"	u32 blocksum;\n"
-"    int offset = 1;\n"
-"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            data[bi] += data[ai];\n"
-"        }\n"
-"	}\n"
-"    GROUP_LDS_BARRIER;\n"
-"    if( lIdx == 0 )\n"
-"	{\n"
-"		blocksum = data[ n-1 ];\n"
-"        data[ n-1 ] = 0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	offset >>= 1;\n"
-"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            u32 temp = data[ai];\n"
-"            data[ai] = data[bi];\n"
-"            data[bi] += temp;\n"
-"        }\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	return blocksum;\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
-"		uint4 cb)\n"
-"{\n"
-"	__local u32 ldsData[WG_SIZE*2];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
-"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
-"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
-"	if( (2*gIdx) < cb.m_numElems )\n"
-"    {\n"
-"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
-"	}\n"
-"	if( (2*gIdx + 1) < cb.m_numElems )\n"
-"	{\n"
-"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
-"    }\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
-"{\n"
-"	const u32 blockSize = WG_SIZE*2;\n"
-"	int myIdx = GET_GROUP_IDX+1;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	u32 iBlockSum = blockSum[myIdx];\n"
-"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
-"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
-"	{\n"
-"		dst[i] += iBlockSum;\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
-"{\n"
-"	__local u32 ldsData[2048];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	int lSize = GET_GROUP_SIZE;\n"
-"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
-"	{\n"
-"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
-"	{\n"
-"		dst[i] = ldsData[i];\n"
-"	}\n"
-"	if( gIdx == 0 )\n"
-"	{\n"
-"		dst[cb.m_numBlocks] = sum;\n"
-"	}\n"
-"}\n"
-;
+static const char* prefixScanKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"// takahiro end\n"
+	"#define WG_SIZE 128 \n"
+	"#define m_numElems x\n"
+	"#define m_numBlocks y\n"
+	"#define m_numScanBlocks z\n"
+	"/*typedef struct\n"
+	"{\n"
+	"	uint m_numElems;\n"
+	"	uint m_numBlocks;\n"
+	"	uint m_numScanBlocks;\n"
+	"	uint m_padding[1];\n"
+	"} ConstBuffer;\n"
+	"*/\n"
+	"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
+	"{\n"
+	"	u32 blocksum;\n"
+	"    int offset = 1;\n"
+	"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            data[bi] += data[ai];\n"
+	"        }\n"
+	"	}\n"
+	"    GROUP_LDS_BARRIER;\n"
+	"    if( lIdx == 0 )\n"
+	"	{\n"
+	"		blocksum = data[ n-1 ];\n"
+	"        data[ n-1 ] = 0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	offset >>= 1;\n"
+	"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            u32 temp = data[ai];\n"
+	"            data[ai] = data[bi];\n"
+	"            data[bi] += temp;\n"
+	"        }\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	return blocksum;\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
+	"		uint4 cb)\n"
+	"{\n"
+	"	__local u32 ldsData[WG_SIZE*2];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
+	"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
+	"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
+	"	if( (2*gIdx) < cb.m_numElems )\n"
+	"    {\n"
+	"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
+	"	}\n"
+	"	if( (2*gIdx + 1) < cb.m_numElems )\n"
+	"	{\n"
+	"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
+	"    }\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
+	"{\n"
+	"	const u32 blockSize = WG_SIZE*2;\n"
+	"	int myIdx = GET_GROUP_IDX+1;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	u32 iBlockSum = blockSum[myIdx];\n"
+	"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
+	"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
+	"	{\n"
+	"		dst[i] += iBlockSum;\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
+	"{\n"
+	"	__local u32 ldsData[2048];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	int lSize = GET_GROUP_SIZE;\n"
+	"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
+	"	{\n"
+	"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
+	"	{\n"
+	"		dst[i] = ldsData[i];\n"
+	"	}\n"
+	"	if( gIdx == 0 )\n"
+	"	{\n"
+	"		dst[cb.m_numBlocks] = sum;\n"
+	"	}\n"
+	"}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
index 5b13254796..15d1bc5195 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
@@ -1,129 +1,128 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* prefixScanKernelsFloat4CL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"// takahiro end\n"
-"#define WG_SIZE 128 \n"
-"#define m_numElems x\n"
-"#define m_numBlocks y\n"
-"#define m_numScanBlocks z\n"
-"/*typedef struct\n"
-"{\n"
-"	uint m_numElems;\n"
-"	uint m_numBlocks;\n"
-"	uint m_numScanBlocks;\n"
-"	uint m_padding[1];\n"
-"} ConstBuffer;\n"
-"*/\n"
-"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
-"{\n"
-"	float4 blocksum;\n"
-"    int offset = 1;\n"
-"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            data[bi] += data[ai];\n"
-"        }\n"
-"	}\n"
-"    GROUP_LDS_BARRIER;\n"
-"    if( lIdx == 0 )\n"
-"	{\n"
-"		blocksum = data[ n-1 ];\n"
-"    data[ n-1 ] = 0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	offset >>= 1;\n"
-"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            float4 temp = data[ai];\n"
-"            data[ai] = data[bi];\n"
-"            data[bi] += temp;\n"
-"        }\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	return blocksum;\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer,	uint4 cb)\n"
-"{\n"
-"	__local float4 ldsData[WG_SIZE*2];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
-"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
-"	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	if( lIdx == 0 ) \n"
-"		sumBuffer[GET_GROUP_IDX] = sum;\n"
-"	if( (2*gIdx) < cb.m_numElems )\n"
-"    {\n"
-"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
-"	}\n"
-"	if( (2*gIdx + 1) < cb.m_numElems )\n"
-"	{\n"
-"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
-"    }\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
-"{\n"
-"	const u32 blockSize = WG_SIZE*2;\n"
-"	int myIdx = GET_GROUP_IDX+1;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	float4 iBlockSum = blockSum[myIdx];\n"
-"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
-"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
-"	{\n"
-"		dst[i] += iBlockSum;\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
-"{\n"
-"	__local float4 ldsData[2048];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	int lSize = GET_GROUP_SIZE;\n"
-"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
-"	{\n"
-"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
-"	{\n"
-"		dst[i] = ldsData[i];\n"
-"	}\n"
-"	if( gIdx == 0 )\n"
-"	{\n"
-"		dst[cb.m_numBlocks] = sum;\n"
-"	}\n"
-"}\n"
-;
+static const char* prefixScanKernelsFloat4CL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"// takahiro end\n"
+	"#define WG_SIZE 128 \n"
+	"#define m_numElems x\n"
+	"#define m_numBlocks y\n"
+	"#define m_numScanBlocks z\n"
+	"/*typedef struct\n"
+	"{\n"
+	"	uint m_numElems;\n"
+	"	uint m_numBlocks;\n"
+	"	uint m_numScanBlocks;\n"
+	"	uint m_padding[1];\n"
+	"} ConstBuffer;\n"
+	"*/\n"
+	"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
+	"{\n"
+	"	float4 blocksum;\n"
+	"    int offset = 1;\n"
+	"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            data[bi] += data[ai];\n"
+	"        }\n"
+	"	}\n"
+	"    GROUP_LDS_BARRIER;\n"
+	"    if( lIdx == 0 )\n"
+	"	{\n"
+	"		blocksum = data[ n-1 ];\n"
+	"    data[ n-1 ] = 0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	offset >>= 1;\n"
+	"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            float4 temp = data[ai];\n"
+	"            data[ai] = data[bi];\n"
+	"            data[bi] += temp;\n"
+	"        }\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	return blocksum;\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer,	uint4 cb)\n"
+	"{\n"
+	"	__local float4 ldsData[WG_SIZE*2];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
+	"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
+	"	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	if( lIdx == 0 ) \n"
+	"		sumBuffer[GET_GROUP_IDX] = sum;\n"
+	"	if( (2*gIdx) < cb.m_numElems )\n"
+	"    {\n"
+	"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
+	"	}\n"
+	"	if( (2*gIdx + 1) < cb.m_numElems )\n"
+	"	{\n"
+	"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
+	"    }\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
+	"{\n"
+	"	const u32 blockSize = WG_SIZE*2;\n"
+	"	int myIdx = GET_GROUP_IDX+1;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	float4 iBlockSum = blockSum[myIdx];\n"
+	"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
+	"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
+	"	{\n"
+	"		dst[i] += iBlockSum;\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
+	"{\n"
+	"	__local float4 ldsData[2048];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	int lSize = GET_GROUP_SIZE;\n"
+	"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
+	"	{\n"
+	"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
+	"	{\n"
+	"		dst[i] = ldsData[i];\n"
+	"	}\n"
+	"	if( gIdx == 0 )\n"
+	"	{\n"
+	"		dst[cb.m_numBlocks] = sum;\n"
+	"	}\n"
+	"}\n";
diff --git a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
index 8876c16aa6..fb4bdda303 100644
--- a/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
+++ b/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
@@ -1,910 +1,909 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* radixSort32KernelsCL= \
-"/*\n"
-"Bullet Continuous Collision Detection and Physics Library\n"
-"Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org\n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Author Takahiro Harada\n"
-"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
-"#define AtomInc(x) atom_inc(&(x))\n"
-"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
-"#define AtomAdd(x, value) atom_add(&(x), value)\n"
-"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"#define make_uint4 (uint4)\n"
-"#define make_uint2 (uint2)\n"
-"#define make_int2 (int2)\n"
-"#define WG_SIZE 64\n"
-"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n"
-"#define BITS_PER_PASS 4\n"
-"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
-"typedef uchar u8;\n"
-"//	this isn't optimization for VLIW. But just reducing writes. \n"
-"#define USE_2LEVEL_REDUCE 1\n"
-"//#define CHECK_BOUNDARY 1\n"
-"//#define NV_GPU 1\n"
-"//	Cypress\n"
-"#define nPerWI 16\n"
-"//	Cayman\n"
-"//#define nPerWI 20\n"
-"#define m_n x\n"
-"#define m_nWGs y\n"
-"#define m_startBit z\n"
-"#define m_nBlocksPerWG w\n"
-"/*\n"
-"typedef struct\n"
-"{\n"
-"	int m_n;\n"
-"	int m_nWGs;\n"
-"	int m_startBit;\n"
-"	int m_nBlocksPerWG;\n"
-"} ConstBuffer;\n"
-"*/\n"
-"typedef struct\n"
-"{\n"
-"	unsigned int m_key;\n"
-"	unsigned int m_value;\n"
-"} SortDataCL;\n"
-"uint prefixScanVectorEx( uint4* data )\n"
-"{\n"
-"	u32 sum = 0;\n"
-"	u32 tmp = data[0].x;\n"
-"	data[0].x = sum;\n"
-"	sum += tmp;\n"
-"	tmp = data[0].y;\n"
-"	data[0].y = sum;\n"
-"	sum += tmp;\n"
-"	tmp = data[0].z;\n"
-"	data[0].z = sum;\n"
-"	sum += tmp;\n"
-"	tmp = data[0].w;\n"
-"	data[0].w = sum;\n"
-"	sum += tmp;\n"
-"	return sum;\n"
-"}\n"
-"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n"
-"{\n"
-"	{	//	Set data\n"
-"		sorterSharedMemory[lIdx] = 0;\n"
-"		sorterSharedMemory[lIdx+wgSize] = pData;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	{	//	Prefix sum\n"
-"		int idx = 2*lIdx + (wgSize+1);\n"
-"#if defined(USE_2LEVEL_REDUCE)\n"
-"		if( lIdx < 64 )\n"
-"		{\n"
-"			u32 u0, u1, u2;\n"
-"			u0 = sorterSharedMemory[idx-3];\n"
-"			u1 = sorterSharedMemory[idx-2];\n"
-"			u2 = sorterSharedMemory[idx-1];\n"
-"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
-"			GROUP_MEM_FENCE;\n"
-"			u0 = sorterSharedMemory[idx-12];\n"
-"			u1 = sorterSharedMemory[idx-8];\n"
-"			u2 = sorterSharedMemory[idx-4];\n"
-"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
-"			GROUP_MEM_FENCE;\n"
-"			u0 = sorterSharedMemory[idx-48];\n"
-"			u1 = sorterSharedMemory[idx-32];\n"
-"			u2 = sorterSharedMemory[idx-16];\n"
-"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
-"			GROUP_MEM_FENCE;\n"
-"			if( wgSize > 64 )\n"
-"			{\n"
-"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
-"				GROUP_MEM_FENCE;\n"
-"			}\n"
-"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
-"			GROUP_MEM_FENCE;\n"
-"		}\n"
-"#else\n"
-"		if( lIdx < 64 )\n"
-"		{\n"
-"			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
-"			GROUP_MEM_FENCE;\n"
-"			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];			\n"
-"			GROUP_MEM_FENCE;\n"
-"			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
-"			GROUP_MEM_FENCE;\n"
-"			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
-"			GROUP_MEM_FENCE;\n"
-"			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
-"			GROUP_MEM_FENCE;\n"
-"			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n"
-"			GROUP_MEM_FENCE;\n"
-"			if( wgSize > 64 )\n"
-"			{\n"
-"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
-"				GROUP_MEM_FENCE;\n"
-"			}\n"
-"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
-"			GROUP_MEM_FENCE;\n"
-"		}\n"
-"#endif\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	*totalSum = sorterSharedMemory[wgSize*2-1];\n"
-"	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n"
-"	return addValue;\n"
-"}\n"
-"//__attribute__((reqd_work_group_size(128,1,1)))\n"
-"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
-"{\n"
-"	u32 s4 = prefixScanVectorEx( &pData );\n"
-"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n"
-"	return pData + make_uint4( rank, rank, rank, rank );\n"
-"}\n"
-"//__attribute__((reqd_work_group_size(64,1,1)))\n"
-"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
-"{\n"
-"	u32 s4 = prefixScanVectorEx( &pData );\n"
-"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n"
-"	return pData + make_uint4( rank, rank, rank, rank );\n"
-"}\n"
-"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n"
-"u32 bit8Scan(u32 v)\n"
-"{\n"
-"	return (v<<8) + (v<<16) + (v<<24);\n"
-"}\n"
-"//===\n"
-"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n"
-"{\n"
-"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
-"	u32 gIdx = GET_GLOBAL_IDX;\n"
-"	u32 lIdx = GET_LOCAL_IDX;\n"
-"	u32 wgIdx = GET_GROUP_IDX;\n"
-"	u32 wgSize = GET_GROUP_SIZE;\n"
-"	const int startBit = cb.m_startBit;\n"
-"	const int n = cb.m_n;\n"
-"	const int nWGs = cb.m_nWGs;\n"
-"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"	for(int i=0; i<NUM_BUCKET; i++)\n"
-"	{\n"
-"		MY_HISTOGRAM(i) = 0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"	u32 localKey;\n"
-"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
-"	{\n"
-"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
-"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
-"		//	AMD: AtomInc performs better while NV prefers ++\n"
-"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"		{\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"			if( addr+i < n )\n"
-"#endif\n"
-"			{\n"
-"				localKey = (gSrc[addr+i]>>startBit) & 0xf;\n"
-"#if defined(NV_GPU)\n"
-"				MY_HISTOGRAM( localKey )++;\n"
-"#else\n"
-"				AtomInc( MY_HISTOGRAM( localKey ) );\n"
-"#endif\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	\n"
-"	if( lIdx < NUM_BUCKET )\n"
-"	{\n"
-"		u32 sum = 0;\n"
-"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
-"		{\n"
-"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
-"		}\n"
-"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )\n"
-"{\n"
-"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
-"	u32 gIdx = GET_GLOBAL_IDX;\n"
-"	u32 lIdx = GET_LOCAL_IDX;\n"
-"	u32 wgIdx = GET_GROUP_IDX;\n"
-"	u32 wgSize = GET_GROUP_SIZE;\n"
-"	const int startBit = cb.m_startBit;\n"
-"	const int n = cb.m_n;\n"
-"	const int nWGs = cb.m_nWGs;\n"
-"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"	for(int i=0; i<NUM_BUCKET; i++)\n"
-"	{\n"
-"		MY_HISTOGRAM(i) = 0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"	u32 localKey;\n"
-"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
-"	{\n"
-"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
-"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
-"		//	AMD: AtomInc performs better while NV prefers ++\n"
-"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"		{\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"			if( addr+i < n )\n"
-"#endif\n"
-"			{\n"
-"				localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n"
-"#if defined(NV_GPU)\n"
-"				MY_HISTOGRAM( localKey )++;\n"
-"#else\n"
-"				AtomInc( MY_HISTOGRAM( localKey ) );\n"
-"#endif\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	\n"
-"	if( lIdx < NUM_BUCKET )\n"
-"	{\n"
-"		u32 sum = 0;\n"
-"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
-"		{\n"
-"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
-"		}\n"
-"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
-"	}\n"
-"}\n"
-"#define nPerLane (nPerWI/4)\n"
-"//	NUM_BUCKET*nWGs < 128*nPerWI\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(128,1,1)))\n"
-"void PrefixScanKernel( __global u32* wHistogram1, int4  cb )\n"
-"{\n"
-"	__local u32 ldsTopScanData[128*2];\n"
-"	u32 lIdx = GET_LOCAL_IDX;\n"
-"	u32 wgIdx = GET_GROUP_IDX;\n"
-"	const int nWGs = cb.m_nWGs;\n"
-"	u32 data[nPerWI];\n"
-"	for(int i=0; i<nPerWI; i++)\n"
-"	{\n"
-"		data[i] = 0;\n"
-"		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n"
-"			data[i] = wHistogram1[nPerWI*lIdx+i];\n"
-"	}\n"
-"	uint4 myData = make_uint4(0,0,0,0);\n"
-"	for(int i=0; i<nPerLane; i++)\n"
-"	{\n"
-"		myData.x += data[nPerLane*0+i];\n"
-"		myData.y += data[nPerLane*1+i];\n"
-"		myData.z += data[nPerLane*2+i];\n"
-"		myData.w += data[nPerLane*3+i];\n"
-"	}\n"
-"	uint totalSum;\n"
-"	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n"
-"//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches\n"
-"	{	int j = 0;\n"
-"		u32 sum = 0;\n"
-"		for(int i=0; i<nPerLane; i++)\n"
-"		{\n"
-"			u32 tmp = data[nPerLane*j+i];\n"
-"			data[nPerLane*j+i] = sum;\n"
-"			sum += tmp;\n"
-"		}\n"
-"	}\n"
-"	{	int j = 1;\n"
-"		u32 sum = 0;\n"
-"		for(int i=0; i<nPerLane; i++)\n"
-"		{\n"
-"			u32 tmp = data[nPerLane*j+i];\n"
-"			data[nPerLane*j+i] = sum;\n"
-"			sum += tmp;\n"
-"		}\n"
-"	}\n"
-"	{	int j = 2;\n"
-"		u32 sum = 0;\n"
-"		for(int i=0; i<nPerLane; i++)\n"
-"		{\n"
-"			u32 tmp = data[nPerLane*j+i];\n"
-"			data[nPerLane*j+i] = sum;\n"
-"			sum += tmp;\n"
-"		}\n"
-"	}\n"
-"	{	int j = 3;\n"
-"		u32 sum = 0;\n"
-"		for(int i=0; i<nPerLane; i++)\n"
-"		{\n"
-"			u32 tmp = data[nPerLane*j+i];\n"
-"			data[nPerLane*j+i] = sum;\n"
-"			sum += tmp;\n"
-"		}\n"
-"	}\n"
-"	for(int i=0; i<nPerLane; i++)\n"
-"	{\n"
-"		data[nPerLane*0+i] += scanned.x;\n"
-"		data[nPerLane*1+i] += scanned.y;\n"
-"		data[nPerLane*2+i] += scanned.z;\n"
-"		data[nPerLane*3+i] += scanned.w;\n"
-"	}\n"
-"	for(int i=0; i<nPerWI; i++)\n"
-"	{\n"
-"		int index = nPerWI*lIdx+i;\n"
-"		if (index < NUM_BUCKET*nWGs)\n"
-"			wHistogram1[nPerWI*lIdx+i] = data[i];\n"
-"	}\n"
-"}\n"
-"//	4 scan, 4 exchange\n"
-"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
-"{\n"
-"	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n"
-"	{\n"
-"		u32 mask = (1<<bitIdx);\n"
-"		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n"
-"		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
-"		u32 total;\n"
-"		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n"
-"		{\n"
-"			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
-"			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
-"			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
-"			GROUP_LDS_BARRIER;\n"
-"			ldsSortData[dstAddr.x] = sortData[0];\n"
-"			ldsSortData[dstAddr.y] = sortData[1];\n"
-"			ldsSortData[dstAddr.z] = sortData[2];\n"
-"			ldsSortData[dstAddr.w] = sortData[3];\n"
-"			GROUP_LDS_BARRIER;\n"
-"			sortData[0] = ldsSortData[localAddr.x];\n"
-"			sortData[1] = ldsSortData[localAddr.y];\n"
-"			sortData[2] = ldsSortData[localAddr.z];\n"
-"			sortData[3] = ldsSortData[localAddr.w];\n"
-"			GROUP_LDS_BARRIER;\n"
-"		}\n"
-"	}\n"
-"}\n"
-"//	2 scan, 2 exchange\n"
-"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
-"{\n"
-"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
-"	{\n"
-"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
-"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
-"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
-"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
-"		u32 key4;\n"
-"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
-"		{\n"
-"			sKeyPacked[0] |= 1<<(8*b.x);\n"
-"			sKeyPacked[1] |= 1<<(8*b.y);\n"
-"			sKeyPacked[2] |= 1<<(8*b.z);\n"
-"			sKeyPacked[3] |= 1<<(8*b.w);\n"
-"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
-"		}\n"
-"		u32 rankPacked;\n"
-"		u32 sumPacked;\n"
-"		{\n"
-"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"		u32 newOffset[4] = { 0,0,0,0 };\n"
-"		{\n"
-"			u32 sumScanned = bit8Scan( sumPacked );\n"
-"			u32 scannedKeys[4];\n"
-"			scannedKeys[0] = 1<<(8*b.x);\n"
-"			scannedKeys[1] = 1<<(8*b.y);\n"
-"			scannedKeys[2] = 1<<(8*b.z);\n"
-"			scannedKeys[3] = 1<<(8*b.w);\n"
-"			{	//	4 scans at once\n"
-"				u32 sum4 = 0;\n"
-"				for(int ie=0; ie<4; ie++)\n"
-"				{\n"
-"					u32 tmp = scannedKeys[ie];\n"
-"					scannedKeys[ie] = sum4;\n"
-"					sum4 += tmp;\n"
-"				}\n"
-"			}\n"
-"			{\n"
-"				u32 sumPlusRank = sumScanned + rankPacked;\n"
-"				{	u32 ie = b.x;\n"
-"					scannedKeys[0] += sumPlusRank;\n"
-"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
-"				}\n"
-"				{	u32 ie = b.y;\n"
-"					scannedKeys[1] += sumPlusRank;\n"
-"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
-"				}\n"
-"				{	u32 ie = b.z;\n"
-"					scannedKeys[2] += sumPlusRank;\n"
-"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
-"				}\n"
-"				{	u32 ie = b.w;\n"
-"					scannedKeys[3] += sumPlusRank;\n"
-"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"		{\n"
-"			ldsSortData[newOffset[0]] = sortData[0];\n"
-"			ldsSortData[newOffset[1]] = sortData[1];\n"
-"			ldsSortData[newOffset[2]] = sortData[2];\n"
-"			ldsSortData[newOffset[3]] = sortData[3];\n"
-"			GROUP_LDS_BARRIER;\n"
-"			u32 dstAddr = 4*lIdx;\n"
-"			sortData[0] = ldsSortData[dstAddr+0];\n"
-"			sortData[1] = ldsSortData[dstAddr+1];\n"
-"			sortData[2] = ldsSortData[dstAddr+2];\n"
-"			sortData[3] = ldsSortData[dstAddr+3];\n"
-"			GROUP_LDS_BARRIER;\n"
-"		}\n"
-"	}\n"
-"}\n"
-"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
-"{\n"
-"	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
-"	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
-"	__local u32 localHistogram[NUM_BUCKET*2];\n"
-"	u32 gIdx = GET_GLOBAL_IDX;\n"
-"	u32 lIdx = GET_LOCAL_IDX;\n"
-"	u32 wgIdx = GET_GROUP_IDX;\n"
-"	u32 wgSize = GET_GROUP_SIZE;\n"
-"	const int n = cb.m_n;\n"
-"	const int nWGs = cb.m_nWGs;\n"
-"	const int startBit = cb.m_startBit;\n"
-"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"	if( lIdx < (NUM_BUCKET) )\n"
-"	{\n"
-"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
-"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
-"	{\n"
-"		u32 myHistogram = 0;\n"
-"		u32 sortData[ELEMENTS_PER_WORK_ITEM];\n"
-"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n"
-"#else\n"
-"			sortData[i] = gSrc[ addr+i ];\n"
-"#endif\n"
-"		sort4Bits(sortData, startBit, lIdx, ldsSortData);\n"
-"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
-"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
-"		{	//	create histogram\n"
-"			u32 setIdx = lIdx/16;\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				localHistogram[lIdx] = 0;\n"
-"			}\n"
-"			ldsSortData[lIdx] = 0;\n"
-"			GROUP_LDS_BARRIER;\n"
-"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"				if( addr+i < n )\n"
-"#endif\n"
-"#if defined(NV_GPU)\n"
-"				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
-"#else\n"
-"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
-"#endif\n"
-"			\n"
-"			GROUP_LDS_BARRIER;\n"
-"			\n"
-"			uint hIdx = NUM_BUCKET+lIdx;\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				u32 sum = 0;\n"
-"				for(int i=0; i<WG_SIZE/16; i++)\n"
-"				{\n"
-"					sum += SET_HISTOGRAM( i, lIdx );\n"
-"				}\n"
-"				myHistogram = sum;\n"
-"				localHistogram[hIdx] = sum;\n"
-"			}\n"
-"			GROUP_LDS_BARRIER;\n"
-"#if defined(USE_2LEVEL_REDUCE)\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
-"				GROUP_MEM_FENCE;\n"
-"				u32 u0, u1, u2;\n"
-"				u0 = localHistogram[hIdx-3];\n"
-"				u1 = localHistogram[hIdx-2];\n"
-"				u2 = localHistogram[hIdx-1];\n"
-"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
-"				GROUP_MEM_FENCE;\n"
-"				u0 = localHistogram[hIdx-12];\n"
-"				u1 = localHistogram[hIdx-8];\n"
-"				u2 = localHistogram[hIdx-4];\n"
-"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
-"				GROUP_MEM_FENCE;\n"
-"			}\n"
-"#else\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
-"				GROUP_MEM_FENCE;\n"
-"			}\n"
-"#endif\n"
-"			GROUP_LDS_BARRIER;\n"
-"		}\n"
-"		{\n"
-"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
-"			{\n"
-"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
-"				int binIdx = keys[ie];\n"
-"				int groupOffset = localHistogramToCarry[binIdx];\n"
-"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"				if( addr+ie < n )\n"
-"#endif\n"
-"				gDst[ groupOffset + myIdx ] = sortData[ie];\n"
-"			}\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"		if( lIdx < NUM_BUCKET )\n"
-"		{\n"
-"			localHistogramToCarry[lIdx] += myHistogram;\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"	}\n"
-"}\n"
-"//	2 scan, 2 exchange\n"
-"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n"
-"{\n"
-"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
-"	{\n"
-"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
-"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
-"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
-"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
-"		u32 key4;\n"
-"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
-"		{\n"
-"			sKeyPacked[0] |= 1<<(8*b.x);\n"
-"			sKeyPacked[1] |= 1<<(8*b.y);\n"
-"			sKeyPacked[2] |= 1<<(8*b.z);\n"
-"			sKeyPacked[3] |= 1<<(8*b.w);\n"
-"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
-"		}\n"
-"		u32 rankPacked;\n"
-"		u32 sumPacked;\n"
-"		{\n"
-"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"		u32 newOffset[4] = { 0,0,0,0 };\n"
-"		{\n"
-"			u32 sumScanned = bit8Scan( sumPacked );\n"
-"			u32 scannedKeys[4];\n"
-"			scannedKeys[0] = 1<<(8*b.x);\n"
-"			scannedKeys[1] = 1<<(8*b.y);\n"
-"			scannedKeys[2] = 1<<(8*b.z);\n"
-"			scannedKeys[3] = 1<<(8*b.w);\n"
-"			{	//	4 scans at once\n"
-"				u32 sum4 = 0;\n"
-"				for(int ie=0; ie<4; ie++)\n"
-"				{\n"
-"					u32 tmp = scannedKeys[ie];\n"
-"					scannedKeys[ie] = sum4;\n"
-"					sum4 += tmp;\n"
-"				}\n"
-"			}\n"
-"			{\n"
-"				u32 sumPlusRank = sumScanned + rankPacked;\n"
-"				{	u32 ie = b.x;\n"
-"					scannedKeys[0] += sumPlusRank;\n"
-"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
-"				}\n"
-"				{	u32 ie = b.y;\n"
-"					scannedKeys[1] += sumPlusRank;\n"
-"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
-"				}\n"
-"				{	u32 ie = b.z;\n"
-"					scannedKeys[2] += sumPlusRank;\n"
-"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
-"				}\n"
-"				{	u32 ie = b.w;\n"
-"					scannedKeys[3] += sumPlusRank;\n"
-"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"		{\n"
-"			ldsSortData[newOffset[0]] = sortData[0];\n"
-"			ldsSortData[newOffset[1]] = sortData[1];\n"
-"			ldsSortData[newOffset[2]] = sortData[2];\n"
-"			ldsSortData[newOffset[3]] = sortData[3];\n"
-"			ldsSortVal[newOffset[0]] = sortVal[0];\n"
-"			ldsSortVal[newOffset[1]] = sortVal[1];\n"
-"			ldsSortVal[newOffset[2]] = sortVal[2];\n"
-"			ldsSortVal[newOffset[3]] = sortVal[3];\n"
-"			GROUP_LDS_BARRIER;\n"
-"			u32 dstAddr = 4*lIdx;\n"
-"			sortData[0] = ldsSortData[dstAddr+0];\n"
-"			sortData[1] = ldsSortData[dstAddr+1];\n"
-"			sortData[2] = ldsSortData[dstAddr+2];\n"
-"			sortData[3] = ldsSortData[dstAddr+3];\n"
-"			sortVal[0] = ldsSortVal[dstAddr+0];\n"
-"			sortVal[1] = ldsSortVal[dstAddr+1];\n"
-"			sortVal[2] = ldsSortVal[dstAddr+2];\n"
-"			sortVal[3] = ldsSortVal[dstAddr+3];\n"
-"			GROUP_LDS_BARRIER;\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
-"{\n"
-"	__local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
-"	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
-"	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
-"	__local u32 localHistogram[NUM_BUCKET*2];\n"
-"	u32 gIdx = GET_GLOBAL_IDX;\n"
-"	u32 lIdx = GET_LOCAL_IDX;\n"
-"	u32 wgIdx = GET_GROUP_IDX;\n"
-"	u32 wgSize = GET_GROUP_SIZE;\n"
-"	const int n = cb.m_n;\n"
-"	const int nWGs = cb.m_nWGs;\n"
-"	const int startBit = cb.m_startBit;\n"
-"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"	if( lIdx < (NUM_BUCKET) )\n"
-"	{\n"
-"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"    \n"
-"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
-"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
-"	{\n"
-"		u32 myHistogram = 0;\n"
-"		int sortData[ELEMENTS_PER_WORK_ITEM];\n"
-"		int sortVal[ELEMENTS_PER_WORK_ITEM];\n"
-"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"		{\n"
-"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n"
-"			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n"
-"		}\n"
-"#else\n"
-"		{\n"
-"			sortData[i] = gSrc[ addr+i ].m_key;\n"
-"			sortVal[i] = gSrc[ addr+i ].m_value;\n"
-"		}\n"
-"#endif\n"
-"		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n"
-"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
-"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
-"		{	//	create histogram\n"
-"			u32 setIdx = lIdx/16;\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				localHistogram[lIdx] = 0;\n"
-"			}\n"
-"			ldsSortData[lIdx] = 0;\n"
-"			GROUP_LDS_BARRIER;\n"
-"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"				if( addr+i < n )\n"
-"#endif\n"
-"#if defined(NV_GPU)\n"
-"				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
-"#else\n"
-"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
-"#endif\n"
-"			\n"
-"			GROUP_LDS_BARRIER;\n"
-"			\n"
-"			uint hIdx = NUM_BUCKET+lIdx;\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				u32 sum = 0;\n"
-"				for(int i=0; i<WG_SIZE/16; i++)\n"
-"				{\n"
-"					sum += SET_HISTOGRAM( i, lIdx );\n"
-"				}\n"
-"				myHistogram = sum;\n"
-"				localHistogram[hIdx] = sum;\n"
-"			}\n"
-"			GROUP_LDS_BARRIER;\n"
-"#if defined(USE_2LEVEL_REDUCE)\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
-"				GROUP_MEM_FENCE;\n"
-"				u32 u0, u1, u2;\n"
-"				u0 = localHistogram[hIdx-3];\n"
-"				u1 = localHistogram[hIdx-2];\n"
-"				u2 = localHistogram[hIdx-1];\n"
-"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
-"				GROUP_MEM_FENCE;\n"
-"				u0 = localHistogram[hIdx-12];\n"
-"				u1 = localHistogram[hIdx-8];\n"
-"				u2 = localHistogram[hIdx-4];\n"
-"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
-"				GROUP_MEM_FENCE;\n"
-"			}\n"
-"#else\n"
-"			if( lIdx < NUM_BUCKET )\n"
-"			{\n"
-"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
-"				GROUP_MEM_FENCE;\n"
-"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
-"				GROUP_MEM_FENCE;\n"
-"			}\n"
-"#endif\n"
-"			GROUP_LDS_BARRIER;\n"
-"		}\n"
-"    	{\n"
-"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
-"			{\n"
-"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
-"				int binIdx = keys[ie];\n"
-"				int groupOffset = localHistogramToCarry[binIdx];\n"
-"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
-"#if defined(CHECK_BOUNDARY)\n"
-"				if( addr+ie < n )\n"
-"				{\n"
-"                    if ((groupOffset + myIdx)<n)\n"
-"                    {\n"
-"                        if (sortData[ie]==sortVal[ie])\n"
-"                        {\n"
-"                            \n"
-"                            SortDataCL tmp;\n"
-"                            tmp.m_key = sortData[ie];\n"
-"                            tmp.m_value = sortVal[ie];\n"
-"                            if (tmp.m_key == tmp.m_value)\n"
-"                                gDst[groupOffset + myIdx ] = tmp;\n"
-"                        }\n"
-"                        \n"
-"                    }\n"
-"				}\n"
-"#else\n"
-"                if ((groupOffset + myIdx)<n)\n"
-"                {\n"
-"                    gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n"
-"                    gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n"
-"                }\n"
-"#endif\n"
-"			}\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"		if( lIdx < NUM_BUCKET )\n"
-"		{\n"
-"			localHistogramToCarry[lIdx] += myHistogram;\n"
-"		}\n"
-"		GROUP_LDS_BARRIER;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
-"{\n"
-"    \n"
-"	u32 gIdx = GET_GLOBAL_IDX;\n"
-"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
-"	u32 wgIdx = GET_GROUP_IDX;\n"
-"	u32 wgSize = GET_GROUP_SIZE;\n"
-"	const int startBit = cb.m_startBit;\n"
-"	const int n = cb.m_n;\n"
-"	const int nWGs = cb.m_nWGs;\n"
-"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"    int counter[NUM_BUCKET];\n"
-"    \n"
-"    if (realLocalIdx>0)\n"
-"        return;\n"
-"    \n"
-"    for (int c=0;c<NUM_BUCKET;c++)\n"
-"        counter[c]=0;\n"
-"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"	\n"
-"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
-"  {\n"
-"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
-" 	{\n"
-"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"        \n"
-"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
-"		{\n"
-"            int i = addr2+j;\n"
-"			if( i < n )\n"
-"			{\n"
-"                int tableIdx;\n"
-"				tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
-"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
-"                counter[tableIdx] ++;\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"  }\n"
-"    \n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
-"{\n"
-"    \n"
-"	u32 gIdx = GET_GLOBAL_IDX;\n"
-"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
-"	u32 wgIdx = GET_GROUP_IDX;\n"
-"	u32 wgSize = GET_GROUP_SIZE;\n"
-"	const int startBit = cb.m_startBit;\n"
-"	const int n = cb.m_n;\n"
-"	const int nWGs = cb.m_nWGs;\n"
-"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"    int counter[NUM_BUCKET];\n"
-"    \n"
-"    if (realLocalIdx>0)\n"
-"        return;\n"
-"    \n"
-"    for (int c=0;c<NUM_BUCKET;c++)\n"
-"        counter[c]=0;\n"
-"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"	\n"
-"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
-"  {\n"
-"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
-" 	{\n"
-"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"        \n"
-"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
-"		{\n"
-"            int i = addr2+j;\n"
-"			if( i < n )\n"
-"			{\n"
-"                int tableIdx;\n"
-"				tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
-"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
-"                counter[tableIdx] ++;\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"  }\n"
-"    \n"
-"}\n"
-;
+static const char* radixSort32KernelsCL =
+	"/*\n"
+	"Bullet Continuous Collision Detection and Physics Library\n"
+	"Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org\n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Author Takahiro Harada\n"
+	"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+	"#define AtomInc(x) atom_inc(&(x))\n"
+	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+	"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+	"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+	"#define make_uint4 (uint4)\n"
+	"#define make_uint2 (uint2)\n"
+	"#define make_int2 (int2)\n"
+	"#define WG_SIZE 64\n"
+	"#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n"
+	"#define BITS_PER_PASS 4\n"
+	"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
+	"typedef uchar u8;\n"
+	"//	this isn't optimization for VLIW. But just reducing writes. \n"
+	"#define USE_2LEVEL_REDUCE 1\n"
+	"//#define CHECK_BOUNDARY 1\n"
+	"//#define NV_GPU 1\n"
+	"//	Cypress\n"
+	"#define nPerWI 16\n"
+	"//	Cayman\n"
+	"//#define nPerWI 20\n"
+	"#define m_n x\n"
+	"#define m_nWGs y\n"
+	"#define m_startBit z\n"
+	"#define m_nBlocksPerWG w\n"
+	"/*\n"
+	"typedef struct\n"
+	"{\n"
+	"	int m_n;\n"
+	"	int m_nWGs;\n"
+	"	int m_startBit;\n"
+	"	int m_nBlocksPerWG;\n"
+	"} ConstBuffer;\n"
+	"*/\n"
+	"typedef struct\n"
+	"{\n"
+	"	unsigned int m_key;\n"
+	"	unsigned int m_value;\n"
+	"} SortDataCL;\n"
+	"uint prefixScanVectorEx( uint4* data )\n"
+	"{\n"
+	"	u32 sum = 0;\n"
+	"	u32 tmp = data[0].x;\n"
+	"	data[0].x = sum;\n"
+	"	sum += tmp;\n"
+	"	tmp = data[0].y;\n"
+	"	data[0].y = sum;\n"
+	"	sum += tmp;\n"
+	"	tmp = data[0].z;\n"
+	"	data[0].z = sum;\n"
+	"	sum += tmp;\n"
+	"	tmp = data[0].w;\n"
+	"	data[0].w = sum;\n"
+	"	sum += tmp;\n"
+	"	return sum;\n"
+	"}\n"
+	"u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n"
+	"{\n"
+	"	{	//	Set data\n"
+	"		sorterSharedMemory[lIdx] = 0;\n"
+	"		sorterSharedMemory[lIdx+wgSize] = pData;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	{	//	Prefix sum\n"
+	"		int idx = 2*lIdx + (wgSize+1);\n"
+	"#if defined(USE_2LEVEL_REDUCE)\n"
+	"		if( lIdx < 64 )\n"
+	"		{\n"
+	"			u32 u0, u1, u2;\n"
+	"			u0 = sorterSharedMemory[idx-3];\n"
+	"			u1 = sorterSharedMemory[idx-2];\n"
+	"			u2 = sorterSharedMemory[idx-1];\n"
+	"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			u0 = sorterSharedMemory[idx-12];\n"
+	"			u1 = sorterSharedMemory[idx-8];\n"
+	"			u2 = sorterSharedMemory[idx-4];\n"
+	"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			u0 = sorterSharedMemory[idx-48];\n"
+	"			u1 = sorterSharedMemory[idx-32];\n"
+	"			u2 = sorterSharedMemory[idx-16];\n"
+	"			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			if( wgSize > 64 )\n"
+	"			{\n"
+	"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"			}\n"
+	"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
+	"			GROUP_MEM_FENCE;\n"
+	"		}\n"
+	"#else\n"
+	"		if( lIdx < 64 )\n"
+	"		{\n"
+	"			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];			\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];\n"
+	"			GROUP_MEM_FENCE;\n"
+	"			if( wgSize > 64 )\n"
+	"			{\n"
+	"				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"			}\n"
+	"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
+	"			GROUP_MEM_FENCE;\n"
+	"		}\n"
+	"#endif\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	*totalSum = sorterSharedMemory[wgSize*2-1];\n"
+	"	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n"
+	"	return addValue;\n"
+	"}\n"
+	"//__attribute__((reqd_work_group_size(128,1,1)))\n"
+	"uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
+	"{\n"
+	"	u32 s4 = prefixScanVectorEx( &pData );\n"
+	"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n"
+	"	return pData + make_uint4( rank, rank, rank, rank );\n"
+	"}\n"
+	"//__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
+	"{\n"
+	"	u32 s4 = prefixScanVectorEx( &pData );\n"
+	"	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n"
+	"	return pData + make_uint4( rank, rank, rank, rank );\n"
+	"}\n"
+	"u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n"
+	"u32 bit8Scan(u32 v)\n"
+	"{\n"
+	"	return (v<<8) + (v<<16) + (v<<24);\n"
+	"}\n"
+	"//===\n"
+	"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n"
+	"{\n"
+	"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
+	"	u32 gIdx = GET_GLOBAL_IDX;\n"
+	"	u32 lIdx = GET_LOCAL_IDX;\n"
+	"	u32 wgIdx = GET_GROUP_IDX;\n"
+	"	u32 wgSize = GET_GROUP_SIZE;\n"
+	"	const int startBit = cb.m_startBit;\n"
+	"	const int n = cb.m_n;\n"
+	"	const int nWGs = cb.m_nWGs;\n"
+	"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+	"	for(int i=0; i<NUM_BUCKET; i++)\n"
+	"	{\n"
+	"		MY_HISTOGRAM(i) = 0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+	"	u32 localKey;\n"
+	"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+	"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+	"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+	"	{\n"
+	"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
+	"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
+	"		//	AMD: AtomInc performs better while NV prefers ++\n"
+	"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"		{\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"			if( addr+i < n )\n"
+	"#endif\n"
+	"			{\n"
+	"				localKey = (gSrc[addr+i]>>startBit) & 0xf;\n"
+	"#if defined(NV_GPU)\n"
+	"				MY_HISTOGRAM( localKey )++;\n"
+	"#else\n"
+	"				AtomInc( MY_HISTOGRAM( localKey ) );\n"
+	"#endif\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	\n"
+	"	if( lIdx < NUM_BUCKET )\n"
+	"	{\n"
+	"		u32 sum = 0;\n"
+	"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
+	"		{\n"
+	"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
+	"		}\n"
+	"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )\n"
+	"{\n"
+	"	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
+	"	u32 gIdx = GET_GLOBAL_IDX;\n"
+	"	u32 lIdx = GET_LOCAL_IDX;\n"
+	"	u32 wgIdx = GET_GROUP_IDX;\n"
+	"	u32 wgSize = GET_GROUP_SIZE;\n"
+	"	const int startBit = cb.m_startBit;\n"
+	"	const int n = cb.m_n;\n"
+	"	const int nWGs = cb.m_nWGs;\n"
+	"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+	"	for(int i=0; i<NUM_BUCKET; i++)\n"
+	"	{\n"
+	"		MY_HISTOGRAM(i) = 0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+	"	u32 localKey;\n"
+	"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+	"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+	"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+	"	{\n"
+	"		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
+	"		//	Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops\n"
+	"		//	AMD: AtomInc performs better while NV prefers ++\n"
+	"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"		{\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"			if( addr+i < n )\n"
+	"#endif\n"
+	"			{\n"
+	"				localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;\n"
+	"#if defined(NV_GPU)\n"
+	"				MY_HISTOGRAM( localKey )++;\n"
+	"#else\n"
+	"				AtomInc( MY_HISTOGRAM( localKey ) );\n"
+	"#endif\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	\n"
+	"	if( lIdx < NUM_BUCKET )\n"
+	"	{\n"
+	"		u32 sum = 0;\n"
+	"		for(int i=0; i<GET_GROUP_SIZE; i++)\n"
+	"		{\n"
+	"			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];\n"
+	"		}\n"
+	"		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
+	"	}\n"
+	"}\n"
+	"#define nPerLane (nPerWI/4)\n"
+	"//	NUM_BUCKET*nWGs < 128*nPerWI\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(128,1,1)))\n"
+	"void PrefixScanKernel( __global u32* wHistogram1, int4  cb )\n"
+	"{\n"
+	"	__local u32 ldsTopScanData[128*2];\n"
+	"	u32 lIdx = GET_LOCAL_IDX;\n"
+	"	u32 wgIdx = GET_GROUP_IDX;\n"
+	"	const int nWGs = cb.m_nWGs;\n"
+	"	u32 data[nPerWI];\n"
+	"	for(int i=0; i<nPerWI; i++)\n"
+	"	{\n"
+	"		data[i] = 0;\n"
+	"		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n"
+	"			data[i] = wHistogram1[nPerWI*lIdx+i];\n"
+	"	}\n"
+	"	uint4 myData = make_uint4(0,0,0,0);\n"
+	"	for(int i=0; i<nPerLane; i++)\n"
+	"	{\n"
+	"		myData.x += data[nPerLane*0+i];\n"
+	"		myData.y += data[nPerLane*1+i];\n"
+	"		myData.z += data[nPerLane*2+i];\n"
+	"		myData.w += data[nPerLane*3+i];\n"
+	"	}\n"
+	"	uint totalSum;\n"
+	"	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n"
+	"//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches\n"
+	"	{	int j = 0;\n"
+	"		u32 sum = 0;\n"
+	"		for(int i=0; i<nPerLane; i++)\n"
+	"		{\n"
+	"			u32 tmp = data[nPerLane*j+i];\n"
+	"			data[nPerLane*j+i] = sum;\n"
+	"			sum += tmp;\n"
+	"		}\n"
+	"	}\n"
+	"	{	int j = 1;\n"
+	"		u32 sum = 0;\n"
+	"		for(int i=0; i<nPerLane; i++)\n"
+	"		{\n"
+	"			u32 tmp = data[nPerLane*j+i];\n"
+	"			data[nPerLane*j+i] = sum;\n"
+	"			sum += tmp;\n"
+	"		}\n"
+	"	}\n"
+	"	{	int j = 2;\n"
+	"		u32 sum = 0;\n"
+	"		for(int i=0; i<nPerLane; i++)\n"
+	"		{\n"
+	"			u32 tmp = data[nPerLane*j+i];\n"
+	"			data[nPerLane*j+i] = sum;\n"
+	"			sum += tmp;\n"
+	"		}\n"
+	"	}\n"
+	"	{	int j = 3;\n"
+	"		u32 sum = 0;\n"
+	"		for(int i=0; i<nPerLane; i++)\n"
+	"		{\n"
+	"			u32 tmp = data[nPerLane*j+i];\n"
+	"			data[nPerLane*j+i] = sum;\n"
+	"			sum += tmp;\n"
+	"		}\n"
+	"	}\n"
+	"	for(int i=0; i<nPerLane; i++)\n"
+	"	{\n"
+	"		data[nPerLane*0+i] += scanned.x;\n"
+	"		data[nPerLane*1+i] += scanned.y;\n"
+	"		data[nPerLane*2+i] += scanned.z;\n"
+	"		data[nPerLane*3+i] += scanned.w;\n"
+	"	}\n"
+	"	for(int i=0; i<nPerWI; i++)\n"
+	"	{\n"
+	"		int index = nPerWI*lIdx+i;\n"
+	"		if (index < NUM_BUCKET*nWGs)\n"
+	"			wHistogram1[nPerWI*lIdx+i] = data[i];\n"
+	"	}\n"
+	"}\n"
+	"//	4 scan, 4 exchange\n"
+	"void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
+	"{\n"
+	"	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)\n"
+	"	{\n"
+	"		u32 mask = (1<<bitIdx);\n"
+	"		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );\n"
+	"		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );\n"
+	"		u32 total;\n"
+	"		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );\n"
+	"		{\n"
+	"			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
+	"			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
+	"			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			ldsSortData[dstAddr.x] = sortData[0];\n"
+	"			ldsSortData[dstAddr.y] = sortData[1];\n"
+	"			ldsSortData[dstAddr.z] = sortData[2];\n"
+	"			ldsSortData[dstAddr.w] = sortData[3];\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			sortData[0] = ldsSortData[localAddr.x];\n"
+	"			sortData[1] = ldsSortData[localAddr.y];\n"
+	"			sortData[2] = ldsSortData[localAddr.z];\n"
+	"			sortData[3] = ldsSortData[localAddr.w];\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"//	2 scan, 2 exchange\n"
+	"void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
+	"{\n"
+	"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
+	"	{\n"
+	"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
+	"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
+	"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
+	"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
+	"		u32 key4;\n"
+	"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
+	"		{\n"
+	"			sKeyPacked[0] |= 1<<(8*b.x);\n"
+	"			sKeyPacked[1] |= 1<<(8*b.y);\n"
+	"			sKeyPacked[2] |= 1<<(8*b.z);\n"
+	"			sKeyPacked[3] |= 1<<(8*b.w);\n"
+	"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
+	"		}\n"
+	"		u32 rankPacked;\n"
+	"		u32 sumPacked;\n"
+	"		{\n"
+	"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"		u32 newOffset[4] = { 0,0,0,0 };\n"
+	"		{\n"
+	"			u32 sumScanned = bit8Scan( sumPacked );\n"
+	"			u32 scannedKeys[4];\n"
+	"			scannedKeys[0] = 1<<(8*b.x);\n"
+	"			scannedKeys[1] = 1<<(8*b.y);\n"
+	"			scannedKeys[2] = 1<<(8*b.z);\n"
+	"			scannedKeys[3] = 1<<(8*b.w);\n"
+	"			{	//	4 scans at once\n"
+	"				u32 sum4 = 0;\n"
+	"				for(int ie=0; ie<4; ie++)\n"
+	"				{\n"
+	"					u32 tmp = scannedKeys[ie];\n"
+	"					scannedKeys[ie] = sum4;\n"
+	"					sum4 += tmp;\n"
+	"				}\n"
+	"			}\n"
+	"			{\n"
+	"				u32 sumPlusRank = sumScanned + rankPacked;\n"
+	"				{	u32 ie = b.x;\n"
+	"					scannedKeys[0] += sumPlusRank;\n"
+	"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
+	"				}\n"
+	"				{	u32 ie = b.y;\n"
+	"					scannedKeys[1] += sumPlusRank;\n"
+	"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
+	"				}\n"
+	"				{	u32 ie = b.z;\n"
+	"					scannedKeys[2] += sumPlusRank;\n"
+	"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
+	"				}\n"
+	"				{	u32 ie = b.w;\n"
+	"					scannedKeys[3] += sumPlusRank;\n"
+	"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"		{\n"
+	"			ldsSortData[newOffset[0]] = sortData[0];\n"
+	"			ldsSortData[newOffset[1]] = sortData[1];\n"
+	"			ldsSortData[newOffset[2]] = sortData[2];\n"
+	"			ldsSortData[newOffset[3]] = sortData[3];\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			u32 dstAddr = 4*lIdx;\n"
+	"			sortData[0] = ldsSortData[dstAddr+0];\n"
+	"			sortData[1] = ldsSortData[dstAddr+1];\n"
+	"			sortData[2] = ldsSortData[dstAddr+2];\n"
+	"			sortData[3] = ldsSortData[dstAddr+3];\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
+	"{\n"
+	"	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
+	"	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
+	"	__local u32 localHistogram[NUM_BUCKET*2];\n"
+	"	u32 gIdx = GET_GLOBAL_IDX;\n"
+	"	u32 lIdx = GET_LOCAL_IDX;\n"
+	"	u32 wgIdx = GET_GROUP_IDX;\n"
+	"	u32 wgSize = GET_GROUP_SIZE;\n"
+	"	const int n = cb.m_n;\n"
+	"	const int nWGs = cb.m_nWGs;\n"
+	"	const int startBit = cb.m_startBit;\n"
+	"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+	"	if( lIdx < (NUM_BUCKET) )\n"
+	"	{\n"
+	"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+	"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
+	"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+	"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+	"	{\n"
+	"		u32 myHistogram = 0;\n"
+	"		u32 sortData[ELEMENTS_PER_WORK_ITEM];\n"
+	"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;\n"
+	"#else\n"
+	"			sortData[i] = gSrc[ addr+i ];\n"
+	"#endif\n"
+	"		sort4Bits(sortData, startBit, lIdx, ldsSortData);\n"
+	"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
+	"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
+	"		{	//	create histogram\n"
+	"			u32 setIdx = lIdx/16;\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				localHistogram[lIdx] = 0;\n"
+	"			}\n"
+	"			ldsSortData[lIdx] = 0;\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"				if( addr+i < n )\n"
+	"#endif\n"
+	"#if defined(NV_GPU)\n"
+	"				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
+	"#else\n"
+	"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
+	"#endif\n"
+	"			\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			\n"
+	"			uint hIdx = NUM_BUCKET+lIdx;\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				u32 sum = 0;\n"
+	"				for(int i=0; i<WG_SIZE/16; i++)\n"
+	"				{\n"
+	"					sum += SET_HISTOGRAM( i, lIdx );\n"
+	"				}\n"
+	"				myHistogram = sum;\n"
+	"				localHistogram[hIdx] = sum;\n"
+	"			}\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"#if defined(USE_2LEVEL_REDUCE)\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				u32 u0, u1, u2;\n"
+	"				u0 = localHistogram[hIdx-3];\n"
+	"				u1 = localHistogram[hIdx-2];\n"
+	"				u2 = localHistogram[hIdx-1];\n"
+	"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				u0 = localHistogram[hIdx-12];\n"
+	"				u1 = localHistogram[hIdx-8];\n"
+	"				u2 = localHistogram[hIdx-4];\n"
+	"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+	"				GROUP_MEM_FENCE;\n"
+	"			}\n"
+	"#else\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"			}\n"
+	"#endif\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"		}\n"
+	"		{\n"
+	"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
+	"			{\n"
+	"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
+	"				int binIdx = keys[ie];\n"
+	"				int groupOffset = localHistogramToCarry[binIdx];\n"
+	"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"				if( addr+ie < n )\n"
+	"#endif\n"
+	"				gDst[ groupOffset + myIdx ] = sortData[ie];\n"
+	"			}\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"		if( lIdx < NUM_BUCKET )\n"
+	"		{\n"
+	"			localHistogramToCarry[lIdx] += myHistogram;\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"	}\n"
+	"}\n"
+	"//	2 scan, 2 exchange\n"
+	"void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n"
+	"{\n"
+	"	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)\n"
+	"	{\n"
+	"		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, \n"
+	"			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
+	"			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
+	"			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
+	"		u32 key4;\n"
+	"		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
+	"		{\n"
+	"			sKeyPacked[0] |= 1<<(8*b.x);\n"
+	"			sKeyPacked[1] |= 1<<(8*b.y);\n"
+	"			sKeyPacked[2] |= 1<<(8*b.z);\n"
+	"			sKeyPacked[3] |= 1<<(8*b.w);\n"
+	"			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
+	"		}\n"
+	"		u32 rankPacked;\n"
+	"		u32 sumPacked;\n"
+	"		{\n"
+	"			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"		u32 newOffset[4] = { 0,0,0,0 };\n"
+	"		{\n"
+	"			u32 sumScanned = bit8Scan( sumPacked );\n"
+	"			u32 scannedKeys[4];\n"
+	"			scannedKeys[0] = 1<<(8*b.x);\n"
+	"			scannedKeys[1] = 1<<(8*b.y);\n"
+	"			scannedKeys[2] = 1<<(8*b.z);\n"
+	"			scannedKeys[3] = 1<<(8*b.w);\n"
+	"			{	//	4 scans at once\n"
+	"				u32 sum4 = 0;\n"
+	"				for(int ie=0; ie<4; ie++)\n"
+	"				{\n"
+	"					u32 tmp = scannedKeys[ie];\n"
+	"					scannedKeys[ie] = sum4;\n"
+	"					sum4 += tmp;\n"
+	"				}\n"
+	"			}\n"
+	"			{\n"
+	"				u32 sumPlusRank = sumScanned + rankPacked;\n"
+	"				{	u32 ie = b.x;\n"
+	"					scannedKeys[0] += sumPlusRank;\n"
+	"					newOffset[0] = unpack4Key( scannedKeys[0], ie );\n"
+	"				}\n"
+	"				{	u32 ie = b.y;\n"
+	"					scannedKeys[1] += sumPlusRank;\n"
+	"					newOffset[1] = unpack4Key( scannedKeys[1], ie );\n"
+	"				}\n"
+	"				{	u32 ie = b.z;\n"
+	"					scannedKeys[2] += sumPlusRank;\n"
+	"					newOffset[2] = unpack4Key( scannedKeys[2], ie );\n"
+	"				}\n"
+	"				{	u32 ie = b.w;\n"
+	"					scannedKeys[3] += sumPlusRank;\n"
+	"					newOffset[3] = unpack4Key( scannedKeys[3], ie );\n"
+	"				}\n"
+	"			}\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"		{\n"
+	"			ldsSortData[newOffset[0]] = sortData[0];\n"
+	"			ldsSortData[newOffset[1]] = sortData[1];\n"
+	"			ldsSortData[newOffset[2]] = sortData[2];\n"
+	"			ldsSortData[newOffset[3]] = sortData[3];\n"
+	"			ldsSortVal[newOffset[0]] = sortVal[0];\n"
+	"			ldsSortVal[newOffset[1]] = sortVal[1];\n"
+	"			ldsSortVal[newOffset[2]] = sortVal[2];\n"
+	"			ldsSortVal[newOffset[3]] = sortVal[3];\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			u32 dstAddr = 4*lIdx;\n"
+	"			sortData[0] = ldsSortData[dstAddr+0];\n"
+	"			sortData[1] = ldsSortData[dstAddr+1];\n"
+	"			sortData[2] = ldsSortData[dstAddr+2];\n"
+	"			sortData[3] = ldsSortData[dstAddr+3];\n"
+	"			sortVal[0] = ldsSortVal[dstAddr+0];\n"
+	"			sortVal[1] = ldsSortVal[dstAddr+1];\n"
+	"			sortVal[2] = ldsSortVal[dstAddr+2];\n"
+	"			sortVal[3] = ldsSortVal[dstAddr+3];\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
+	"{\n"
+	"	__local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
+	"	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
+	"	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
+	"	__local u32 localHistogram[NUM_BUCKET*2];\n"
+	"	u32 gIdx = GET_GLOBAL_IDX;\n"
+	"	u32 lIdx = GET_LOCAL_IDX;\n"
+	"	u32 wgIdx = GET_GROUP_IDX;\n"
+	"	u32 wgSize = GET_GROUP_SIZE;\n"
+	"	const int n = cb.m_n;\n"
+	"	const int nWGs = cb.m_nWGs;\n"
+	"	const int startBit = cb.m_startBit;\n"
+	"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+	"	if( lIdx < (NUM_BUCKET) )\n"
+	"	{\n"
+	"		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"    \n"
+	"	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+	"	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
+	"	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+	"	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
+	"	{\n"
+	"		u32 myHistogram = 0;\n"
+	"		int sortData[ELEMENTS_PER_WORK_ITEM];\n"
+	"		int sortVal[ELEMENTS_PER_WORK_ITEM];\n"
+	"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"		{\n"
+	"			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;\n"
+	"			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;\n"
+	"		}\n"
+	"#else\n"
+	"		{\n"
+	"			sortData[i] = gSrc[ addr+i ].m_key;\n"
+	"			sortVal[i] = gSrc[ addr+i ].m_value;\n"
+	"		}\n"
+	"#endif\n"
+	"		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n"
+	"		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
+	"		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
+	"		{	//	create histogram\n"
+	"			u32 setIdx = lIdx/16;\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				localHistogram[lIdx] = 0;\n"
+	"			}\n"
+	"			ldsSortData[lIdx] = 0;\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"				if( addr+i < n )\n"
+	"#endif\n"
+	"#if defined(NV_GPU)\n"
+	"				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
+	"#else\n"
+	"				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );\n"
+	"#endif\n"
+	"			\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"			\n"
+	"			uint hIdx = NUM_BUCKET+lIdx;\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				u32 sum = 0;\n"
+	"				for(int i=0; i<WG_SIZE/16; i++)\n"
+	"				{\n"
+	"					sum += SET_HISTOGRAM( i, lIdx );\n"
+	"				}\n"
+	"				myHistogram = sum;\n"
+	"				localHistogram[hIdx] = sum;\n"
+	"			}\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"#if defined(USE_2LEVEL_REDUCE)\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				u32 u0, u1, u2;\n"
+	"				u0 = localHistogram[hIdx-3];\n"
+	"				u1 = localHistogram[hIdx-2];\n"
+	"				u2 = localHistogram[hIdx-1];\n"
+	"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				u0 = localHistogram[hIdx-12];\n"
+	"				u1 = localHistogram[hIdx-8];\n"
+	"				u2 = localHistogram[hIdx-4];\n"
+	"				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );\n"
+	"				GROUP_MEM_FENCE;\n"
+	"			}\n"
+	"#else\n"
+	"			if( lIdx < NUM_BUCKET )\n"
+	"			{\n"
+	"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
+	"				GROUP_MEM_FENCE;\n"
+	"			}\n"
+	"#endif\n"
+	"			GROUP_LDS_BARRIER;\n"
+	"		}\n"
+	"    	{\n"
+	"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
+	"			{\n"
+	"				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;\n"
+	"				int binIdx = keys[ie];\n"
+	"				int groupOffset = localHistogramToCarry[binIdx];\n"
+	"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
+	"#if defined(CHECK_BOUNDARY)\n"
+	"				if( addr+ie < n )\n"
+	"				{\n"
+	"                    if ((groupOffset + myIdx)<n)\n"
+	"                    {\n"
+	"                        if (sortData[ie]==sortVal[ie])\n"
+	"                        {\n"
+	"                            \n"
+	"                            SortDataCL tmp;\n"
+	"                            tmp.m_key = sortData[ie];\n"
+	"                            tmp.m_value = sortVal[ie];\n"
+	"                            if (tmp.m_key == tmp.m_value)\n"
+	"                                gDst[groupOffset + myIdx ] = tmp;\n"
+	"                        }\n"
+	"                        \n"
+	"                    }\n"
+	"				}\n"
+	"#else\n"
+	"                if ((groupOffset + myIdx)<n)\n"
+	"                {\n"
+	"                    gDst[ groupOffset + myIdx ].m_key = sortData[ie];\n"
+	"                    gDst[ groupOffset + myIdx ].m_value = sortVal[ie];\n"
+	"                }\n"
+	"#endif\n"
+	"			}\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"		if( lIdx < NUM_BUCKET )\n"
+	"		{\n"
+	"			localHistogramToCarry[lIdx] += myHistogram;\n"
+	"		}\n"
+	"		GROUP_LDS_BARRIER;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
+	"{\n"
+	"    \n"
+	"	u32 gIdx = GET_GLOBAL_IDX;\n"
+	"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
+	"	u32 wgIdx = GET_GROUP_IDX;\n"
+	"	u32 wgSize = GET_GROUP_SIZE;\n"
+	"	const int startBit = cb.m_startBit;\n"
+	"	const int n = cb.m_n;\n"
+	"	const int nWGs = cb.m_nWGs;\n"
+	"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+	"    int counter[NUM_BUCKET];\n"
+	"    \n"
+	"    if (realLocalIdx>0)\n"
+	"        return;\n"
+	"    \n"
+	"    for (int c=0;c<NUM_BUCKET;c++)\n"
+	"        counter[c]=0;\n"
+	"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+	"	\n"
+	"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+	"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
+	"  {\n"
+	"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
+	" 	{\n"
+	"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+	"        \n"
+	"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
+	"		{\n"
+	"            int i = addr2+j;\n"
+	"			if( i < n )\n"
+	"			{\n"
+	"                int tableIdx;\n"
+	"				tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
+	"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
+	"                counter[tableIdx] ++;\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"  }\n"
+	"    \n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
+	"{\n"
+	"    \n"
+	"	u32 gIdx = GET_GLOBAL_IDX;\n"
+	"	u32 realLocalIdx = GET_LOCAL_IDX;\n"
+	"	u32 wgIdx = GET_GROUP_IDX;\n"
+	"	u32 wgSize = GET_GROUP_SIZE;\n"
+	"	const int startBit = cb.m_startBit;\n"
+	"	const int n = cb.m_n;\n"
+	"	const int nWGs = cb.m_nWGs;\n"
+	"	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
+	"    int counter[NUM_BUCKET];\n"
+	"    \n"
+	"    if (realLocalIdx>0)\n"
+	"        return;\n"
+	"    \n"
+	"    for (int c=0;c<NUM_BUCKET;c++)\n"
+	"        counter[c]=0;\n"
+	"    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
+	"	\n"
+	"	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
+	"   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
+	"  {\n"
+	"     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
+	" 	{\n"
+	"        int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
+	"        \n"
+	"		for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)\n"
+	"		{\n"
+	"            int i = addr2+j;\n"
+	"			if( i < n )\n"
+	"			{\n"
+	"                int tableIdx;\n"
+	"				tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1\n"
+	"                gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];\n"
+	"                counter[tableIdx] ++;\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"  }\n"
+	"    \n"
+	"}\n";