diff options
Diffstat (limited to 'thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels')
20 files changed, 9744 insertions, 0 deletions
diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl new file mode 100644 index 0000000000..3b891b863d --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl @@ -0,0 +1,353 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile __global int* +#endif + + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +#define WG_SIZE 64 + + + + + +typedef struct +{ + int m_n; + int m_start; + int m_staticIdx; + int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_a; + int m_b; + u32 m_idx; +}Elem; + +#define STACK_SIZE (WG_SIZE*10) +//#define STACK_SIZE (WG_SIZE) +#define RING_SIZE 1024 +#define RING_SIZE_MASK (RING_SIZE-1) +#define CHECK_SIZE (WG_SIZE) + + +#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd) +#define RING_END ldsTmp + +u32 readBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + return buff[bufIdx] & (1<<bitIdx); +} + +void writeBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; +// buff[bufIdx] |= (1<<bitIdx); + atom_or( &buff[bufIdx], (1<<bitIdx) ); +} + +u32 tryWrite(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) ); + return ((ans >> bitIdx)&1) == 0; +} + +// batching on the GPU +__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut, + __global const u32* gN, __global const u32* gStart, __global int* batchSizes, + int m_staticIdx ) +{ + __local u32 ldsStackIdx[STACK_SIZE]; + __local u32 ldsStackEnd; + __local Elem ldsRingElem[RING_SIZE]; + __local u32 ldsRingEnd; + __local u32 ldsTmp; + __local u32 ldsCheckBuffer[CHECK_SIZE]; + __local u32 ldsFixedBuffer[CHECK_SIZE]; + __local u32 ldsGEnd; + __local u32 ldsDstEnd; + + int wgIdx = GET_GROUP_IDX; + int lIdx = GET_LOCAL_IDX; + + const int m_n = gN[wgIdx]; + const int m_start = gStart[wgIdx]; + + if( lIdx == 0 ) + { + ldsRingEnd = 0; + ldsGEnd = 0; + ldsStackEnd = 0; + ldsDstEnd = m_start; + } + + + +// while(1) +//was 250 + int ie=0; + int maxBatch = 0; + for(ie=0; ie<50; ie++) + { + ldsFixedBuffer[lIdx] = 0; + + for(int giter=0; giter<4; giter++) + { + int ringCap = GET_RING_CAPACITY; + + // 1. fill ring + if( ldsGEnd < m_n ) + { + while( ringCap > WG_SIZE ) + { + if( ldsGEnd >= m_n ) break; + if( lIdx < ringCap - WG_SIZE ) + { + int srcIdx; + AtomInc1( ldsGEnd, srcIdx ); + if( srcIdx < m_n ) + { + int dstIdx; + AtomInc1( ldsRingEnd, dstIdx ); + + int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit; + int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit; + ldsRingElem[dstIdx].m_a = (a>b)? b:a; + ldsRingElem[dstIdx].m_b = (a>b)? a:b; + ldsRingElem[dstIdx].m_idx = srcIdx; + } + } + ringCap = GET_RING_CAPACITY; + } + } + + GROUP_LDS_BARRIER; + + // 2. fill stack + __local Elem* dst = ldsRingElem; + if( lIdx == 0 ) RING_END = 0; + + int srcIdx=lIdx; + int end = ldsRingEnd; + + { + for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE) + { + Elem e; + if(srcIdx<end) e = ldsRingElem[srcIdx]; + bool done = (srcIdx<end)?false:true; + + for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0; + + if( !done ) + { + int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a)); + int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b)); + + if( aUsed==0 && bUsed==0 ) + { + int aAvailable=1; + int bAvailable=1; + int ea = abs(e.m_a); + int eb = abs(e.m_b); + + bool aStatic = (e.m_a<0) ||(ea==m_staticIdx); + bool bStatic = (e.m_b<0) ||(eb==m_staticIdx); + + if (!aStatic) + aAvailable = tryWrite( ldsCheckBuffer, ea ); + if (!bStatic) + bAvailable = tryWrite( ldsCheckBuffer, eb ); + + //aAvailable = aStatic? 1: aAvailable; + //bAvailable = bStatic? 1: bAvailable; + + bool success = (aAvailable && bAvailable); + if(success) + { + + if (!aStatic) + writeBuf( ldsFixedBuffer, ea ); + if (!bStatic) + writeBuf( ldsFixedBuffer, eb ); + } + done = success; + } + } + + // put it aside + if(srcIdx<end) + { + if( done ) + { + int dstIdx; AtomInc1( ldsStackEnd, dstIdx ); + if( dstIdx < STACK_SIZE ) + ldsStackIdx[dstIdx] = e.m_idx; + else{ + done = false; + AtomAdd( ldsStackEnd, -1 ); + } + } + if( !done ) + { + int dstIdx; AtomInc1( RING_END, dstIdx ); + dst[dstIdx] = e; + } + } + + // if filled, flush + if( ldsStackEnd == STACK_SIZE ) + { + for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE) + { + int idx = m_start + ldsStackIdx[i]; + int dstIdx; AtomInc1( ldsDstEnd, dstIdx ); + gConstraintsOut[ dstIdx ] = gConstraints[ idx ]; + gConstraintsOut[ dstIdx ].m_batchIdx = ie; + } + if( lIdx == 0 ) ldsStackEnd = 0; + + //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) + ldsFixedBuffer[lIdx] = 0; + } + } + } + + if( lIdx == 0 ) ldsRingEnd = RING_END; + } + + GROUP_LDS_BARRIER; + + for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE) + { + int idx = m_start + ldsStackIdx[i]; + int dstIdx; AtomInc1( ldsDstEnd, dstIdx ); + gConstraintsOut[ dstIdx ] = gConstraints[ idx ]; + gConstraintsOut[ dstIdx ].m_batchIdx = ie; + } + + // in case it couldn't consume any pair. Flush them + // todo. Serial batch worth while? + if( ldsStackEnd == 0 ) + { + for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE) + { + int idx = m_start + ldsRingElem[i].m_idx; + int dstIdx; AtomInc1( ldsDstEnd, dstIdx ); + gConstraintsOut[ dstIdx ] = gConstraints[ idx ]; + int curBatch = 100+i; + if (maxBatch < curBatch) + maxBatch = curBatch; + + gConstraintsOut[ dstIdx ].m_batchIdx = curBatch; + + } + GROUP_LDS_BARRIER; + if( lIdx == 0 ) ldsRingEnd = 0; + } + + if( lIdx == 0 ) ldsStackEnd = 0; + + GROUP_LDS_BARRIER; + + // termination + if( ldsGEnd == m_n && ldsRingEnd == 0 ) + break; + } + + if( lIdx == 0 ) + { + if (maxBatch < ie) + maxBatch=ie; + batchSizes[wgIdx]=maxBatch; + } + +} + + + + + + + + + + + + + + + + + + + + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h new file mode 100644 index 0000000000..150eedc94b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h @@ -0,0 +1,388 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* batchingKernelsCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile __global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"#define WG_SIZE 64\n" +"typedef struct \n" +"{\n" +" int m_n;\n" +" int m_start;\n" +" int m_staticIdx;\n" +" int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct \n" +"{\n" +" int m_a;\n" +" int m_b;\n" +" u32 m_idx;\n" +"}Elem;\n" +"#define STACK_SIZE (WG_SIZE*10)\n" +"//#define STACK_SIZE (WG_SIZE)\n" +"#define RING_SIZE 1024\n" +"#define RING_SIZE_MASK (RING_SIZE-1)\n" +"#define CHECK_SIZE (WG_SIZE)\n" +"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n" +"#define RING_END ldsTmp\n" +"u32 readBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" return buff[bufIdx] & (1<<bitIdx);\n" +"}\n" +"void writeBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +"// buff[bufIdx] |= (1<<bitIdx);\n" +" atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +"}\n" +"u32 tryWrite(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +" return ((ans >> bitIdx)&1) == 0;\n" +"}\n" +"// batching on the GPU\n" +"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n" +" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n" +" int m_staticIdx )\n" +"{\n" +" __local u32 ldsStackIdx[STACK_SIZE];\n" +" __local u32 ldsStackEnd;\n" +" __local Elem ldsRingElem[RING_SIZE];\n" +" __local u32 ldsRingEnd;\n" +" __local u32 ldsTmp;\n" +" __local u32 ldsCheckBuffer[CHECK_SIZE];\n" +" __local u32 ldsFixedBuffer[CHECK_SIZE];\n" +" __local u32 ldsGEnd;\n" +" __local u32 ldsDstEnd;\n" +" int wgIdx = GET_GROUP_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" \n" +" const int m_n = gN[wgIdx];\n" +" const int m_start = gStart[wgIdx];\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsRingEnd = 0;\n" +" ldsGEnd = 0;\n" +" ldsStackEnd = 0;\n" +" ldsDstEnd = m_start;\n" +" }\n" +" \n" +" \n" +" \n" +"// while(1)\n" +"//was 250\n" +" int ie=0;\n" +" int maxBatch = 0;\n" +" for(ie=0; ie<50; ie++)\n" +" {\n" +" ldsFixedBuffer[lIdx] = 0;\n" +" for(int giter=0; giter<4; giter++)\n" +" {\n" +" int ringCap = GET_RING_CAPACITY;\n" +" \n" +" // 1. fill ring\n" +" if( ldsGEnd < m_n )\n" +" {\n" +" while( ringCap > WG_SIZE )\n" +" {\n" +" if( ldsGEnd >= m_n ) break;\n" +" if( lIdx < ringCap - WG_SIZE )\n" +" {\n" +" int srcIdx;\n" +" AtomInc1( ldsGEnd, srcIdx );\n" +" if( srcIdx < m_n )\n" +" {\n" +" int dstIdx;\n" +" AtomInc1( ldsRingEnd, dstIdx );\n" +" \n" +" int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n" +" int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n" +" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n" +" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n" +" ldsRingElem[dstIdx].m_idx = srcIdx;\n" +" }\n" +" }\n" +" ringCap = GET_RING_CAPACITY;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" // 2. fill stack\n" +" __local Elem* dst = ldsRingElem;\n" +" if( lIdx == 0 ) RING_END = 0;\n" +" int srcIdx=lIdx;\n" +" int end = ldsRingEnd;\n" +" {\n" +" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n" +" {\n" +" Elem e;\n" +" if(srcIdx<end) e = ldsRingElem[srcIdx];\n" +" bool done = (srcIdx<end)?false:true;\n" +" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n" +" \n" +" if( !done )\n" +" {\n" +" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n" +" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n" +" if( aUsed==0 && bUsed==0 )\n" +" {\n" +" int aAvailable=1;\n" +" int bAvailable=1;\n" +" int ea = abs(e.m_a);\n" +" int eb = abs(e.m_b);\n" +" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n" +" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n" +" \n" +" if (!aStatic)\n" +" aAvailable = tryWrite( ldsCheckBuffer, ea );\n" +" if (!bStatic)\n" +" bAvailable = tryWrite( ldsCheckBuffer, eb );\n" +" \n" +" //aAvailable = aStatic? 1: aAvailable;\n" +" //bAvailable = bStatic? 1: bAvailable;\n" +" bool success = (aAvailable && bAvailable);\n" +" if(success)\n" +" {\n" +" \n" +" if (!aStatic)\n" +" writeBuf( ldsFixedBuffer, ea );\n" +" if (!bStatic)\n" +" writeBuf( ldsFixedBuffer, eb );\n" +" }\n" +" done = success;\n" +" }\n" +" }\n" +" // put it aside\n" +" if(srcIdx<end)\n" +" {\n" +" if( done )\n" +" {\n" +" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n" +" if( dstIdx < STACK_SIZE )\n" +" ldsStackIdx[dstIdx] = e.m_idx;\n" +" else{\n" +" done = false;\n" +" AtomAdd( ldsStackEnd, -1 );\n" +" }\n" +" }\n" +" if( !done )\n" +" {\n" +" int dstIdx; AtomInc1( RING_END, dstIdx );\n" +" dst[dstIdx] = e;\n" +" }\n" +" }\n" +" // if filled, flush\n" +" if( ldsStackEnd == STACK_SIZE )\n" +" {\n" +" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n" +" {\n" +" int idx = m_start + ldsStackIdx[i];\n" +" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" +" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" +" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" +" }\n" +" if( lIdx == 0 ) ldsStackEnd = 0;\n" +" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n" +" ldsFixedBuffer[lIdx] = 0;\n" +" }\n" +" }\n" +" }\n" +" if( lIdx == 0 ) ldsRingEnd = RING_END;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n" +" {\n" +" int idx = m_start + ldsStackIdx[i];\n" +" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" +" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" +" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n" +" }\n" +" // in case it couldn't consume any pair. Flush them\n" +" // todo. Serial batch worth while?\n" +" if( ldsStackEnd == 0 )\n" +" {\n" +" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n" +" {\n" +" int idx = m_start + ldsRingElem[i].m_idx;\n" +" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n" +" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n" +" int curBatch = 100+i;\n" +" if (maxBatch < curBatch)\n" +" maxBatch = curBatch;\n" +" \n" +" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n" +" \n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 ) ldsRingEnd = 0;\n" +" }\n" +" if( lIdx == 0 ) ldsStackEnd = 0;\n" +" GROUP_LDS_BARRIER;\n" +" // termination\n" +" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n" +" break;\n" +" }\n" +" if( lIdx == 0 )\n" +" {\n" +" if (maxBatch < ie)\n" +" maxBatch=ie;\n" +" batchSizes[wgIdx]=maxBatch;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl new file mode 100644 index 0000000000..ba1b66d2c3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl @@ -0,0 +1,231 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile __global int* +#endif + +#define SIMD_WIDTH 64 + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +#define WG_SIZE 64 + + + + + +typedef struct +{ + int m_n; + int m_start; + int m_staticIdx; + int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_a; + int m_b; + u32 m_idx; +}Elem; + + + + + +// batching on the GPU +__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx ) +{ + int wgIdx = GET_GROUP_IDX; + int lIdx = GET_LOCAL_IDX; + + const int m_n = gN[wgIdx]; + const int m_start = gStart[wgIdx]; + + if( lIdx == 0 ) + { + for (int i=0;i<m_n;i++) + { + int srcIdx = i+m_start; + int batchIndex = i; + gConstraints[ srcIdx ].m_batchIdx = batchIndex; + } + } +} + + +#define CHECK_SIZE (WG_SIZE) + + + + +u32 readBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + return buff[bufIdx] & (1<<bitIdx); +} + +void writeBuf(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + buff[bufIdx] |= (1<<bitIdx); + //atom_or( &buff[bufIdx], (1<<bitIdx) ); +} + +u32 tryWrite(__local u32* buff, int idx) +{ + idx = idx % (32*CHECK_SIZE); + int bitIdx = idx%32; + int bufIdx = idx/32; + u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) ); + return ((ans >> bitIdx)&1) == 0; +} + + +// batching on the GPU +__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx ) +{ + int wgIdx = GET_GROUP_IDX; + int lIdx = GET_LOCAL_IDX; + const int numConstraints = gN[wgIdx]; + const int m_start = gStart[wgIdx]; + b3Contact4Data_t tmp; + + __local u32 ldsFixedBuffer[CHECK_SIZE]; + + + + + + if( lIdx == 0 ) + { + + + __global struct b3Contact4Data* cs = &gConstraints[m_start]; + + + int numValidConstraints = 0; + int batchIdx = 0; + + while( numValidConstraints < numConstraints) + { + int nCurrentBatch = 0; + // clear flag + + for(int i=0; i<CHECK_SIZE; i++) + ldsFixedBuffer[i] = 0; + + for(int i=numValidConstraints; i<numConstraints; i++) + { + + int bodyAS = cs[i].m_bodyAPtrAndSignBit; + int bodyBS = cs[i].m_bodyBPtrAndSignBit; + int bodyA = abs(bodyAS); + int bodyB = abs(bodyBS); + bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx; + bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx; + int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA); + int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB); + + if( aUnavailable==0 && bUnavailable==0 ) // ok + { + if (!aIsStatic) + { + writeBuf( ldsFixedBuffer, bodyA ); + } + if (!bIsStatic) + { + writeBuf( ldsFixedBuffer, bodyB ); + } + + cs[i].m_batchIdx = batchIdx; + + if (i!=numValidConstraints) + { + + tmp = cs[i]; + cs[i] = cs[numValidConstraints]; + cs[numValidConstraints] = tmp; + + + } + + numValidConstraints++; + + nCurrentBatch++; + if( nCurrentBatch == SIMD_WIDTH) + { + nCurrentBatch = 0; + for(int i=0; i<CHECK_SIZE; i++) + ldsFixedBuffer[i] = 0; + + } + } + }//for + batchIdx ++; + }//while + + batchSizes[wgIdx] = batchIdx; + + }//if( lIdx == 0 ) + + //return batchIdx; +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h new file mode 100644 index 0000000000..1e5957adae --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h @@ -0,0 +1,291 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* batchingKernelsNewCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile __global int*\n" +"#endif\n" +"#define SIMD_WIDTH 64\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"#define WG_SIZE 64\n" +"typedef struct \n" +"{\n" +" int m_n;\n" +" int m_start;\n" +" int m_staticIdx;\n" +" int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct \n" +"{\n" +" int m_a;\n" +" int m_b;\n" +" u32 m_idx;\n" +"}Elem;\n" +"// batching on the GPU\n" +"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n" +"{\n" +" int wgIdx = GET_GROUP_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" \n" +" const int m_n = gN[wgIdx];\n" +" const int m_start = gStart[wgIdx];\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" for (int i=0;i<m_n;i++)\n" +" {\n" +" int srcIdx = i+m_start;\n" +" int batchIndex = i;\n" +" gConstraints[ srcIdx ].m_batchIdx = batchIndex; \n" +" }\n" +" }\n" +"}\n" +"#define CHECK_SIZE (WG_SIZE)\n" +"u32 readBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" return buff[bufIdx] & (1<<bitIdx);\n" +"}\n" +"void writeBuf(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" buff[bufIdx] |= (1<<bitIdx);\n" +" //atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +"}\n" +"u32 tryWrite(__local u32* buff, int idx)\n" +"{\n" +" idx = idx % (32*CHECK_SIZE);\n" +" int bitIdx = idx%32;\n" +" int bufIdx = idx/32;\n" +" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n" +" return ((ans >> bitIdx)&1) == 0;\n" +"}\n" +"// batching on the GPU\n" +"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n" +"{\n" +" int wgIdx = GET_GROUP_IDX;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" const int numConstraints = gN[wgIdx];\n" +" const int m_start = gStart[wgIdx];\n" +" b3Contact4Data_t tmp;\n" +" \n" +" __local u32 ldsFixedBuffer[CHECK_SIZE];\n" +" \n" +" \n" +" \n" +" \n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" \n" +" \n" +" __global struct b3Contact4Data* cs = &gConstraints[m_start]; \n" +" \n" +" \n" +" int numValidConstraints = 0;\n" +" int batchIdx = 0;\n" +" while( numValidConstraints < numConstraints)\n" +" {\n" +" int nCurrentBatch = 0;\n" +" // clear flag\n" +" \n" +" for(int i=0; i<CHECK_SIZE; i++) \n" +" ldsFixedBuffer[i] = 0; \n" +" for(int i=numValidConstraints; i<numConstraints; i++)\n" +" {\n" +" int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n" +" int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n" +" int bodyA = abs(bodyAS);\n" +" int bodyB = abs(bodyBS);\n" +" bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n" +" bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n" +" int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n" +" int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n" +" \n" +" if( aUnavailable==0 && bUnavailable==0 ) // ok\n" +" {\n" +" if (!aIsStatic)\n" +" {\n" +" writeBuf( ldsFixedBuffer, bodyA );\n" +" }\n" +" if (!bIsStatic)\n" +" {\n" +" writeBuf( ldsFixedBuffer, bodyB );\n" +" }\n" +" cs[i].m_batchIdx = batchIdx;\n" +" if (i!=numValidConstraints)\n" +" {\n" +" tmp = cs[i];\n" +" cs[i] = cs[numValidConstraints];\n" +" cs[numValidConstraints] = tmp;\n" +" }\n" +" numValidConstraints++;\n" +" \n" +" nCurrentBatch++;\n" +" if( nCurrentBatch == SIMD_WIDTH)\n" +" {\n" +" nCurrentBatch = 0;\n" +" for(int i=0; i<CHECK_SIZE; i++) \n" +" ldsFixedBuffer[i] = 0;\n" +" \n" +" }\n" +" }\n" +" }//for\n" +" batchIdx ++;\n" +" }//while\n" +" \n" +" batchSizes[wgIdx] = batchIdx;\n" +" }//if( lIdx == 0 )\n" +" \n" +" //return batchIdx;\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl new file mode 100644 index 0000000000..e22bc9bc33 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl @@ -0,0 +1,32 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h" + +#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h" + + + +__kernel void + integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration) +{ + int nodeID = get_global_id(0); + + if( nodeID < numNodes) + { + integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration); + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h new file mode 100644 index 0000000000..a5a432947c --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h @@ -0,0 +1,433 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* integrateKernelCL= \ +"/*\n" +"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" +"{\n" +" \n" +" if (bodies[nodeID].m_invMass != 0.f)\n" +" {\n" +" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" +" //angular velocity\n" +" {\n" +" b3Float4 axis;\n" +" //add some hardcoded angular damping\n" +" bodies[nodeID].m_angVel.x *= angularDamping;\n" +" bodies[nodeID].m_angVel.y *= angularDamping;\n" +" bodies[nodeID].m_angVel.z *= angularDamping;\n" +" \n" +" b3Float4 angvel = bodies[nodeID].m_angVel;\n" +" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" +" \n" +" //limit the angular motion\n" +" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" +" {\n" +" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" +" }\n" +" if(fAngle < 0.001f)\n" +" {\n" +" // use Taylor's expansions of sync function\n" +" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" +" }\n" +" else\n" +" {\n" +" // sync(fAngle) = sin(c*fAngle)/t\n" +" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" +" }\n" +" \n" +" b3Quat dorn;\n" +" dorn.x = axis.x;\n" +" dorn.y = axis.y;\n" +" dorn.z = axis.z;\n" +" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" +" b3Quat orn0 = bodies[nodeID].m_quat;\n" +" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" +" predictedOrn = b3QuatNormalized(predictedOrn);\n" +" bodies[nodeID].m_quat=predictedOrn;\n" +" }\n" +" //linear velocity \n" +" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n" +" \n" +" //apply gravity\n" +" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n" +" \n" +" }\n" +" \n" +"}\n" +"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n" +"{\n" +" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n" +" \n" +" if( (body->m_invMass != 0.f))\n" +" {\n" +" //angular velocity\n" +" {\n" +" b3Float4 axis;\n" +" //add some hardcoded angular damping\n" +" body->m_angVel.x *= angularDamping;\n" +" body->m_angVel.y *= angularDamping;\n" +" body->m_angVel.z *= angularDamping;\n" +" \n" +" b3Float4 angvel = body->m_angVel;\n" +" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n" +" //limit the angular motion\n" +" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n" +" {\n" +" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n" +" }\n" +" if(fAngle < 0.001f)\n" +" {\n" +" // use Taylor's expansions of sync function\n" +" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n" +" }\n" +" else\n" +" {\n" +" // sync(fAngle) = sin(c*fAngle)/t\n" +" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n" +" }\n" +" b3Quat dorn;\n" +" dorn.x = axis.x;\n" +" dorn.y = axis.y;\n" +" dorn.z = axis.z;\n" +" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n" +" b3Quat orn0 = body->m_quat;\n" +" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n" +" predictedOrn = b3QuatNormalized(predictedOrn);\n" +" body->m_quat=predictedOrn;\n" +" }\n" +" //apply gravity\n" +" body->m_linVel += gravityAcceleration * timeStep;\n" +" //linear velocity \n" +" body->m_pos += body->m_linVel * timeStep;\n" +" \n" +" }\n" +" \n" +"}\n" +"__kernel void \n" +" integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n" +"{\n" +" int nodeID = get_global_id(0);\n" +" \n" +" if( nodeID < numNodes)\n" +" {\n" +" integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl new file mode 100644 index 0000000000..7f5dabe274 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl @@ -0,0 +1,877 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#define B3_CONSTRAINT_FLAG_ENABLED 1 + +#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3 +#define B3_GPU_FIXED_CONSTRAINT_TYPE 4 + +#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails +#define B3_INFINITY 1e30f + +#define mymake_float4 (float4) + + +__inline float dot3F4(float4 a, float4 b) +{ + float4 a1 = mymake_float4(a.xyz,0.f); + float4 b1 = mymake_float4(b.xyz,0.f); + return dot(a1, b1); +} + + +typedef float4 Quaternion; + + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + + + +typedef struct +{ + Matrix3x3 m_invInertiaWorld; + Matrix3x3 m_initInvInertia; +} BodyInertia; + + +typedef struct +{ + Matrix3x3 m_basis;//orientation + float4 m_origin;//transform +}b3Transform; + +typedef struct +{ +// b3Transform m_worldTransformUnused; + float4 m_deltaLinearVelocity; + float4 m_deltaAngularVelocity; + float4 m_angularFactor; + float4 m_linearFactor; + float4 m_invMass; + float4 m_pushVelocity; + float4 m_turnVelocity; + float4 m_linearVelocity; + float4 m_angularVelocity; + + union + { + void* m_originalBody; + int m_originalBodyIndex; + }; + int padding[3]; + +} b3GpuSolverBody; + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + unsigned int m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} b3RigidBodyCL; + +typedef struct +{ + + float4 m_relpos1CrossNormal; + float4 m_contactNormal; + + float4 m_relpos2CrossNormal; + //float4 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal + + float4 m_angularComponentA; + float4 m_angularComponentB; + + float m_appliedPushImpulse; + float m_appliedImpulse; + int m_padding1; + int m_padding2; + float m_friction; + float m_jacDiagABInv; + float m_rhs; + float m_cfm; + + float m_lowerLimit; + float m_upperLimit; + float m_rhsPenetration; + int m_originalConstraint; + + + int m_overrideNumSolverIterations; + int m_frictionIndex; + int m_solverBodyIdA; + int m_solverBodyIdB; + +} b3SolverConstraint; + +typedef struct +{ + int m_bodyAPtrAndSignBit; + int m_bodyBPtrAndSignBit; + int m_originalConstraintIndex; + int m_batchId; +} b3BatchConstraint; + + + + + + +typedef struct +{ + int m_constraintType; + int m_rbA; + int m_rbB; + float m_breakingImpulseThreshold; + + float4 m_pivotInA; + float4 m_pivotInB; + Quaternion m_relTargetAB; + + int m_flags; + int m_padding[3]; +} b3GpuGenericConstraint; + + +/*b3Transform getWorldTransform(b3RigidBodyCL* rb) +{ + b3Transform newTrans; + newTrans.setOrigin(rb->m_pos); + newTrans.setRotation(rb->m_quat); + return newTrans; +}*/ + + + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float4 fastNormalize4(float4 v) +{ + v = mymake_float4(v.xyz,0.f); + return fast_normalize(v); +} + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + + +__inline void internalApplyImpulse(__global b3GpuSolverBody* body, float4 linearComponent, float4 angularComponent,float impulseMagnitude) +{ + body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; + body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); +} + + +void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c) +{ + float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm; + float deltaVel1Dotn = dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) + dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); + float deltaVel2Dotn = -dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); + + deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv; + deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv; + + float sum = c->m_appliedImpulse + deltaImpulse; + if (sum < c->m_lowerLimit) + { + deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse; + c->m_appliedImpulse = c->m_lowerLimit; + } + else if (sum > c->m_upperLimit) + { + deltaImpulse = c->m_upperLimit-c->m_appliedImpulse; + c->m_appliedImpulse = c->m_upperLimit; + } + else + { + c->m_appliedImpulse = sum; + } + + internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); + internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); + +} + +__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies, + __global b3BatchConstraint* batchConstraints, + __global b3SolverConstraint* rows, + __global unsigned int* numConstraintRowsInfo1, + __global unsigned int* rowOffsets, + __global b3GpuGenericConstraint* constraints, + int batchOffset, + int numConstraintsInBatch + ) +{ + int b = get_global_id(0); + if (b>=numConstraintsInBatch) + return; + + __global b3BatchConstraint* c = &batchConstraints[b+batchOffset]; + int originalConstraintIndex = c->m_originalConstraintIndex; + if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED) + { + int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex]; + int rowOffset = rowOffsets[originalConstraintIndex]; + for (int jj=0;jj<numConstraintRows;jj++) + { + __global b3SolverConstraint* constraint = &rows[rowOffset+jj]; + resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint); + } + } +}; + +__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies) +{ + int i = get_global_id(0); + if (i>=numBodies) + return; + + __global b3GpuSolverBody* solverBody = &solverBodies[i]; + __global b3RigidBodyCL* bodyCL = &bodiesCL[i]; + + solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_deltaAngularVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f); + solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f); + solverBody->m_originalBodyIndex = i; + solverBody->m_angularFactor = (float4)(1,1,1,0); + solverBody->m_linearFactor = (float4) (1,1,1,0); + solverBody->m_linearVelocity = bodyCL->m_linVel; + solverBody->m_angularVelocity = bodyCL->m_angVel; +} + +__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints) +{ + int cid = get_global_id(0); + if (cid>=numConstraints) + return; + int numRows = numConstraintRows[cid]; + if (numRows) + { + for (int i=0;i<numRows;i++) + { + int rowIndex = rowOffsets[cid]+i; + float breakingThreshold = constraints[cid].m_breakingImpulseThreshold; + if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold) + { + constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED; + } + } + } +} + + + +__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints) +{ + int i = get_global_id(0); + if (i>=numConstraints) + return; + + __global b3GpuGenericConstraint* constraint = &constraints[i]; + + switch (constraint->m_constraintType) + { + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + { + infos[i] = 3; + break; + } + case B3_GPU_FIXED_CONSTRAINT_TYPE: + { + infos[i] = 6; + break; + } + default: + { + } + } +} + +__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, + __global b3BatchConstraint* batchConstraints, + __global b3GpuGenericConstraint* constraints, + __global b3RigidBodyCL* bodies, + int numConstraints) +{ + int i = get_global_id(0); + if (i>=numConstraints) + return; + + int rbA = constraints[i].m_rbA; + int rbB = constraints[i].m_rbB; + + batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA; + batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? rbB : -rbB; + batchConstraints[i].m_batchId = -1; + batchConstraints[i].m_originalConstraintIndex = i; + +} + + + + +typedef struct +{ + // integrator parameters: frames per second (1/stepsize), default error + // reduction parameter (0..1). + float fps,erp; + + // for the first and second body, pointers to two (linear and angular) + // n*3 jacobian sub matrices, stored by rows. these matrices will have + // been initialized to 0 on entry. if the second body is zero then the + // J2xx pointers may be 0. + union + { + __global float4* m_J1linearAxisFloat4; + __global float* m_J1linearAxis; + }; + union + { + __global float4* m_J1angularAxisFloat4; + __global float* m_J1angularAxis; + + }; + union + { + __global float4* m_J2linearAxisFloat4; + __global float* m_J2linearAxis; + }; + union + { + __global float4* m_J2angularAxisFloat4; + __global float* m_J2angularAxis; + }; + // elements to jump from one row to the next in J's + int rowskip; + + // right hand sides of the equation J*v = c + cfm * lambda. cfm is the + // "constraint force mixing" vector. c is set to zero on entry, cfm is + // set to a constant value (typically very small or zero) value on entry. + __global float* m_constraintError; + __global float* cfm; + + // lo and hi limits for variables (set to -/+ infinity on entry). + __global float* m_lowerLimit; + __global float* m_upperLimit; + + // findex vector for variables. see the LCP solver interface for a + // description of what this does. this is set to -1 on entry. + // note that the returned indexes are relative to the first index of + // the constraint. + __global int *findex; + // number of solver iterations + int m_numIterations; + + //damping of the velocity + float m_damping; +} b3GpuConstraintInfo2; + + +void getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2) +{ + *v0 = (float4)(0. ,-vecIn.z ,vecIn.y,0.f); + *v1 = (float4)(vecIn.z ,0. ,-vecIn.x,0.f); + *v2 = (float4)(-vecIn.y ,vecIn.x ,0.f,0.f); +} + + +void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies) +{ + float4 posA = bodies[constraint->m_rbA].m_pos; + Quaternion rotA = bodies[constraint->m_rbA].m_quat; + + float4 posB = bodies[constraint->m_rbB].m_pos; + Quaternion rotB = bodies[constraint->m_rbB].m_quat; + + + + // anchor points in global coordinates with respect to body PORs. + + // set jacobian + info->m_J1linearAxis[0] = 1; + info->m_J1linearAxis[info->rowskip+1] = 1; + info->m_J1linearAxis[2*info->rowskip+2] = 1; + + float4 a1 = qtRotate(rotA,constraint->m_pivotInA); + + { + __global float4* angular0 = (__global float4*)(info->m_J1angularAxis); + __global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip); + __global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip); + float4 a1neg = -a1; + getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2); + } + if (info->m_J2linearAxis) + { + info->m_J2linearAxis[0] = -1; + info->m_J2linearAxis[info->rowskip+1] = -1; + info->m_J2linearAxis[2*info->rowskip+2] = -1; + } + + float4 a2 = qtRotate(rotB,constraint->m_pivotInB); + + { + // float4 a2n = -a2; + __global float4* angular0 = (__global float4*)(info->m_J2angularAxis); + __global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip); + __global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip); + getSkewSymmetricMatrix(a2,angular0,angular1,angular2); + } + + // set right hand side +// float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp; + float currERP = info->erp; + + float k = info->fps * currERP; + int j; + float4 result = a2 + posB - a1 - posA; + float* resultPtr = &result; + + for (j=0; j<3; j++) + { + info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]); + } +} + +Quaternion nearest( Quaternion first, Quaternion qd) +{ + Quaternion diff,sum; + diff = first- qd; + sum = first + qd; + + if( dot(diff,diff) < dot(sum,sum) ) + return qd; + return (-qd); +} + +float b3Acos(float x) +{ + if (x<-1) + x=-1; + if (x>1) + x=1; + return acos(x); +} + +float getAngle(Quaternion orn) +{ + if (orn.w>=1.f) + orn.w=1.f; + float s = 2.f * b3Acos(orn.w); + return s; +} + +void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle) +{ + Quaternion orn1 = nearest(orn0,orn1a); + + Quaternion dorn = qtMul(orn1,qtInvert(orn0)); + *angle = getAngle(dorn); + *axis = (float4)(dorn.x,dorn.y,dorn.z,0.f); + + //check for axis length + float len = dot3F4(*axis,*axis); + if (len < FLT_EPSILON*FLT_EPSILON) + *axis = (float4)(1,0,0,0); + else + *axis /= sqrt(len); +} + + + +void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row) +{ + Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat; + Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat; + + int s = info->rowskip; + int start_index = start_row * s; + + // 3 rows to make body rotations equal + info->m_J1angularAxis[start_index] = 1; + info->m_J1angularAxis[start_index + s + 1] = 1; + info->m_J1angularAxis[start_index + s*2+2] = 1; + if ( info->m_J2angularAxis) + { + info->m_J2angularAxis[start_index] = -1; + info->m_J2angularAxis[start_index + s+1] = -1; + info->m_J2angularAxis[start_index + s*2+2] = -1; + } + + float currERP = info->erp; + float k = info->fps * currERP; + float4 diff; + float angle; + float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB)); + + calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle); + diff*=-angle; + + float* resultPtr = &diff; + + for (int j=0; j<3; j++) + { + info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j]; + } + + +} + + +__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies) +{ + int i = get_global_id(0); + if (i>=numBodies) + return; + + if (bodies[i].m_invMass) + { +// if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP) + { + bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity; + } +// if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP) + { + bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity; + } + } +} + + +__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, + __global unsigned int* infos, + __global unsigned int* constraintRowOffsets, + __global b3GpuGenericConstraint* constraints, + __global b3BatchConstraint* batchConstraints, + __global b3RigidBodyCL* bodies, + __global BodyInertia* inertias, + __global b3GpuSolverBody* solverBodies, + float timeStep, + float globalErp, + float globalCfm, + float globalDamping, + int globalNumIterations, + int numConstraints) +{ + + int i = get_global_id(0); + if (i>=numConstraints) + return; + + //for now, always initialize the batch info + int info1 = infos[i]; + + __global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]]; + __global b3GpuGenericConstraint* constraint = &constraints[i]; + + __global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA]; + __global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB]; + + int solverBodyIdA = constraint->m_rbA; + int solverBodyIdB = constraint->m_rbB; + + __global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA]; + __global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB]; + + + if (rbA->m_invMass) + { + batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA; + } else + { +// if (!solverBodyIdA) +// m_staticIdx = 0; + batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA; + } + + if (rbB->m_invMass) + { + batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB; + } else + { +// if (!solverBodyIdB) +// m_staticIdx = 0; + batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB; + } + + if (info1) + { + int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations; +// if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations) + // m_maxOverrideNumSolverIterations = overrideNumSolverIterations; + + + int j; + for ( j=0;j<info1;j++) + { +// memset(¤tConstraintRow[j],0,sizeof(b3SolverConstraint)); + currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0); + currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0); + currentConstraintRow[j].m_appliedImpulse = 0.f; + currentConstraintRow[j].m_appliedPushImpulse = 0.f; + currentConstraintRow[j].m_cfm = 0.f; + currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0); + currentConstraintRow[j].m_friction = 0.f; + currentConstraintRow[j].m_frictionIndex = 0; + currentConstraintRow[j].m_jacDiagABInv = 0.f; + currentConstraintRow[j].m_lowerLimit = 0.f; + currentConstraintRow[j].m_upperLimit = 0.f; + + currentConstraintRow[j].m_originalConstraint = i; + currentConstraintRow[j].m_overrideNumSolverIterations = 0; + currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0); + currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0); + currentConstraintRow[j].m_rhs = 0.f; + currentConstraintRow[j].m_rhsPenetration = 0.f; + currentConstraintRow[j].m_solverBodyIdA = 0; + currentConstraintRow[j].m_solverBodyIdB = 0; + + currentConstraintRow[j].m_lowerLimit = -B3_INFINITY; + currentConstraintRow[j].m_upperLimit = B3_INFINITY; + currentConstraintRow[j].m_appliedImpulse = 0.f; + currentConstraintRow[j].m_appliedPushImpulse = 0.f; + currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA; + currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB; + currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; + } + + bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0); + bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0); + bodyAPtr->m_pushVelocity = (float4)(0,0,0,0); + bodyAPtr->m_turnVelocity = (float4)(0,0,0,0); + bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0); + bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0); + bodyBPtr->m_pushVelocity = (float4)(0,0,0,0); + bodyBPtr->m_turnVelocity = (float4)(0,0,0,0); + + int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this + + + + + b3GpuConstraintInfo2 info2; + info2.fps = 1.f/timeStep; + info2.erp = globalErp; + info2.m_J1linearAxisFloat4 = ¤tConstraintRow->m_contactNormal; + info2.m_J1angularAxisFloat4 = ¤tConstraintRow->m_relpos1CrossNormal; + info2.m_J2linearAxisFloat4 = 0; + info2.m_J2angularAxisFloat4 = ¤tConstraintRow->m_relpos2CrossNormal; + info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this + + ///the size of b3SolverConstraint needs be a multiple of float +// b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint)); + info2.m_constraintError = ¤tConstraintRow->m_rhs; + currentConstraintRow->m_cfm = globalCfm; + info2.m_damping = globalDamping; + info2.cfm = ¤tConstraintRow->m_cfm; + info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit; + info2.m_upperLimit = ¤tConstraintRow->m_upperLimit; + info2.m_numIterations = globalNumIterations; + + switch (constraint->m_constraintType) + { + case B3_GPU_POINT2POINT_CONSTRAINT_TYPE: + { + getInfo2Point2Point(constraint,&info2,bodies); + break; + } + case B3_GPU_FIXED_CONSTRAINT_TYPE: + { + getInfo2Point2Point(constraint,&info2,bodies); + + getInfo2FixedOrientation(constraint,&info2,bodies,3); + + break; + } + + default: + { + } + } + + ///finalize the constraint setup + for ( j=0;j<info1;j++) + { + __global b3SolverConstraint* solverConstraint = ¤tConstraintRow[j]; + + if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold) + { + solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold; + } + + if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold) + { + solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold; + } + +// solverConstraint->m_originalContactPoint = constraint; + + Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld; + { + + //float4 angularFactorA(1,1,1); + float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal; + solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA; + } + + Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld; + { + + float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal; + solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor(); + } + + { + //it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal + //because it gets multiplied iMJlB + float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass; + float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA); + float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal? + float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB); + + float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal); + sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal); + sum += dot3F4(iMJlB,solverConstraint->m_contactNormal); + sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal); + float fsum = fabs(sum); + if (fsum>FLT_EPSILON) + { + solverConstraint->m_jacDiagABInv = 1.f/sum; + } else + { + solverConstraint->m_jacDiagABInv = 0.f; + } + } + + + ///fix rhs + ///todo: add force/torque accelerators + { + float rel_vel; + float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel); + float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel); + + rel_vel = vel1Dotn+vel2Dotn; + + float restitution = 0.f; + float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2 + float velocityError = restitution - rel_vel * info2.m_damping; + float penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv; + float velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv; + solverConstraint->m_rhs = penetrationImpulse+velocityImpulse; + solverConstraint->m_appliedImpulse = 0.f; + + } + } + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h new file mode 100644 index 0000000000..d48ecf6ea6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h @@ -0,0 +1,721 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solveConstraintRowsCL= \ +"/*\n" +"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#define B3_CONSTRAINT_FLAG_ENABLED 1\n" +"#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3\n" +"#define B3_GPU_FIXED_CONSTRAINT_TYPE 4\n" +"#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails\n" +"#define B3_INFINITY 1e30f\n" +"#define mymake_float4 (float4)\n" +"__inline float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = mymake_float4(a.xyz,0.f);\n" +" float4 b1 = mymake_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"typedef float4 Quaternion;\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertiaWorld;\n" +" Matrix3x3 m_initInvInertia;\n" +"} BodyInertia;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_basis;//orientation\n" +" float4 m_origin;//transform\n" +"}b3Transform;\n" +"typedef struct\n" +"{\n" +"// b3Transform m_worldTransformUnused;\n" +" float4 m_deltaLinearVelocity;\n" +" float4 m_deltaAngularVelocity;\n" +" float4 m_angularFactor;\n" +" float4 m_linearFactor;\n" +" float4 m_invMass;\n" +" float4 m_pushVelocity;\n" +" float4 m_turnVelocity;\n" +" float4 m_linearVelocity;\n" +" float4 m_angularVelocity;\n" +" union \n" +" {\n" +" void* m_originalBody;\n" +" int m_originalBodyIndex;\n" +" };\n" +" int padding[3];\n" +"} b3GpuSolverBody;\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" unsigned int m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} b3RigidBodyCL;\n" +"typedef struct\n" +"{\n" +" float4 m_relpos1CrossNormal;\n" +" float4 m_contactNormal;\n" +" float4 m_relpos2CrossNormal;\n" +" //float4 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal\n" +" float4 m_angularComponentA;\n" +" float4 m_angularComponentB;\n" +" \n" +" float m_appliedPushImpulse;\n" +" float m_appliedImpulse;\n" +" int m_padding1;\n" +" int m_padding2;\n" +" float m_friction;\n" +" float m_jacDiagABInv;\n" +" float m_rhs;\n" +" float m_cfm;\n" +" \n" +" float m_lowerLimit;\n" +" float m_upperLimit;\n" +" float m_rhsPenetration;\n" +" int m_originalConstraint;\n" +" int m_overrideNumSolverIterations;\n" +" int m_frictionIndex;\n" +" int m_solverBodyIdA;\n" +" int m_solverBodyIdB;\n" +"} b3SolverConstraint;\n" +"typedef struct \n" +"{\n" +" int m_bodyAPtrAndSignBit;\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_originalConstraintIndex;\n" +" int m_batchId;\n" +"} b3BatchConstraint;\n" +"typedef struct \n" +"{\n" +" int m_constraintType;\n" +" int m_rbA;\n" +" int m_rbB;\n" +" float m_breakingImpulseThreshold;\n" +" float4 m_pivotInA;\n" +" float4 m_pivotInB;\n" +" Quaternion m_relTargetAB;\n" +" int m_flags;\n" +" int m_padding[3];\n" +"} b3GpuGenericConstraint;\n" +"/*b3Transform getWorldTransform(b3RigidBodyCL* rb)\n" +"{\n" +" b3Transform newTrans;\n" +" newTrans.setOrigin(rb->m_pos);\n" +" newTrans.setRotation(rb->m_quat);\n" +" return newTrans;\n" +"}*/\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" v = mymake_float4(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline void internalApplyImpulse(__global b3GpuSolverBody* body, float4 linearComponent, float4 angularComponent,float impulseMagnitude)\n" +"{\n" +" body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;\n" +" body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);\n" +"}\n" +"void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c)\n" +"{\n" +" float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm;\n" +" float deltaVel1Dotn = dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) + dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);\n" +" float deltaVel2Dotn = -dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity);\n" +" deltaImpulse -= deltaVel1Dotn*c->m_jacDiagABInv;\n" +" deltaImpulse -= deltaVel2Dotn*c->m_jacDiagABInv;\n" +" float sum = c->m_appliedImpulse + deltaImpulse;\n" +" if (sum < c->m_lowerLimit)\n" +" {\n" +" deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse;\n" +" c->m_appliedImpulse = c->m_lowerLimit;\n" +" }\n" +" else if (sum > c->m_upperLimit) \n" +" {\n" +" deltaImpulse = c->m_upperLimit-c->m_appliedImpulse;\n" +" c->m_appliedImpulse = c->m_upperLimit;\n" +" }\n" +" else\n" +" {\n" +" c->m_appliedImpulse = sum;\n" +" }\n" +" internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n" +" internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n" +"}\n" +"__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,\n" +" __global b3BatchConstraint* batchConstraints,\n" +" __global b3SolverConstraint* rows,\n" +" __global unsigned int* numConstraintRowsInfo1, \n" +" __global unsigned int* rowOffsets,\n" +" __global b3GpuGenericConstraint* constraints,\n" +" int batchOffset,\n" +" int numConstraintsInBatch\n" +" )\n" +"{\n" +" int b = get_global_id(0);\n" +" if (b>=numConstraintsInBatch)\n" +" return;\n" +" __global b3BatchConstraint* c = &batchConstraints[b+batchOffset];\n" +" int originalConstraintIndex = c->m_originalConstraintIndex;\n" +" if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED)\n" +" {\n" +" int numConstraintRows = numConstraintRowsInfo1[originalConstraintIndex];\n" +" int rowOffset = rowOffsets[originalConstraintIndex];\n" +" for (int jj=0;jj<numConstraintRows;jj++)\n" +" {\n" +" __global b3SolverConstraint* constraint = &rows[rowOffset+jj];\n" +" resolveSingleConstraintRowGeneric(&solverBodies[constraint->m_solverBodyIdA],&solverBodies[constraint->m_solverBodyIdB],constraint);\n" +" }\n" +" }\n" +"};\n" +"__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numBodies)\n" +" return;\n" +" __global b3GpuSolverBody* solverBody = &solverBodies[i];\n" +" __global b3RigidBodyCL* bodyCL = &bodiesCL[i];\n" +" solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_deltaAngularVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n" +" solverBody->m_invMass = (float4)(bodyCL->m_invMass,bodyCL->m_invMass,bodyCL->m_invMass,0.f);\n" +" solverBody->m_originalBodyIndex = i;\n" +" solverBody->m_angularFactor = (float4)(1,1,1,0);\n" +" solverBody->m_linearFactor = (float4) (1,1,1,0);\n" +" solverBody->m_linearVelocity = bodyCL->m_linVel;\n" +" solverBody->m_angularVelocity = bodyCL->m_angVel;\n" +"}\n" +"__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints)\n" +"{\n" +" int cid = get_global_id(0);\n" +" if (cid>=numConstraints)\n" +" return;\n" +" int numRows = numConstraintRows[cid];\n" +" if (numRows)\n" +" {\n" +" for (int i=0;i<numRows;i++)\n" +" {\n" +" int rowIndex = rowOffsets[cid]+i;\n" +" float breakingThreshold = constraints[cid].m_breakingImpulseThreshold;\n" +" if (fabs(rows[rowIndex].m_appliedImpulse) >= breakingThreshold)\n" +" {\n" +" constraints[cid].m_flags =0;//&= ~B3_CONSTRAINT_FLAG_ENABLED;\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConstraints)\n" +" return;\n" +" __global b3GpuGenericConstraint* constraint = &constraints[i];\n" +" switch (constraint->m_constraintType)\n" +" {\n" +" case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" +" {\n" +" infos[i] = 3;\n" +" break;\n" +" }\n" +" case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" +" {\n" +" infos[i] = 6;\n" +" break;\n" +" }\n" +" default:\n" +" {\n" +" }\n" +" }\n" +"}\n" +"__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, \n" +" __global b3BatchConstraint* batchConstraints, \n" +" __global b3GpuGenericConstraint* constraints,\n" +" __global b3RigidBodyCL* bodies,\n" +" int numConstraints)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConstraints)\n" +" return;\n" +" int rbA = constraints[i].m_rbA;\n" +" int rbB = constraints[i].m_rbB;\n" +" batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass != 0.f ? rbA : -rbA;\n" +" batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass != 0.f ? rbB : -rbB;\n" +" batchConstraints[i].m_batchId = -1;\n" +" batchConstraints[i].m_originalConstraintIndex = i;\n" +"}\n" +"typedef struct\n" +"{\n" +" // integrator parameters: frames per second (1/stepsize), default error\n" +" // reduction parameter (0..1).\n" +" float fps,erp;\n" +" // for the first and second body, pointers to two (linear and angular)\n" +" // n*3 jacobian sub matrices, stored by rows. these matrices will have\n" +" // been initialized to 0 on entry. if the second body is zero then the\n" +" // J2xx pointers may be 0.\n" +" union \n" +" {\n" +" __global float4* m_J1linearAxisFloat4;\n" +" __global float* m_J1linearAxis;\n" +" };\n" +" union\n" +" {\n" +" __global float4* m_J1angularAxisFloat4;\n" +" __global float* m_J1angularAxis;\n" +" };\n" +" union\n" +" {\n" +" __global float4* m_J2linearAxisFloat4;\n" +" __global float* m_J2linearAxis;\n" +" };\n" +" union\n" +" {\n" +" __global float4* m_J2angularAxisFloat4;\n" +" __global float* m_J2angularAxis;\n" +" };\n" +" // elements to jump from one row to the next in J's\n" +" int rowskip;\n" +" // right hand sides of the equation J*v = c + cfm * lambda. cfm is the\n" +" // \"constraint force mixing\" vector. c is set to zero on entry, cfm is\n" +" // set to a constant value (typically very small or zero) value on entry.\n" +" __global float* m_constraintError;\n" +" __global float* cfm;\n" +" // lo and hi limits for variables (set to -/+ infinity on entry).\n" +" __global float* m_lowerLimit;\n" +" __global float* m_upperLimit;\n" +" // findex vector for variables. see the LCP solver interface for a\n" +" // description of what this does. this is set to -1 on entry.\n" +" // note that the returned indexes are relative to the first index of\n" +" // the constraint.\n" +" __global int *findex;\n" +" // number of solver iterations\n" +" int m_numIterations;\n" +" //damping of the velocity\n" +" float m_damping;\n" +"} b3GpuConstraintInfo2;\n" +"void getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2)\n" +"{\n" +" *v0 = (float4)(0. ,-vecIn.z ,vecIn.y,0.f);\n" +" *v1 = (float4)(vecIn.z ,0. ,-vecIn.x,0.f);\n" +" *v2 = (float4)(-vecIn.y ,vecIn.x ,0.f,0.f);\n" +"}\n" +"void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies)\n" +"{\n" +" float4 posA = bodies[constraint->m_rbA].m_pos;\n" +" Quaternion rotA = bodies[constraint->m_rbA].m_quat;\n" +" float4 posB = bodies[constraint->m_rbB].m_pos;\n" +" Quaternion rotB = bodies[constraint->m_rbB].m_quat;\n" +" // anchor points in global coordinates with respect to body PORs.\n" +" \n" +" // set jacobian\n" +" info->m_J1linearAxis[0] = 1;\n" +" info->m_J1linearAxis[info->rowskip+1] = 1;\n" +" info->m_J1linearAxis[2*info->rowskip+2] = 1;\n" +" float4 a1 = qtRotate(rotA,constraint->m_pivotInA);\n" +" {\n" +" __global float4* angular0 = (__global float4*)(info->m_J1angularAxis);\n" +" __global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);\n" +" __global float4* angular2 = (__global float4*)(info->m_J1angularAxis+2*info->rowskip);\n" +" float4 a1neg = -a1;\n" +" getSkewSymmetricMatrix(a1neg,angular0,angular1,angular2);\n" +" }\n" +" if (info->m_J2linearAxis)\n" +" {\n" +" info->m_J2linearAxis[0] = -1;\n" +" info->m_J2linearAxis[info->rowskip+1] = -1;\n" +" info->m_J2linearAxis[2*info->rowskip+2] = -1;\n" +" }\n" +" \n" +" float4 a2 = qtRotate(rotB,constraint->m_pivotInB);\n" +" \n" +" {\n" +" // float4 a2n = -a2;\n" +" __global float4* angular0 = (__global float4*)(info->m_J2angularAxis);\n" +" __global float4* angular1 = (__global float4*)(info->m_J2angularAxis+info->rowskip);\n" +" __global float4* angular2 = (__global float4*)(info->m_J2angularAxis+2*info->rowskip);\n" +" getSkewSymmetricMatrix(a2,angular0,angular1,angular2);\n" +" }\n" +" \n" +" // set right hand side\n" +"// float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;\n" +" float currERP = info->erp;\n" +" float k = info->fps * currERP;\n" +" int j;\n" +" float4 result = a2 + posB - a1 - posA;\n" +" float* resultPtr = &result;\n" +" for (j=0; j<3; j++)\n" +" {\n" +" info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]);\n" +" }\n" +"}\n" +"Quaternion nearest( Quaternion first, Quaternion qd)\n" +"{\n" +" Quaternion diff,sum;\n" +" diff = first- qd;\n" +" sum = first + qd;\n" +" \n" +" if( dot(diff,diff) < dot(sum,sum) )\n" +" return qd;\n" +" return (-qd);\n" +"}\n" +"float b3Acos(float x) \n" +"{ \n" +" if (x<-1) \n" +" x=-1; \n" +" if (x>1) \n" +" x=1;\n" +" return acos(x); \n" +"}\n" +"float getAngle(Quaternion orn)\n" +"{\n" +" if (orn.w>=1.f)\n" +" orn.w=1.f;\n" +" float s = 2.f * b3Acos(orn.w);\n" +" return s;\n" +"}\n" +"void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle)\n" +"{\n" +" Quaternion orn1 = nearest(orn0,orn1a);\n" +" \n" +" Quaternion dorn = qtMul(orn1,qtInvert(orn0));\n" +" *angle = getAngle(dorn);\n" +" *axis = (float4)(dorn.x,dorn.y,dorn.z,0.f);\n" +" \n" +" //check for axis length\n" +" float len = dot3F4(*axis,*axis);\n" +" if (len < FLT_EPSILON*FLT_EPSILON)\n" +" *axis = (float4)(1,0,0,0);\n" +" else\n" +" *axis /= sqrt(len);\n" +"}\n" +"void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row)\n" +"{\n" +" Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;\n" +" Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat;\n" +" int s = info->rowskip;\n" +" int start_index = start_row * s;\n" +" // 3 rows to make body rotations equal\n" +" info->m_J1angularAxis[start_index] = 1;\n" +" info->m_J1angularAxis[start_index + s + 1] = 1;\n" +" info->m_J1angularAxis[start_index + s*2+2] = 1;\n" +" if ( info->m_J2angularAxis)\n" +" {\n" +" info->m_J2angularAxis[start_index] = -1;\n" +" info->m_J2angularAxis[start_index + s+1] = -1;\n" +" info->m_J2angularAxis[start_index + s*2+2] = -1;\n" +" }\n" +" \n" +" float currERP = info->erp;\n" +" float k = info->fps * currERP;\n" +" float4 diff;\n" +" float angle;\n" +" float4 qrelCur = qtMul(worldOrnA,qtInvert(worldOrnB));\n" +" \n" +" calculateDiffAxisAngleQuaternion(constraint->m_relTargetAB,qrelCur,&diff,&angle);\n" +" diff*=-angle;\n" +" \n" +" float* resultPtr = &diff;\n" +" \n" +" for (int j=0; j<3; j++)\n" +" {\n" +" info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j];\n" +" }\n" +" \n" +"}\n" +"__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numBodies)\n" +" return;\n" +" if (bodies[i].m_invMass)\n" +" {\n" +"// if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)\n" +" {\n" +" bodies[i].m_linVel += solverBodies[i].m_deltaLinearVelocity;\n" +" }\n" +"// if (length(solverBodies[i].m_deltaAngularVelocity)<MOTIONCLAMP)\n" +" {\n" +" bodies[i].m_angVel += solverBodies[i].m_deltaAngularVelocity;\n" +" } \n" +" }\n" +"}\n" +"__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, \n" +" __global unsigned int* infos, \n" +" __global unsigned int* constraintRowOffsets, \n" +" __global b3GpuGenericConstraint* constraints, \n" +" __global b3BatchConstraint* batchConstraints, \n" +" __global b3RigidBodyCL* bodies,\n" +" __global BodyInertia* inertias,\n" +" __global b3GpuSolverBody* solverBodies,\n" +" float timeStep,\n" +" float globalErp,\n" +" float globalCfm,\n" +" float globalDamping,\n" +" int globalNumIterations,\n" +" int numConstraints)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i>=numConstraints)\n" +" return;\n" +" \n" +" //for now, always initialize the batch info\n" +" int info1 = infos[i];\n" +" \n" +" __global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];\n" +" __global b3GpuGenericConstraint* constraint = &constraints[i];\n" +" __global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];\n" +" __global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];\n" +" int solverBodyIdA = constraint->m_rbA;\n" +" int solverBodyIdB = constraint->m_rbB;\n" +" __global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];\n" +" __global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];\n" +" if (rbA->m_invMass)\n" +" {\n" +" batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;\n" +" } else\n" +" {\n" +"// if (!solverBodyIdA)\n" +"// m_staticIdx = 0;\n" +" batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;\n" +" }\n" +" if (rbB->m_invMass)\n" +" {\n" +" batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;\n" +" } else\n" +" {\n" +"// if (!solverBodyIdB)\n" +"// m_staticIdx = 0;\n" +" batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;\n" +" }\n" +" if (info1)\n" +" {\n" +" int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;\n" +"// if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)\n" +" // m_maxOverrideNumSolverIterations = overrideNumSolverIterations;\n" +" int j;\n" +" for ( j=0;j<info1;j++)\n" +" {\n" +"// memset(¤tConstraintRow[j],0,sizeof(b3SolverConstraint));\n" +" currentConstraintRow[j].m_angularComponentA = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_angularComponentB = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_appliedImpulse = 0.f;\n" +" currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" +" currentConstraintRow[j].m_cfm = 0.f;\n" +" currentConstraintRow[j].m_contactNormal = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_friction = 0.f;\n" +" currentConstraintRow[j].m_frictionIndex = 0;\n" +" currentConstraintRow[j].m_jacDiagABInv = 0.f;\n" +" currentConstraintRow[j].m_lowerLimit = 0.f;\n" +" currentConstraintRow[j].m_upperLimit = 0.f;\n" +" currentConstraintRow[j].m_originalConstraint = i;\n" +" currentConstraintRow[j].m_overrideNumSolverIterations = 0;\n" +" currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_relpos2CrossNormal = (float4)(0,0,0,0);\n" +" currentConstraintRow[j].m_rhs = 0.f;\n" +" currentConstraintRow[j].m_rhsPenetration = 0.f;\n" +" currentConstraintRow[j].m_solverBodyIdA = 0;\n" +" currentConstraintRow[j].m_solverBodyIdB = 0;\n" +" \n" +" currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;\n" +" currentConstraintRow[j].m_upperLimit = B3_INFINITY;\n" +" currentConstraintRow[j].m_appliedImpulse = 0.f;\n" +" currentConstraintRow[j].m_appliedPushImpulse = 0.f;\n" +" currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;\n" +" currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;\n" +" currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations; \n" +" }\n" +" bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" +" bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" +" bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);\n" +" bodyAPtr->m_turnVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);\n" +" bodyBPtr->m_turnVelocity = (float4)(0,0,0,0);\n" +" int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" +" \n" +" b3GpuConstraintInfo2 info2;\n" +" info2.fps = 1.f/timeStep;\n" +" info2.erp = globalErp;\n" +" info2.m_J1linearAxisFloat4 = ¤tConstraintRow->m_contactNormal;\n" +" info2.m_J1angularAxisFloat4 = ¤tConstraintRow->m_relpos1CrossNormal;\n" +" info2.m_J2linearAxisFloat4 = 0;\n" +" info2.m_J2angularAxisFloat4 = ¤tConstraintRow->m_relpos2CrossNormal;\n" +" info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n" +" ///the size of b3SolverConstraint needs be a multiple of float\n" +"// b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));\n" +" info2.m_constraintError = ¤tConstraintRow->m_rhs;\n" +" currentConstraintRow->m_cfm = globalCfm;\n" +" info2.m_damping = globalDamping;\n" +" info2.cfm = ¤tConstraintRow->m_cfm;\n" +" info2.m_lowerLimit = ¤tConstraintRow->m_lowerLimit;\n" +" info2.m_upperLimit = ¤tConstraintRow->m_upperLimit;\n" +" info2.m_numIterations = globalNumIterations;\n" +" switch (constraint->m_constraintType)\n" +" {\n" +" case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n" +" {\n" +" getInfo2Point2Point(constraint,&info2,bodies);\n" +" break;\n" +" }\n" +" case B3_GPU_FIXED_CONSTRAINT_TYPE:\n" +" {\n" +" getInfo2Point2Point(constraint,&info2,bodies);\n" +" getInfo2FixedOrientation(constraint,&info2,bodies,3);\n" +" break;\n" +" }\n" +" default:\n" +" {\n" +" }\n" +" }\n" +" ///finalize the constraint setup\n" +" for ( j=0;j<info1;j++)\n" +" {\n" +" __global b3SolverConstraint* solverConstraint = ¤tConstraintRow[j];\n" +" if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)\n" +" {\n" +" solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;\n" +" }\n" +" if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)\n" +" {\n" +" solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;\n" +" }\n" +"// solverConstraint->m_originalContactPoint = constraint;\n" +" \n" +" Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;\n" +" {\n" +" //float4 angularFactorA(1,1,1);\n" +" float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;\n" +" solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;\n" +" }\n" +" \n" +" Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;\n" +" {\n" +" float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;\n" +" solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();\n" +" }\n" +" {\n" +" //it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal\n" +" //because it gets multiplied iMJlB\n" +" float4 iMJlA = solverConstraint->m_contactNormal*rbA->m_invMass;\n" +" float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);\n" +" float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?\n" +" float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);\n" +" float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);\n" +" sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);\n" +" sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);\n" +" sum += dot3F4(iMJaB,solverConstraint->m_relpos2CrossNormal);\n" +" float fsum = fabs(sum);\n" +" if (fsum>FLT_EPSILON)\n" +" {\n" +" solverConstraint->m_jacDiagABInv = 1.f/sum;\n" +" } else\n" +" {\n" +" solverConstraint->m_jacDiagABInv = 0.f;\n" +" }\n" +" }\n" +" ///fix rhs\n" +" ///todo: add force/torque accelerators\n" +" {\n" +" float rel_vel;\n" +" float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);\n" +" float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);\n" +" rel_vel = vel1Dotn+vel2Dotn;\n" +" float restitution = 0.f;\n" +" float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2\n" +" float velocityError = restitution - rel_vel * info2.m_damping;\n" +" float penetrationImpulse = positionalError*solverConstraint->m_jacDiagABInv;\n" +" float velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;\n" +" solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;\n" +" solverConstraint->m_appliedImpulse = 0.f;\n" +" }\n" +" }\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl new file mode 100644 index 0000000000..5c4d62e4ec --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl @@ -0,0 +1,501 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +//#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define mymake_float4 (float4) +//#define make_float2 (float2) +//#define make_uint4 (uint4) +//#define make_int4 (int4) +//#define make_uint2 (uint2) +//#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// + + + + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = mymake_float4(a.xyz,0.f); + float4 b1 = mymake_float4(b.xyz,0.f); + return dot(a1, b1); +} + + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = mymake_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + + + + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + + + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + + + + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + + int m_batchIdx; + u32 m_paddings[1]; +} Constraint4; + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1); + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) +{ + *linear = mymake_float4(-n.xyz,0.f); + *angular0 = -cross3(r0, n); + *angular1 = cross3(r1, n); +} + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ); + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) +{ + return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); +} + + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1); + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1) +{ + // linear0,1 are normlized + float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; + float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); + float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; + float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1); + return -1.f/(jmj0+jmj1+jmj2+jmj3); +} + + +void solveContact(__global Constraint4* cs, + float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, + float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB); + +void solveContact(__global Constraint4* cs, + float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, + float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB) +{ + float minRambdaDt = 0; + float maxRambdaDt = FLT_MAX; + + for(int ic=0; ic<4; ic++) + { + if( cs->m_jacCoeffInv[ic] == 0.f ) continue; + + float4 angular0, angular1, linear; + float4 r0 = cs->m_worldPos[ic] - posA; + float4 r1 = cs->m_worldPos[ic] - posB; + setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 ); + + float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, + *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic]; + rambdaDt *= cs->m_jacCoeffInv[ic]; + + { + float prevSum = cs->m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt ); + updated = min2( updated, maxRambdaDt ); + rambdaDt = updated - prevSum; + cs->m_appliedRambdaDt[ic] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + *linVelA += linImp0; + *angVelA += angImp0; + *linVelB += linImp1; + *angVelB += angImp1; + } +} + +void btPlaneSpace1 (const float4* n, float4* p, float4* q); + void btPlaneSpace1 (const float4* n, float4* p, float4* q) +{ + if (fabs(n[0].z) > 0.70710678f) { + // choose p in y-z plane + float a = n[0].y*n[0].y + n[0].z*n[0].z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n[0].z*k; + p[0].z = n[0].y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n[0].x*p[0].z; + q[0].z = n[0].x*p[0].y; + } + else { + // choose p in x-y plane + float a = n[0].x*n[0].x + n[0].y*n[0].y; + float k = 1.f/sqrt(a); + p[0].x = -n[0].y*k; + p[0].y = n[0].x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n[0].z*p[0].y; + q[0].y = n[0].z*p[0].x; + q[0].z = a*k; + } +} + +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs); +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs) +{ + //float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, + posB, &linVelB, &angVelB, invMassB, invInertiaB ); + + if (gBodies[aIdx].m_invMass) + { + gBodies[aIdx].m_linVel = linVelA; + gBodies[aIdx].m_angVel = angVelA; + } else + { + gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0); + + } + if (gBodies[bIdx].m_invMass) + { + gBodies[bIdx].m_linVel = linVelB; + gBodies[bIdx].m_angVel = angVelB; + } else + { + gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0); + + } + +} + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void BatchSolveKernelContact(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + __global int* gN, + __global int* gOffsets, + __global int* batchSizes, + int maxBatch1, + int cellBatch, + int4 nSplit + ) +{ + //__local int ldsBatchIdx[WG_SIZE+1]; + __local int ldsCurBatch; + __local int ldsNextBatch; + __local int ldsStart; + + int lIdx = GET_LOCAL_IDX; + int wgIdx = GET_GROUP_IDX; + +// int gIdx = GET_GLOBAL_IDX; +// debugInfo[gIdx].m_valInt0 = gIdx; + //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; + + + + + int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); + int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y); + + //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1); + //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1); + //int cellIdx = xIdx+yIdx*nSplit; + + if( gN[cellIdx] == 0 ) + return; + + int maxBatch = batchSizes[cellIdx]; + + + const int start = gOffsets[cellIdx]; + const int end = start + gN[cellIdx]; + + + + + if( lIdx == 0 ) + { + ldsCurBatch = 0; + ldsNextBatch = 0; + ldsStart = start; + } + + + GROUP_LDS_BARRIER; + + int idx=ldsStart+lIdx; + while (ldsCurBatch < maxBatch) + { + for(; idx<end; ) + { + if (gConstraints[idx].m_batchIdx == ldsCurBatch) + { + solveContactConstraint( gBodies, gShapes, &gConstraints[idx] ); + + idx+=64; + } else + { + break; + } + } + GROUP_LDS_BARRIER; + + if( lIdx == 0 ) + { + ldsCurBatch++; + } + GROUP_LDS_BARRIER; + } + + +} + + + +__kernel void solveSingleContactKernel(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + int cellIdx, + int batchOffset, + int numConstraintsInBatch + ) +{ + + int index = get_global_id(0); + if (index < numConstraintsInBatch) + { + int idx=batchOffset+index; + solveContactConstraint( gBodies, gShapes, &gConstraints[idx] ); + } +} diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h new file mode 100644 index 0000000000..15a049992b --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h @@ -0,0 +1,393 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solveContactCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define mymake_float4 (float4)\n" +"//#define make_float2 (float2)\n" +"//#define make_uint4 (uint4)\n" +"//#define make_int4 (int4)\n" +"//#define make_uint2 (uint2)\n" +"//#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = mymake_float4(a.xyz,0.f);\n" +" float4 b1 = mymake_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" +"{\n" +" *linear = mymake_float4(-n.xyz,0.f);\n" +" *angular0 = -cross3(r0, n);\n" +" *angular1 = cross3(r1, n);\n" +"}\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" +"{\n" +" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" +"}\n" +"void solveContact(__global Constraint4* cs,\n" +" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" +" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n" +"void solveContact(__global Constraint4* cs,\n" +" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" +" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n" +"{\n" +" float minRambdaDt = 0;\n" +" float maxRambdaDt = FLT_MAX;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = cs->m_worldPos[ic] - posA;\n" +" float4 r1 = cs->m_worldPos[ic] - posB;\n" +" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" +" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" +" *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n" +" rambdaDt *= cs->m_jacCoeffInv[ic];\n" +" {\n" +" float prevSum = cs->m_appliedRambdaDt[ic];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt );\n" +" updated = min2( updated, maxRambdaDt );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_appliedRambdaDt[ic] = updated;\n" +" }\n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" *linVelA += linImp0;\n" +" *angVelA += angImp0;\n" +" *linVelB += linImp1;\n" +" *angVelB += angImp1;\n" +" }\n" +"}\n" +"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" +" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n[0].z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n[0].z*k;\n" +" p[0].z = n[0].y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n[0].x*p[0].z;\n" +" q[0].z = n[0].x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n[0].y*k;\n" +" p[0].y = n[0].x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n[0].z*p[0].y;\n" +" q[0].y = n[0].z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" +"{\n" +" //float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +" posB, &linVelB, &angVelB, invMassB, invInertiaB );\n" +" if (gBodies[aIdx].m_invMass)\n" +" {\n" +" gBodies[aIdx].m_linVel = linVelA;\n" +" gBodies[aIdx].m_angVel = angVelA;\n" +" } else\n" +" {\n" +" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" \n" +" }\n" +" if (gBodies[bIdx].m_invMass)\n" +" {\n" +" gBodies[bIdx].m_linVel = linVelB;\n" +" gBodies[bIdx].m_angVel = angVelB;\n" +" } else\n" +" {\n" +" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" \n" +" }\n" +"}\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void BatchSolveKernelContact(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" __global int* gN,\n" +" __global int* gOffsets,\n" +" __global int* batchSizes,\n" +" int maxBatch1,\n" +" int cellBatch,\n" +" int4 nSplit\n" +" )\n" +"{\n" +" //__local int ldsBatchIdx[WG_SIZE+1];\n" +" __local int ldsCurBatch;\n" +" __local int ldsNextBatch;\n" +" __local int ldsStart;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int wgIdx = GET_GROUP_IDX;\n" +"// int gIdx = GET_GLOBAL_IDX;\n" +"// debugInfo[gIdx].m_valInt0 = gIdx;\n" +" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" +" \n" +" \n" +" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" +" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" +" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" +" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" +" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" +" //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n" +" //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n" +" //int cellIdx = xIdx+yIdx*nSplit;\n" +" \n" +" if( gN[cellIdx] == 0 ) \n" +" return;\n" +" int maxBatch = batchSizes[cellIdx];\n" +" \n" +" \n" +" const int start = gOffsets[cellIdx];\n" +" const int end = start + gN[cellIdx];\n" +" \n" +" \n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch = 0;\n" +" ldsNextBatch = 0;\n" +" ldsStart = start;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" int idx=ldsStart+lIdx;\n" +" while (ldsCurBatch < maxBatch)\n" +" {\n" +" for(; idx<end; )\n" +" {\n" +" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" +" {\n" +" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" idx+=64;\n" +" } else\n" +" {\n" +" break;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch++;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" \n" +" \n" +"}\n" +"__kernel void solveSingleContactKernel(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" int cellIdx,\n" +" int batchOffset,\n" +" int numConstraintsInBatch\n" +" )\n" +"{\n" +" int index = get_global_id(0);\n" +" if (index < numConstraintsInBatch)\n" +" {\n" +" int idx=batchOffset+index;\n" +" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" } \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl new file mode 100644 index 0000000000..1d70fbbae3 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl @@ -0,0 +1,527 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +//#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define mymake_float4 (float4) +//#define make_float2 (float2) +//#define make_uint4 (uint4) +//#define make_int4 (int4) +//#define make_uint2 (uint2) +//#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// + + + + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = mymake_float4(a.xyz,0.f); + float4 b1 = mymake_float4(b.xyz,0.f); + return dot(a1, b1); +} + + + + +__inline +float4 normalize3(const float4 a) +{ + float4 n = mymake_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + + + + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + + + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + + + + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + + + + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + + int m_batchIdx; + u32 m_paddings[1]; +} Constraint4; + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1); + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) +{ + *linear = mymake_float4(-n.xyz,0.f); + *angular0 = -cross3(r0, n); + *angular1 = cross3(r1, n); +} + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ); + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) +{ + return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); +} + + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1); + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1) +{ + // linear0,1 are normlized + float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; + float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); + float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; + float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1); + return -1.f/(jmj0+jmj1+jmj2+jmj3); +} +void btPlaneSpace1 (const float4* n, float4* p, float4* q); + void btPlaneSpace1 (const float4* n, float4* p, float4* q) +{ + if (fabs(n[0].z) > 0.70710678f) { + // choose p in y-z plane + float a = n[0].y*n[0].y + n[0].z*n[0].z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n[0].z*k; + p[0].z = n[0].y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n[0].x*p[0].z; + q[0].z = n[0].x*p[0].y; + } + else { + // choose p in x-y plane + float a = n[0].x*n[0].x + n[0].y*n[0].y; + float k = 1.f/sqrt(a); + p[0].x = -n[0].y*k; + p[0].y = n[0].x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n[0].z*p[0].y; + q[0].y = n[0].z*p[0].x; + q[0].z = a*k; + } +} + + +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs); +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs) +{ + float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=ldsCs[0].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + +// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, +// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt ); + + + { + + __global Constraint4* cs = ldsCs; + + if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return; + const float4 center = cs->m_center; + + float4 n = -cs->m_linear; + + float4 tangent[2]; + btPlaneSpace1(&n,&tangent[0],&tangent[1]); + float4 angular0, angular1, linear; + float4 r0 = center - posA; + float4 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB ); + rambdaDt *= cs->m_fJacCoeffInv[i]; + + { + float prevSum = cs->m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt[i] ); + updated = min2( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs->m_fAppliedRambdaDt[i] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + linVelA += linImp0; + angVelA += angImp0; + linVelB += linImp1; + angVelB += angImp1; + } + { // angular damping for point constraint + float4 ab = normalize3( posB - posA ); + float4 ac = normalize3( center - posA ); + if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = dot3F4( n, angVelA ); + float angNB = dot3F4( n, angVelB ); + + angVelA -= (angNA*0.1f)*n; + angVelB -= (angNB*0.1f)*n; + } + } + } + + + + } + + if (gBodies[aIdx].m_invMass) + { + gBodies[aIdx].m_linVel = linVelA; + gBodies[aIdx].m_angVel = angVelA; + } else + { + gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0); + } + if (gBodies[bIdx].m_invMass) + { + gBodies[bIdx].m_linVel = linVelB; + gBodies[bIdx].m_angVel = angVelB; + } else + { + gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0); + gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0); + } + + +} + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void BatchSolveKernelFriction(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + __global int* gN, + __global int* gOffsets, + __global int* batchSizes, + int maxBatch1, + int cellBatch, + int4 nSplit + ) +{ + //__local int ldsBatchIdx[WG_SIZE+1]; + __local int ldsCurBatch; + __local int ldsNextBatch; + __local int ldsStart; + + int lIdx = GET_LOCAL_IDX; + int wgIdx = GET_GROUP_IDX; + +// int gIdx = GET_GLOBAL_IDX; +// debugInfo[gIdx].m_valInt0 = gIdx; + //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE; + + + int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2); + int remain= (wgIdx%((nSplit.x*nSplit.y)/4)); + int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1); + int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1); + int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y); + + + if( gN[cellIdx] == 0 ) + return; + + int maxBatch = batchSizes[cellIdx]; + + const int start = gOffsets[cellIdx]; + const int end = start + gN[cellIdx]; + + + if( lIdx == 0 ) + { + ldsCurBatch = 0; + ldsNextBatch = 0; + ldsStart = start; + } + + + GROUP_LDS_BARRIER; + + int idx=ldsStart+lIdx; + while (ldsCurBatch < maxBatch) + { + for(; idx<end; ) + { + if (gConstraints[idx].m_batchIdx == ldsCurBatch) + { + + solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] ); + + idx+=64; + } else + { + break; + } + } + GROUP_LDS_BARRIER; + if( lIdx == 0 ) + { + ldsCurBatch++; + } + GROUP_LDS_BARRIER; + } + + +} + + + + + + +__kernel void solveSingleFrictionKernel(__global Body* gBodies, + __global Shape* gShapes, + __global Constraint4* gConstraints, + int cellIdx, + int batchOffset, + int numConstraintsInBatch + ) +{ + + int index = get_global_id(0); + if (index < numConstraintsInBatch) + { + + int idx=batchOffset+index; + + solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] ); + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h new file mode 100644 index 0000000000..eb58674f22 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h @@ -0,0 +1,421 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solveFrictionCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define mymake_float4 (float4)\n" +"//#define make_float2 (float2)\n" +"//#define make_uint4 (uint4)\n" +"//#define make_int4 (int4)\n" +"//#define make_uint2 (uint2)\n" +"//#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = mymake_float4(a.xyz,0.f);\n" +" float4 b1 = mymake_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" +"{\n" +" *linear = mymake_float4(-n.xyz,0.f);\n" +" *angular0 = -cross3(r0, n);\n" +" *angular1 = cross3(r1, n);\n" +"}\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" +"{\n" +" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" +"}\n" +"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n" +" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n[0].z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n[0].z*k;\n" +" p[0].z = n[0].y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n[0].x*p[0].z;\n" +" q[0].z = n[0].x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n[0].y*k;\n" +" p[0].y = n[0].x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n[0].z*p[0].y;\n" +" q[0].y = n[0].z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n" +"{\n" +" float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" \n" +" {\n" +" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" +" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" +" float sum = 0;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" sum +=ldsCs[0].m_appliedRambdaDt[j];\n" +" }\n" +" frictionCoeff = 0.7f;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" maxRambdaDt[j] = frictionCoeff*sum;\n" +" minRambdaDt[j] = -maxRambdaDt[j];\n" +" }\n" +" \n" +"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" +" \n" +" \n" +" {\n" +" \n" +" __global Constraint4* cs = ldsCs;\n" +" \n" +" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" +" const float4 center = cs->m_center;\n" +" \n" +" float4 n = -cs->m_linear;\n" +" \n" +" float4 tangent[2];\n" +" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = center - posA;\n" +" float4 r1 = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" +" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB );\n" +" rambdaDt *= cs->m_fJacCoeffInv[i];\n" +" \n" +" {\n" +" float prevSum = cs->m_fAppliedRambdaDt[i];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt[i] );\n" +" updated = min2( updated, maxRambdaDt[i] );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_fAppliedRambdaDt[i] = updated;\n" +" }\n" +" \n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" \n" +" linVelA += linImp0;\n" +" angVelA += angImp0;\n" +" linVelB += linImp1;\n" +" angVelB += angImp1;\n" +" }\n" +" { // angular damping for point constraint\n" +" float4 ab = normalize3( posB - posA );\n" +" float4 ac = normalize3( center - posA );\n" +" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" +" {\n" +" float angNA = dot3F4( n, angVelA );\n" +" float angNB = dot3F4( n, angVelB );\n" +" \n" +" angVelA -= (angNA*0.1f)*n;\n" +" angVelB -= (angNB*0.1f)*n;\n" +" }\n" +" }\n" +" }\n" +" \n" +" \n" +" }\n" +" if (gBodies[aIdx].m_invMass)\n" +" {\n" +" gBodies[aIdx].m_linVel = linVelA;\n" +" gBodies[aIdx].m_angVel = angVelA;\n" +" } else\n" +" {\n" +" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" }\n" +" if (gBodies[bIdx].m_invMass)\n" +" {\n" +" gBodies[bIdx].m_linVel = linVelB;\n" +" gBodies[bIdx].m_angVel = angVelB;\n" +" } else\n" +" {\n" +" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n" +" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n" +" }\n" +" \n" +"}\n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void BatchSolveKernelFriction(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" __global int* gN,\n" +" __global int* gOffsets,\n" +" __global int* batchSizes,\n" +" int maxBatch1,\n" +" int cellBatch,\n" +" int4 nSplit\n" +" )\n" +"{\n" +" //__local int ldsBatchIdx[WG_SIZE+1];\n" +" __local int ldsCurBatch;\n" +" __local int ldsNextBatch;\n" +" __local int ldsStart;\n" +" int lIdx = GET_LOCAL_IDX;\n" +" int wgIdx = GET_GROUP_IDX;\n" +"// int gIdx = GET_GLOBAL_IDX;\n" +"// debugInfo[gIdx].m_valInt0 = gIdx;\n" +" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n" +" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n" +" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n" +" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n" +" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n" +" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n" +" \n" +" if( gN[cellIdx] == 0 ) \n" +" return;\n" +" int maxBatch = batchSizes[cellIdx];\n" +" const int start = gOffsets[cellIdx];\n" +" const int end = start + gN[cellIdx];\n" +" \n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch = 0;\n" +" ldsNextBatch = 0;\n" +" ldsStart = start;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" int idx=ldsStart+lIdx;\n" +" while (ldsCurBatch < maxBatch)\n" +" {\n" +" for(; idx<end; )\n" +" {\n" +" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n" +" {\n" +" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" idx+=64;\n" +" } else\n" +" {\n" +" break;\n" +" }\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" if( lIdx == 0 )\n" +" {\n" +" ldsCurBatch++;\n" +" }\n" +" GROUP_LDS_BARRIER;\n" +" }\n" +" \n" +" \n" +"}\n" +"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n" +" __global Shape* gShapes,\n" +" __global Constraint4* gConstraints,\n" +" int cellIdx,\n" +" int batchOffset,\n" +" int numConstraintsInBatch\n" +" )\n" +"{\n" +" int index = get_global_id(0);\n" +" if (index < numConstraintsInBatch)\n" +" {\n" +" \n" +" int idx=batchOffset+index;\n" +" \n" +" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n" +" } \n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl new file mode 100644 index 0000000000..8e2de7b5a6 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl @@ -0,0 +1,277 @@ + +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + +#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + +__inline +float fastSqrtf(float f2) +{ + return native_sqrt(f2); +// return sqrt(f2); +} + +__inline +float fastRSqrt(float f2) +{ + return native_rsqrt(f2); +} + +__inline +float fastLength4(float4 v) +{ + return fast_length(v); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +__inline +float sqrtf(float a) +{ +// return sqrt(a); + return native_sqrt(a); +} + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float length3(const float4 a) +{ + return sqrtf(dot3F4(a,a)); +} + +__inline +float dot4(const float4 a, const float4 b) +{ + return dot( a, b ); +} + +// for height +__inline +float dot3w1(const float4 point, const float4 eqn) +{ + return dot3F4(point,eqn) + eqn.w; +} + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + +__inline +float4 normalize4(const float4 a) +{ + float length = sqrtf(dot4(a, a)); + return 1.f/length * a; +} + +__inline +float4 createEquation(const float4 a, const float4 b, const float4 c) +{ + float4 eqn; + float4 ab = b-a; + float4 ac = c-a; + eqn = normalize3( cross3(ab, ac) ); + eqn.w = -dot3F4(eqn,a); + return eqn; +} + + + +#define WG_SIZE 64 + + + + + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + + + + + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + + + +typedef struct +{ + int m_nContacts; + float m_dt; + float m_positionDrift; + float m_positionConstraintCoeff; +} ConstBufferCTC; + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, +int nContacts, +float dt, +float positionDrift, +float positionConstraintCoeff +) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia; + + b3ContactConstraint4_t cs; + + setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], dt, positionDrift, positionConstraintCoeff, + &cs ); + + cs.m_batchIdx = gContact[gIdx].m_batchIdx; + + gConstraintOut[gIdx] = cs; + } +} + + + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h new file mode 100644 index 0000000000..eb1834ee00 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h @@ -0,0 +1,703 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverSetupCL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#ifndef B3_CONTACT_CONSTRAINT5_H\n" +"#define B3_CONTACT_CONSTRAINT5_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3ContactConstraint4 b3ContactConstraint4_t;\n" +"struct b3ContactConstraint4\n" +"{\n" +" b3Float4 m_linear;//normal?\n" +" b3Float4 m_worldPos[4];\n" +" b3Float4 m_center; // friction\n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; // friction\n" +" float m_fAppliedRambdaDt[2]; // friction\n" +" unsigned int m_bodyA;\n" +" unsigned int m_bodyB;\n" +" int m_batchIdx;\n" +" unsigned int m_paddings;\n" +"};\n" +"//inline void setFrictionCoeff(float value) { m_linear[3] = value; }\n" +"inline float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) \n" +"{\n" +" return constraint->m_linear.w; \n" +"}\n" +"#endif //B3_CONTACT_CONSTRAINT5_H\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q);\n" +" void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q)\n" +"{\n" +" if (b3Fabs(n.z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n.y*n.y + n.z*n.z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n.z*k;\n" +" p[0].z = n.y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n.x*p[0].z;\n" +" q[0].z = n.x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n.x*n.x + n.y*n.y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n.y*k;\n" +" p[0].y = n.x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n.z*p[0].y;\n" +" q[0].y = n.z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +" \n" +"void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)\n" +"{\n" +" *linear = b3MakeFloat4(n.x,n.y,n.z,0.f);\n" +" *angular0 = b3Cross3(r0, n);\n" +" *angular1 = -b3Cross3(r1, n);\n" +"}\n" +"float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,\n" +" b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 )\n" +"{\n" +" return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,\n" +" float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n" +"}\n" +"void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,\n" +" b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, \n" +" __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n" +" b3ContactConstraint4_t* dstC )\n" +"{\n" +" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" +" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" +" float dtInv = 1.f/dt;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" +" dstC->m_linear = src->m_worldNormalOnB;\n" +" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" b3Float4 r0 = src->m_worldPosB[ic] - posA;\n" +" b3Float4 r1 = src->m_worldPosB[ic] - posB;\n" +" if( ic >= src->m_worldNormalOnB.w )//npoints\n" +" {\n" +" dstC->m_jacCoeffInv[ic] = 0.f;\n" +" continue;\n" +" }\n" +" float relVelN;\n" +" {\n" +" b3Float4 linear, angular0, angular1;\n" +" setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" +" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" +" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB);\n" +" float e = 0.f;//src->getRestituitionCoeff();\n" +" if( relVelN*relVelN < 0.004f ) e = 0.f;\n" +" dstC->m_b[ic] = e*relVelN;\n" +" //float penetration = src->m_worldPosB[ic].w;\n" +" dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" }\n" +" if( src->m_worldNormalOnB.w > 0 )//npoints\n" +" { // prepare friction\n" +" b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" +" for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" +" center += src->m_worldPosB[i];\n" +" center /= (float)src->m_worldNormalOnB.w;\n" +" b3Float4 tangent[2];\n" +" b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" +" \n" +" b3Float4 r[2];\n" +" r[0] = center - posA;\n" +" r[1] = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" b3Float4 linear, angular0, angular1;\n" +" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" +" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB );\n" +" dstC->m_fAppliedRambdaDt[i] = 0.f;\n" +" }\n" +" dstC->m_center = center;\n" +" }\n" +" for(int i=0; i<4; i++)\n" +" {\n" +" if( i<src->m_worldNormalOnB.w )\n" +" {\n" +" dstC->m_worldPos[i] = src->m_worldPosB[i];\n" +" }\n" +" else\n" +" {\n" +" dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f);\n" +" }\n" +" }\n" +"}\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +" \n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"typedef struct\n" +"{\n" +" int m_nContacts;\n" +" float m_dt;\n" +" float m_positionDrift;\n" +" float m_positionConstraintCoeff;\n" +"} ConstBufferCTC;\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, \n" +"int nContacts,\n" +"float dt,\n" +"float positionDrift,\n" +"float positionConstraintCoeff\n" +")\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" +" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;\n" +" b3ContactConstraint4_t cs;\n" +" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" +" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n" +" &cs );\n" +" \n" +" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" +" gConstraintOut[gIdx] = cs;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl new file mode 100644 index 0000000000..3dc48d4350 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl @@ -0,0 +1,613 @@ +/* +Copyright (c) 2012 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Takahiro Harada + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + +__inline +float fastSqrtf(float f2) +{ + return native_sqrt(f2); +// return sqrt(f2); +} + +__inline +float fastRSqrt(float f2) +{ + return native_rsqrt(f2); +} + +__inline +float fastLength4(float4 v) +{ + return fast_length(v); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +__inline +float sqrtf(float a) +{ +// return sqrt(a); + return native_sqrt(a); +} + +__inline +float4 cross3(float4 a, float4 b) +{ + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float length3(const float4 a) +{ + return sqrtf(dot3F4(a,a)); +} + +__inline +float dot4(const float4 a, const float4 b) +{ + return dot( a, b ); +} + +// for height +__inline +float dot3w1(const float4 point, const float4 eqn) +{ + return dot3F4(point,eqn) + eqn.w; +} + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + +__inline +float4 normalize4(const float4 a) +{ + float length = sqrtf(dot4(a, a)); + return 1.f/length * a; +} + +__inline +float4 createEquation(const float4 a, const float4 b, const float4 c) +{ + float4 eqn; + float4 ab = b-a; + float4 ac = c-a; + eqn = normalize3( cross3(ab, ac) ); + eqn.w = -dot3F4(eqn,a); + return eqn; +} + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + +__inline +Matrix3x3 mtZero(); + +__inline +Matrix3x3 mtIdentity(); + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m); + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b); + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + +__inline +Matrix3x3 mtZero() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(0.f); + m.m_row[1] = (float4)(0.f); + m.m_row[2] = (float4)(0.f); + return m; +} + +__inline +Matrix3x3 mtIdentity() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(1,0,0,0); + m.m_row[1] = (float4)(0,1,0,0); + m.m_row[2] = (float4)(0,0,1,0); + return m; +} + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m) +{ + Matrix3x3 out; + out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); + out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f); + out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f); + return out; +} + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b) +{ + Matrix3x3 transB; + transB = mtTranspose( b ); + Matrix3x3 ans; + // why this doesn't run when 0ing in the for{} + a.m_row[0].w = 0.f; + a.m_row[1].w = 0.f; + a.m_row[2].w = 0.f; + for(int i=0; i<3; i++) + { +// a.m_row[i].w = 0.f; + ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]); + ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]); + ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]); + ans.m_row[i].w = 0.f; + } + return ans; +} + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + + int m_batchIdx; + u32 m_paddings[1]; +} Constraint4; + + + +typedef struct +{ + int m_nConstraints; + int m_start; + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBuffer; + +typedef struct +{ + int m_solveFriction; + int m_maxBatch; // long batch really kills the performance + int m_batchIdx; + int m_nSplit; +// int m_paddings[1]; +} ConstBufferBatchSolve; + + + + + +typedef struct +{ + int m_valInt0; + int m_valInt1; + int m_valInt2; + int m_valInt3; + + float m_val0; + float m_val1; + float m_val2; + float m_val3; +} SolverDebugInfo; + + + + +// others +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb ) +{ + int nContacts = cb.x; + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int srcIdx = sortData[gIdx].y; + out[gIdx] = in[srcIdx]; + } +} + +__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sd; + sd.x = contactsIn[gIdx].m_childIndexB; + sd.y = gIdx; + sortDataOut[gIdx] = sd; + } +} + +__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sdIn; + sdIn = sortDataInOut[gIdx]; + int2 sdOut; + sdOut.x = contactsIn[sdIn.y].m_childIndexA; + sdOut.y = sdIn.y; + sortDataInOut[gIdx] = sdOut; + } +} + +__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sdIn; + sdIn = sortDataInOut[gIdx]; + int2 sdOut; + sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit; + sdOut.y = sdIn.y; + sortDataInOut[gIdx] = sdOut; + } +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int2 sdIn; + sdIn = sortDataInOut[gIdx]; + int2 sdOut; + sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit; + sdOut.y = sdIn.y; + sortDataInOut[gIdx] = sdOut; + } +} + + + + +typedef struct +{ + int m_nContacts; + int m_staticIdx; + float m_scale; + int m_nSplit; +} ConstBufferSSD; + + +__constant const int gridTable4x4[] = +{ + 0,1,17,16, + 1,2,18,19, + 17,18,32,3, + 16,19,3,34 +}; + +__constant const int gridTable8x8[] = +{ + 0, 2, 3, 16, 17, 18, 19, 1, + 66, 64, 80, 67, 82, 81, 65, 83, + 131,144,128,130,147,129,145,146, + 208,195,194,192,193,211,210,209, + 21, 22, 23, 5, 4, 6, 7, 20, + 86, 85, 69, 87, 70, 68, 84, 71, + 151,133,149,150,135,148,132,134, + 197,27,214,213,212,199,198,196 + +}; + + + + +#define USE_SPATIAL_BATCHING 1 +#define USE_4x4_GRID 1 + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, +int nContacts,float scale,int4 nSplit,int staticIdx) + +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit; + int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit; + + int aIdx = abs(aPtrAndSignBit ); + int bIdx = abs(bPtrAndSignBit); + + bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx); + bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx); + +#if USE_SPATIAL_BATCHING + int idx = (aStatic)? bIdx: aIdx; + float4 p = gBodies[idx].m_pos; + int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1); + int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1); + int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1); + int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y); + +#else//USE_SPATIAL_BATCHING + #if USE_4x4_GRID + int aa = aIdx&3; + int bb = bIdx&3; + if (aStatic) + aa = bb; + if (bStatic) + bb = aa; + + int gridIndex = aa + bb*4; + int newIndex = gridTable4x4[gridIndex]; + #else//USE_4x4_GRID + int aa = aIdx&7; + int bb = bIdx&7; + if (aStatic) + aa = bb; + if (bStatic) + bb = aa; + + int gridIndex = aa + bb*8; + int newIndex = gridTable8x8[gridIndex]; + #endif//USE_4x4_GRID +#endif//USE_SPATIAL_BATCHING + + + gSortDataOut[gIdx].x = newIndex; + gSortDataOut[gIdx].y = gIdx; + } + else + { + gSortDataOut[gIdx].x = 0xffffffff; + } +} + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb ) +{ + int gIdx = GET_GLOBAL_IDX; + if( gIdx < cb.x ) + { + gOut[gIdx] = gIn[gIdx]; + } +} + + + diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h new file mode 100644 index 0000000000..1b5819f6cf --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h @@ -0,0 +1,601 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverSetup2CL= \ +"/*\n" +"Copyright (c) 2012 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Takahiro Harada\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a, float4 b)\n" +"{\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"Matrix3x3 mtZero();\n" +"__inline\n" +"Matrix3x3 mtIdentity();\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m);\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"Matrix3x3 mtZero()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(0.f);\n" +" m.m_row[1] = (float4)(0.f);\n" +" m.m_row[2] = (float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtIdentity()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(1,0,0,0);\n" +" m.m_row[1] = (float4)(0,1,0,0);\n" +" m.m_row[2] = (float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m)\n" +"{\n" +" Matrix3x3 out;\n" +" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" +"{\n" +" Matrix3x3 transB;\n" +" transB = mtTranspose( b );\n" +" Matrix3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings[1];\n" +"} Constraint4;\n" +"typedef struct\n" +"{\n" +" int m_nConstraints;\n" +" int m_start;\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBuffer;\n" +"typedef struct\n" +"{\n" +" int m_solveFriction;\n" +" int m_maxBatch; // long batch really kills the performance\n" +" int m_batchIdx;\n" +" int m_nSplit;\n" +"// int m_paddings[1];\n" +"} ConstBufferBatchSolve;\n" +" \n" +"typedef struct \n" +"{\n" +" int m_valInt0;\n" +" int m_valInt1;\n" +" int m_valInt2;\n" +" int m_valInt3;\n" +" float m_val0;\n" +" float m_val1;\n" +" float m_val2;\n" +" float m_val3;\n" +"} SolverDebugInfo;\n" +"// others\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n" +"{\n" +" int nContacts = cb.x;\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int srcIdx = sortData[gIdx].y;\n" +" out[gIdx] = in[srcIdx];\n" +" }\n" +"}\n" +"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sd;\n" +" sd.x = contactsIn[gIdx].m_childIndexB;\n" +" sd.y = gIdx;\n" +" sortDataOut[gIdx] = sd;\n" +" }\n" +"}\n" +"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sdIn;\n" +" sdIn = sortDataInOut[gIdx];\n" +" int2 sdOut;\n" +" sdOut.x = contactsIn[sdIn.y].m_childIndexA;\n" +" sdOut.y = sdIn.y;\n" +" sortDataInOut[gIdx] = sdOut;\n" +" }\n" +"}\n" +"__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sdIn;\n" +" sdIn = sortDataInOut[gIdx];\n" +" int2 sdOut;\n" +" sdOut.x = contactsIn[sdIn.y].m_bodyAPtrAndSignBit;\n" +" sdOut.y = sdIn.y;\n" +" sortDataInOut[gIdx] = sdOut;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < nContacts )\n" +" {\n" +" int2 sdIn;\n" +" sdIn = sortDataInOut[gIdx];\n" +" int2 sdOut;\n" +" sdOut.x = contactsIn[sdIn.y].m_bodyBPtrAndSignBit;\n" +" sdOut.y = sdIn.y;\n" +" sortDataInOut[gIdx] = sdOut;\n" +" }\n" +"}\n" +"typedef struct\n" +"{\n" +" int m_nContacts;\n" +" int m_staticIdx;\n" +" float m_scale;\n" +" int m_nSplit;\n" +"} ConstBufferSSD;\n" +"__constant const int gridTable4x4[] = \n" +"{\n" +" 0,1,17,16,\n" +" 1,2,18,19,\n" +" 17,18,32,3,\n" +" 16,19,3,34\n" +"};\n" +"__constant const int gridTable8x8[] = \n" +"{\n" +" 0, 2, 3, 16, 17, 18, 19, 1,\n" +" 66, 64, 80, 67, 82, 81, 65, 83,\n" +" 131,144,128,130,147,129,145,146,\n" +" 208,195,194,192,193,211,210,209,\n" +" 21, 22, 23, 5, 4, 6, 7, 20,\n" +" 86, 85, 69, 87, 70, 68, 84, 71,\n" +" 151,133,149,150,135,148,132,134,\n" +" 197,27,214,213,212,199,198,196\n" +" \n" +"};\n" +"#define USE_SPATIAL_BATCHING 1\n" +"#define USE_4x4_GRID 1\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n" +"int nContacts,float scale,int4 nSplit,int staticIdx)\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;\n" +" int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;\n" +" int aIdx = abs(aPtrAndSignBit );\n" +" int bIdx = abs(bPtrAndSignBit);\n" +" bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n" +" bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n" +"#if USE_SPATIAL_BATCHING \n" +" int idx = (aStatic)? bIdx: aIdx;\n" +" float4 p = gBodies[idx].m_pos;\n" +" int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (nSplit.x-1);\n" +" int yIdx = (int)((p.y-((p.y<0.f)?1.f:0.f))*scale) & (nSplit.y-1);\n" +" int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (nSplit.z-1);\n" +" int newIndex = (xIdx+yIdx*nSplit.x+zIdx*nSplit.x*nSplit.y);\n" +" \n" +"#else//USE_SPATIAL_BATCHING\n" +" #if USE_4x4_GRID\n" +" int aa = aIdx&3;\n" +" int bb = bIdx&3;\n" +" if (aStatic)\n" +" aa = bb;\n" +" if (bStatic)\n" +" bb = aa;\n" +" int gridIndex = aa + bb*4;\n" +" int newIndex = gridTable4x4[gridIndex];\n" +" #else//USE_4x4_GRID\n" +" int aa = aIdx&7;\n" +" int bb = bIdx&7;\n" +" if (aStatic)\n" +" aa = bb;\n" +" if (bStatic)\n" +" bb = aa;\n" +" int gridIndex = aa + bb*8;\n" +" int newIndex = gridTable8x8[gridIndex];\n" +" #endif//USE_4x4_GRID\n" +"#endif//USE_SPATIAL_BATCHING\n" +" gSortDataOut[gIdx].x = newIndex;\n" +" gSortDataOut[gIdx].y = gIdx;\n" +" }\n" +" else\n" +" {\n" +" gSortDataOut[gIdx].x = 0xffffffff;\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" if( gIdx < cb.x )\n" +" {\n" +" gOut[gIdx] = gIn[gIdx];\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl new file mode 100644 index 0000000000..a21a08c3b4 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl @@ -0,0 +1,968 @@ +/* +Copyright (c) 2013 Advanced Micro Devices, Inc. + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Originally written by Erwin Coumans + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h" + +#pragma OPENCL EXTENSION cl_amd_printf : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + + +#ifdef cl_ext_atomic_counters_32 +#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable +#else +#define counter32_t volatile global int* +#endif + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define GET_GROUP_IDX get_group_id(0) +#define GET_LOCAL_IDX get_local_id(0) +#define GET_GLOBAL_IDX get_global_id(0) +#define GET_GROUP_SIZE get_local_size(0) +#define GET_NUM_GROUPS get_num_groups(0) +#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE) +#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE) +#define AtomInc(x) atom_inc(&(x)) +#define AtomInc1(x, out) out = atom_inc(&(x)) +#define AppendInc(x, out) out = atomic_inc(x) +#define AtomAdd(x, value) atom_add(&(x), value) +#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value ) +#define AtomXhg(x, value) atom_xchg ( &(x), value ) + + +#define SELECT_UINT4( b, a, condition ) select( b,a,condition ) + +#define make_float4 (float4) +#define make_float2 (float2) +#define make_uint4 (uint4) +#define make_int4 (int4) +#define make_uint2 (uint2) +#define make_int2 (int2) + + +#define max2 max +#define min2 min + + +/////////////////////////////////////// +// Vector +/////////////////////////////////////// +__inline +float fastDiv(float numerator, float denominator) +{ + return native_divide(numerator, denominator); +// return numerator/denominator; +} + +__inline +float4 fastDiv4(float4 numerator, float4 denominator) +{ + return native_divide(numerator, denominator); +} + +__inline +float fastSqrtf(float f2) +{ + return native_sqrt(f2); +// return sqrt(f2); +} + +__inline +float fastRSqrt(float f2) +{ + return native_rsqrt(f2); +} + +__inline +float fastLength4(float4 v) +{ + return fast_length(v); +} + +__inline +float4 fastNormalize4(float4 v) +{ + return fast_normalize(v); +} + + +__inline +float sqrtf(float a) +{ +// return sqrt(a); + return native_sqrt(a); +} + +__inline +float4 cross3(float4 a1, float4 b1) +{ + + float4 a=make_float4(a1.xyz,0.f); + float4 b=make_float4(b1.xyz,0.f); + //float4 a=a1; + //float4 b=b1; + return cross(a,b); +} + +__inline +float dot3F4(float4 a, float4 b) +{ + float4 a1 = make_float4(a.xyz,0.f); + float4 b1 = make_float4(b.xyz,0.f); + return dot(a1, b1); +} + +__inline +float length3(const float4 a) +{ + return sqrtf(dot3F4(a,a)); +} + +__inline +float dot4(const float4 a, const float4 b) +{ + return dot( a, b ); +} + +// for height +__inline +float dot3w1(const float4 point, const float4 eqn) +{ + return dot3F4(point,eqn) + eqn.w; +} + +__inline +float4 normalize3(const float4 a) +{ + float4 n = make_float4(a.x, a.y, a.z, 0.f); + return fastNormalize4( n ); +// float length = sqrtf(dot3F4(a, a)); +// return 1.f/length * a; +} + +__inline +float4 normalize4(const float4 a) +{ + float length = sqrtf(dot4(a, a)); + return 1.f/length * a; +} + +__inline +float4 createEquation(const float4 a, const float4 b, const float4 c) +{ + float4 eqn; + float4 ab = b-a; + float4 ac = c-a; + eqn = normalize3( cross3(ab, ac) ); + eqn.w = -dot3F4(eqn,a); + return eqn; +} + +/////////////////////////////////////// +// Matrix3x3 +/////////////////////////////////////// + +typedef struct +{ + float4 m_row[3]; +}Matrix3x3; + +__inline +Matrix3x3 mtZero(); + +__inline +Matrix3x3 mtIdentity(); + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m); + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b); + +__inline +float4 mtMul1(Matrix3x3 a, float4 b); + +__inline +float4 mtMul3(float4 a, Matrix3x3 b); + +__inline +Matrix3x3 mtZero() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(0.f); + m.m_row[1] = (float4)(0.f); + m.m_row[2] = (float4)(0.f); + return m; +} + +__inline +Matrix3x3 mtIdentity() +{ + Matrix3x3 m; + m.m_row[0] = (float4)(1,0,0,0); + m.m_row[1] = (float4)(0,1,0,0); + m.m_row[2] = (float4)(0,0,1,0); + return m; +} + +__inline +Matrix3x3 mtTranspose(Matrix3x3 m) +{ + Matrix3x3 out; + out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f); + out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f); + out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f); + return out; +} + +__inline +Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b) +{ + Matrix3x3 transB; + transB = mtTranspose( b ); + Matrix3x3 ans; + // why this doesn't run when 0ing in the for{} + a.m_row[0].w = 0.f; + a.m_row[1].w = 0.f; + a.m_row[2].w = 0.f; + for(int i=0; i<3; i++) + { +// a.m_row[i].w = 0.f; + ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]); + ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]); + ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]); + ans.m_row[i].w = 0.f; + } + return ans; +} + +__inline +float4 mtMul1(Matrix3x3 a, float4 b) +{ + float4 ans; + ans.x = dot3F4( a.m_row[0], b ); + ans.y = dot3F4( a.m_row[1], b ); + ans.z = dot3F4( a.m_row[2], b ); + ans.w = 0.f; + return ans; +} + +__inline +float4 mtMul3(float4 a, Matrix3x3 b) +{ + float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0); + float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0); + float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0); + + float4 ans; + ans.x = dot3F4( a, colx ); + ans.y = dot3F4( a, coly ); + ans.z = dot3F4( a, colz ); + return ans; +} + +/////////////////////////////////////// +// Quaternion +/////////////////////////////////////// + +typedef float4 Quaternion; + +__inline +Quaternion qtMul(Quaternion a, Quaternion b); + +__inline +Quaternion qtNormalize(Quaternion in); + +__inline +float4 qtRotate(Quaternion q, float4 vec); + +__inline +Quaternion qtInvert(Quaternion q); + + + + + +__inline +Quaternion qtMul(Quaternion a, Quaternion b) +{ + Quaternion ans; + ans = cross3( a, b ); + ans += a.w*b+b.w*a; +// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z); + ans.w = a.w*b.w - dot3F4(a, b); + return ans; +} + +__inline +Quaternion qtNormalize(Quaternion in) +{ + return fastNormalize4(in); +// in /= length( in ); +// return in; +} +__inline +float4 qtRotate(Quaternion q, float4 vec) +{ + Quaternion qInv = qtInvert( q ); + float4 vcpy = vec; + vcpy.w = 0.f; + float4 out = qtMul(qtMul(q,vcpy),qInv); + return out; +} + +__inline +Quaternion qtInvert(Quaternion q) +{ + return (Quaternion)(-q.xyz, q.w); +} + +__inline +float4 qtInvRotate(const Quaternion q, float4 vec) +{ + return qtRotate( qtInvert( q ), vec ); +} + + + + +#define WG_SIZE 64 + +typedef struct +{ + float4 m_pos; + Quaternion m_quat; + float4 m_linVel; + float4 m_angVel; + + u32 m_shapeIdx; + float m_invMass; + float m_restituitionCoeff; + float m_frictionCoeff; +} Body; + + + +typedef struct +{ + Matrix3x3 m_invInertia; + Matrix3x3 m_initInvInertia; +} Shape; + +typedef struct +{ + float4 m_linear; + float4 m_worldPos[4]; + float4 m_center; + float m_jacCoeffInv[4]; + float m_b[4]; + float m_appliedRambdaDt[4]; + + float m_fJacCoeffInv[2]; + float m_fAppliedRambdaDt[2]; + + u32 m_bodyA; + u32 m_bodyB; + int m_batchIdx; + u32 m_paddings; +} Constraint4; + + + + + + +__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex) +{ + int i = GET_GLOBAL_IDX; + + if( i < numContactManifolds) + { + int pa = manifoldPtr[i].m_bodyAPtrAndSignBit; + bool isFixedA = (pa <0) || (pa == fixedBodyIndex); + int bodyIndexA = abs(pa); + if (!isFixedA) + { + AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x); + } + barrier(CLK_GLOBAL_MEM_FENCE); + int pb = manifoldPtr[i].m_bodyBPtrAndSignBit; + bool isFixedB = (pb <0) || (pb == fixedBodyIndex); + int bodyIndexB = abs(pb); + if (!isFixedB) + { + AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y); + } + } +} + +__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies) +{ + int i = GET_GLOBAL_IDX; + + if( i < numSplitBodies) + { + linearVelocities[i] = make_float4(0); + angularVelocities[i] = make_float4(0); + } +} + + +__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount, +__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies) +{ + int i = GET_GLOBAL_IDX; + if (i<numBodies) + { + if (gBodies[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + float factor = 1.f/((float)count); + float4 averageLinVel = make_float4(0.f); + float4 averageAngVel = make_float4(0.f); + + for (int j=0;j<count;j++) + { + averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor; + averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor; + } + + for (int j=0;j<count;j++) + { + deltaLinearVelocities[bodyOffset+j] = averageLinVel; + deltaAngularVelocities[bodyOffset+j] = averageAngVel; + } + + }//bodies[i].m_invMass + }//i<numBodies +} + + + +void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1) +{ + *linear = make_float4(n.xyz,0.f); + *angular0 = cross3(r0, n); + *angular1 = -cross3(r1, n); +} + + +float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 ) +{ + return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1); +} + + +float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1, + float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB) +{ + // linear0,1 are normlized + float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0; + float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0); + float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1; + float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1); + return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB); +} + + +void btPlaneSpace1 (float4 n, float4* p, float4* q); + void btPlaneSpace1 (float4 n, float4* p, float4* q) +{ + if (fabs(n.z) > 0.70710678f) { + // choose p in y-z plane + float a = n.y*n.y + n.z*n.z; + float k = 1.f/sqrt(a); + p[0].x = 0; + p[0].y = -n.z*k; + p[0].z = n.y*k; + // set q = n x p + q[0].x = a*k; + q[0].y = -n.x*p[0].z; + q[0].z = n.x*p[0].y; + } + else { + // choose p in x-y plane + float a = n.x*n.x + n.y*n.y; + float k = 1.f/sqrt(a); + p[0].x = -n.y*k; + p[0].y = n.x*k; + p[0].z = 0; + // set q = n x p + q[0].x = -n.z*p[0].y; + q[0].y = n.z*p[0].x; + q[0].z = a*k; + } +} + + + + + +void solveContact(__global Constraint4* cs, + float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA, + float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB, + float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB) +{ + float minRambdaDt = 0; + float maxRambdaDt = FLT_MAX; + + for(int ic=0; ic<4; ic++) + { + if( cs->m_jacCoeffInv[ic] == 0.f ) continue; + + float4 angular0, angular1, linear; + float4 r0 = cs->m_worldPos[ic] - posA; + float4 r1 = cs->m_worldPos[ic] - posB; + setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 ); + + + + float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, + *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic]; + rambdaDt *= cs->m_jacCoeffInv[ic]; + + + { + float prevSum = cs->m_appliedRambdaDt[ic]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt ); + updated = min2( updated, maxRambdaDt ); + rambdaDt = updated - prevSum; + cs->m_appliedRambdaDt[ic] = updated; + } + + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + + if (invMassA) + { + *dLinVelA += linImp0; + *dAngVelA += angImp0; + } + if (invMassB) + { + *dLinVelB += linImp1; + *dAngVelB += angImp1; + } + } +} + + +// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities); + + +void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, +__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, +__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities) +{ + + //float frictionCoeff = ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + + float4 dLinVelA = make_float4(0,0,0,0); + float4 dAngVelA = make_float4(0,0,0,0); + float4 dLinVelB = make_float4(0,0,0,0); + float4 dAngVelB = make_float4(0,0,0,0); + + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[0].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + + if (invMassA) + { + dLinVelA = deltaLinearVelocities[splitIndexA]; + dAngVelA = deltaAngularVelocities[splitIndexA]; + } + + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[0].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + + if (invMassB) + { + dLinVelB = deltaLinearVelocities[splitIndexB]; + dAngVelB = deltaAngularVelocities[splitIndexB]; + } + + solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, + posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB); + + if (invMassA) + { + deltaLinearVelocities[splitIndexA] = dLinVelA; + deltaAngularVelocities[splitIndexA] = dAngVelA; + } + if (invMassB) + { + deltaLinearVelocities[splitIndexB] = dLinVelB; + deltaAngularVelocities[splitIndexB] = dAngVelB; + } + +} + + +__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes , +__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, +float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds +) +{ + int i = GET_GLOBAL_IDX; + if (i<numManifolds) + { + solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities); + } +} + + + + +void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, + __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, + __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities) +{ + float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w; + int aIdx = ldsCs[0].m_bodyA; + int bIdx = ldsCs[0].m_bodyB; + + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + + float4 dLinVelA = make_float4(0,0,0,0); + float4 dAngVelA = make_float4(0,0,0,0); + float4 dLinVelB = make_float4(0,0,0,0); + float4 dAngVelB = make_float4(0,0,0,0); + + int bodyOffsetA = offsetSplitBodies[aIdx]; + int constraintOffsetA = contactConstraintOffsets[0].x; + int splitIndexA = bodyOffsetA+constraintOffsetA; + + if (invMassA) + { + dLinVelA = deltaLinearVelocities[splitIndexA]; + dAngVelA = deltaAngularVelocities[splitIndexA]; + } + + int bodyOffsetB = offsetSplitBodies[bIdx]; + int constraintOffsetB = contactConstraintOffsets[0].y; + int splitIndexB= bodyOffsetB+constraintOffsetB; + + if (invMassB) + { + dLinVelB = deltaLinearVelocities[splitIndexB]; + dAngVelB = deltaAngularVelocities[splitIndexB]; + } + + + + + { + float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + float minRambdaDt[4] = {0.f,0.f,0.f,0.f}; + + float sum = 0; + for(int j=0; j<4; j++) + { + sum +=ldsCs[0].m_appliedRambdaDt[j]; + } + frictionCoeff = 0.7f; + for(int j=0; j<4; j++) + { + maxRambdaDt[j] = frictionCoeff*sum; + minRambdaDt[j] = -maxRambdaDt[j]; + } + + +// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA, +// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt ); + + + { + + __global Constraint4* cs = ldsCs; + + if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return; + const float4 center = cs->m_center; + + float4 n = -cs->m_linear; + + float4 tangent[2]; + btPlaneSpace1(n,&tangent[0],&tangent[1]); + float4 angular0, angular1, linear; + float4 r0 = center - posA; + float4 r1 = center - posB; + for(int i=0; i<2; i++) + { + setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 ); + float rambdaDt = calcRelVel(linear, -linear, angular0, angular1, + linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB ); + rambdaDt *= cs->m_fJacCoeffInv[i]; + + { + float prevSum = cs->m_fAppliedRambdaDt[i]; + float updated = prevSum; + updated += rambdaDt; + updated = max2( updated, minRambdaDt[i] ); + updated = min2( updated, maxRambdaDt[i] ); + rambdaDt = updated - prevSum; + cs->m_fAppliedRambdaDt[i] = updated; + } + + float4 linImp0 = invMassA*linear*rambdaDt; + float4 linImp1 = invMassB*(-linear)*rambdaDt; + float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt; + float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt; + + dLinVelA += linImp0; + dAngVelA += angImp0; + dLinVelB += linImp1; + dAngVelB += angImp1; + } + { // angular damping for point constraint + float4 ab = normalize3( posB - posA ); + float4 ac = normalize3( center - posA ); + if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f)) + { + float angNA = dot3F4( n, angVelA ); + float angNB = dot3F4( n, angVelB ); + + dAngVelA -= (angNA*0.1f)*n; + dAngVelB -= (angNB*0.1f)*n; + } + } + } + + + + } + + if (invMassA) + { + deltaLinearVelocities[splitIndexA] = dLinVelA; + deltaAngularVelocities[splitIndexA] = dAngVelA; + } + if (invMassB) + { + deltaLinearVelocities[splitIndexB] = dLinVelB; + deltaAngularVelocities[splitIndexB] = dAngVelB; + } + + +} + + +__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes , + __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies, + __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, + float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds +) +{ + int i = GET_GLOBAL_IDX; + if (i<numManifolds) + { + solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities); + } +} + + +__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount, + __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies) +{ + int i = GET_GLOBAL_IDX; + if (i<numBodies) + { + if (gBodies[i].m_invMass) + { + int bodyOffset = offsetSplitBodies[i]; + int count = bodyCount[i]; + if (count) + { + gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset]; + gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset]; + } + } + } +} + + + +void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA, + const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, + __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB, + Constraint4* dstC ) +{ + dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit); + dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit); + + float dtInv = 1.f/dt; + for(int ic=0; ic<4; ic++) + { + dstC->m_appliedRambdaDt[ic] = 0.f; + } + dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f; + + + dstC->m_linear = src->m_worldNormalOnB; + dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() ); + for(int ic=0; ic<4; ic++) + { + float4 r0 = src->m_worldPosB[ic] - posA; + float4 r1 = src->m_worldPosB[ic] - posB; + + if( ic >= src->m_worldNormalOnB.w )//npoints + { + dstC->m_jacCoeffInv[ic] = 0.f; + continue; + } + + float relVelN; + { + float4 linear, angular0, angular1; + setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1); + + dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB); + + relVelN = calcRelVel(linear, -linear, angular0, angular1, + linVelA, angVelA, linVelB, angVelB); + + float e = 0.f;//src->getRestituitionCoeff(); + if( relVelN*relVelN < 0.004f ) e = 0.f; + + dstC->m_b[ic] = e*relVelN; + //float penetration = src->m_worldPosB[ic].w; + dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv; + dstC->m_appliedRambdaDt[ic] = 0.f; + } + } + + if( src->m_worldNormalOnB.w > 0 )//npoints + { // prepare friction + float4 center = make_float4(0.f); + for(int i=0; i<src->m_worldNormalOnB.w; i++) + center += src->m_worldPosB[i]; + center /= (float)src->m_worldNormalOnB.w; + + float4 tangent[2]; + btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]); + + float4 r[2]; + r[0] = center - posA; + r[1] = center - posB; + + for(int i=0; i<2; i++) + { + float4 linear, angular0, angular1; + setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1); + + dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1, + invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB); + dstC->m_fAppliedRambdaDt[i] = 0.f; + } + dstC->m_center = center; + } + + for(int i=0; i<4; i++) + { + if( i<src->m_worldNormalOnB.w ) + { + dstC->m_worldPos[i] = src->m_worldPosB[i]; + } + else + { + dstC->m_worldPos[i] = make_float4(0.f); + } + } +} + + +__kernel +__attribute__((reqd_work_group_size(WG_SIZE,1,1))) +void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, +__global const unsigned int* bodyCount, +int nContacts, +float dt, +float positionDrift, +float positionConstraintCoeff +) +{ + int gIdx = GET_GLOBAL_IDX; + + if( gIdx < nContacts ) + { + int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit); + int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit); + + float4 posA = gBodies[aIdx].m_pos; + float4 linVelA = gBodies[aIdx].m_linVel; + float4 angVelA = gBodies[aIdx].m_angVel; + float invMassA = gBodies[aIdx].m_invMass; + Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia; + + float4 posB = gBodies[bIdx].m_pos; + float4 linVelB = gBodies[bIdx].m_linVel; + float4 angVelB = gBodies[bIdx].m_angVel; + float invMassB = gBodies[bIdx].m_invMass; + Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia; + + Constraint4 cs; + + float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1; + float countB = invMassB != 0.f ? (float)bodyCount[bIdx] : 1; + + setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB, + &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB, + &cs ); + + cs.m_batchIdx = gContact[gIdx].m_batchIdx; + + gConstraintOut[gIdx] = cs; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h new file mode 100644 index 0000000000..c0173ad9f4 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h @@ -0,0 +1,909 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* solverUtilsCL= \ +"/*\n" +"Copyright (c) 2013 Advanced Micro Devices, Inc. \n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose, \n" +"including commercial applications, and to alter it and redistribute it freely, \n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Originally written by Erwin Coumans\n" +"#ifndef B3_CONTACT4DATA_H\n" +"#define B3_CONTACT4DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"typedef struct b3Contact4Data b3Contact4Data_t;\n" +"struct b3Contact4Data\n" +"{\n" +" b3Float4 m_worldPosB[4];\n" +"// b3Float4 m_localPosA[4];\n" +"// b3Float4 m_localPosB[4];\n" +" b3Float4 m_worldNormalOnB; // w: m_nPoints\n" +" unsigned short m_restituitionCoeffCmp;\n" +" unsigned short m_frictionCoeffCmp;\n" +" int m_batchIdx;\n" +" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n" +" int m_bodyBPtrAndSignBit;\n" +" int m_childIndexA;\n" +" int m_childIndexB;\n" +" int m_unused1;\n" +" int m_unused2;\n" +"};\n" +"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n" +"{\n" +" return (int)contact->m_worldNormalOnB.w;\n" +"};\n" +"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n" +"{\n" +" contact->m_worldNormalOnB.w = (float)numPoints;\n" +"};\n" +"#endif //B3_CONTACT4DATA_H\n" +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n" +"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" +"#ifdef cl_ext_atomic_counters_32\n" +"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n" +"#else\n" +"#define counter32_t volatile global int*\n" +"#endif\n" +"typedef unsigned int u32;\n" +"typedef unsigned short u16;\n" +"typedef unsigned char u8;\n" +"#define GET_GROUP_IDX get_group_id(0)\n" +"#define GET_LOCAL_IDX get_local_id(0)\n" +"#define GET_GLOBAL_IDX get_global_id(0)\n" +"#define GET_GROUP_SIZE get_local_size(0)\n" +"#define GET_NUM_GROUPS get_num_groups(0)\n" +"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n" +"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n" +"#define AtomInc(x) atom_inc(&(x))\n" +"#define AtomInc1(x, out) out = atom_inc(&(x))\n" +"#define AppendInc(x, out) out = atomic_inc(x)\n" +"#define AtomAdd(x, value) atom_add(&(x), value)\n" +"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n" +"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n" +"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n" +"#define make_float4 (float4)\n" +"#define make_float2 (float2)\n" +"#define make_uint4 (uint4)\n" +"#define make_int4 (int4)\n" +"#define make_uint2 (uint2)\n" +"#define make_int2 (int2)\n" +"#define max2 max\n" +"#define min2 min\n" +"///////////////////////////////////////\n" +"// Vector\n" +"///////////////////////////////////////\n" +"__inline\n" +"float fastDiv(float numerator, float denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"// return numerator/denominator; \n" +"}\n" +"__inline\n" +"float4 fastDiv4(float4 numerator, float4 denominator)\n" +"{\n" +" return native_divide(numerator, denominator); \n" +"}\n" +"__inline\n" +"float fastSqrtf(float f2)\n" +"{\n" +" return native_sqrt(f2);\n" +"// return sqrt(f2);\n" +"}\n" +"__inline\n" +"float fastRSqrt(float f2)\n" +"{\n" +" return native_rsqrt(f2);\n" +"}\n" +"__inline\n" +"float fastLength4(float4 v)\n" +"{\n" +" return fast_length(v);\n" +"}\n" +"__inline\n" +"float4 fastNormalize4(float4 v)\n" +"{\n" +" return fast_normalize(v);\n" +"}\n" +"__inline\n" +"float sqrtf(float a)\n" +"{\n" +"// return sqrt(a);\n" +" return native_sqrt(a);\n" +"}\n" +"__inline\n" +"float4 cross3(float4 a1, float4 b1)\n" +"{\n" +" float4 a=make_float4(a1.xyz,0.f);\n" +" float4 b=make_float4(b1.xyz,0.f);\n" +" //float4 a=a1;\n" +" //float4 b=b1;\n" +" return cross(a,b);\n" +"}\n" +"__inline\n" +"float dot3F4(float4 a, float4 b)\n" +"{\n" +" float4 a1 = make_float4(a.xyz,0.f);\n" +" float4 b1 = make_float4(b.xyz,0.f);\n" +" return dot(a1, b1);\n" +"}\n" +"__inline\n" +"float length3(const float4 a)\n" +"{\n" +" return sqrtf(dot3F4(a,a));\n" +"}\n" +"__inline\n" +"float dot4(const float4 a, const float4 b)\n" +"{\n" +" return dot( a, b );\n" +"}\n" +"// for height\n" +"__inline\n" +"float dot3w1(const float4 point, const float4 eqn)\n" +"{\n" +" return dot3F4(point,eqn) + eqn.w;\n" +"}\n" +"__inline\n" +"float4 normalize3(const float4 a)\n" +"{\n" +" float4 n = make_float4(a.x, a.y, a.z, 0.f);\n" +" return fastNormalize4( n );\n" +"// float length = sqrtf(dot3F4(a, a));\n" +"// return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 normalize4(const float4 a)\n" +"{\n" +" float length = sqrtf(dot4(a, a));\n" +" return 1.f/length * a;\n" +"}\n" +"__inline\n" +"float4 createEquation(const float4 a, const float4 b, const float4 c)\n" +"{\n" +" float4 eqn;\n" +" float4 ab = b-a;\n" +" float4 ac = c-a;\n" +" eqn = normalize3( cross3(ab, ac) );\n" +" eqn.w = -dot3F4(eqn,a);\n" +" return eqn;\n" +"}\n" +"///////////////////////////////////////\n" +"// Matrix3x3\n" +"///////////////////////////////////////\n" +"typedef struct\n" +"{\n" +" float4 m_row[3];\n" +"}Matrix3x3;\n" +"__inline\n" +"Matrix3x3 mtZero();\n" +"__inline\n" +"Matrix3x3 mtIdentity();\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m);\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b);\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b);\n" +"__inline\n" +"Matrix3x3 mtZero()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(0.f);\n" +" m.m_row[1] = (float4)(0.f);\n" +" m.m_row[2] = (float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtIdentity()\n" +"{\n" +" Matrix3x3 m;\n" +" m.m_row[0] = (float4)(1,0,0,0);\n" +" m.m_row[1] = (float4)(0,1,0,0);\n" +" m.m_row[2] = (float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtTranspose(Matrix3x3 m)\n" +"{\n" +" Matrix3x3 out;\n" +" out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n" +"{\n" +" Matrix3x3 transB;\n" +" transB = mtTranspose( b );\n" +" Matrix3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul1(Matrix3x3 a, float4 b)\n" +"{\n" +" float4 ans;\n" +" ans.x = dot3F4( a.m_row[0], b );\n" +" ans.y = dot3F4( a.m_row[1], b );\n" +" ans.z = dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"float4 mtMul3(float4 a, Matrix3x3 b)\n" +"{\n" +" float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" float4 ans;\n" +" ans.x = dot3F4( a, colx );\n" +" ans.y = dot3F4( a, coly );\n" +" ans.z = dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"///////////////////////////////////////\n" +"// Quaternion\n" +"///////////////////////////////////////\n" +"typedef float4 Quaternion;\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b);\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in);\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec);\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q);\n" +"__inline\n" +"Quaternion qtMul(Quaternion a, Quaternion b)\n" +"{\n" +" Quaternion ans;\n" +" ans = cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"__inline\n" +"Quaternion qtNormalize(Quaternion in)\n" +"{\n" +" return fastNormalize4(in);\n" +"// in /= length( in );\n" +"// return in;\n" +"}\n" +"__inline\n" +"float4 qtRotate(Quaternion q, float4 vec)\n" +"{\n" +" Quaternion qInv = qtInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = qtMul(qtMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"__inline\n" +"Quaternion qtInvert(Quaternion q)\n" +"{\n" +" return (Quaternion)(-q.xyz, q.w);\n" +"}\n" +"__inline\n" +"float4 qtInvRotate(const Quaternion q, float4 vec)\n" +"{\n" +" return qtRotate( qtInvert( q ), vec );\n" +"}\n" +"#define WG_SIZE 64\n" +"typedef struct\n" +"{\n" +" float4 m_pos;\n" +" Quaternion m_quat;\n" +" float4 m_linVel;\n" +" float4 m_angVel;\n" +" u32 m_shapeIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"} Body;\n" +"typedef struct\n" +"{\n" +" Matrix3x3 m_invInertia;\n" +" Matrix3x3 m_initInvInertia;\n" +"} Shape;\n" +"typedef struct\n" +"{\n" +" float4 m_linear;\n" +" float4 m_worldPos[4];\n" +" float4 m_center; \n" +" float m_jacCoeffInv[4];\n" +" float m_b[4];\n" +" float m_appliedRambdaDt[4];\n" +" float m_fJacCoeffInv[2]; \n" +" float m_fAppliedRambdaDt[2]; \n" +" u32 m_bodyA;\n" +" u32 m_bodyB;\n" +" int m_batchIdx;\n" +" u32 m_paddings;\n" +"} Constraint4;\n" +"__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" \n" +" if( i < numContactManifolds)\n" +" {\n" +" int pa = manifoldPtr[i].m_bodyAPtrAndSignBit;\n" +" bool isFixedA = (pa <0) || (pa == fixedBodyIndex);\n" +" int bodyIndexA = abs(pa);\n" +" if (!isFixedA)\n" +" {\n" +" AtomInc1(bodyCount[bodyIndexA],contactConstraintOffsets[i].x);\n" +" }\n" +" barrier(CLK_GLOBAL_MEM_FENCE);\n" +" int pb = manifoldPtr[i].m_bodyBPtrAndSignBit;\n" +" bool isFixedB = (pb <0) || (pb == fixedBodyIndex);\n" +" int bodyIndexB = abs(pb);\n" +" if (!isFixedB)\n" +" {\n" +" AtomInc1(bodyCount[bodyIndexB],contactConstraintOffsets[i].y);\n" +" } \n" +" }\n" +"}\n" +"__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" \n" +" if( i < numSplitBodies)\n" +" {\n" +" linearVelocities[i] = make_float4(0);\n" +" angularVelocities[i] = make_float4(0);\n" +" }\n" +"}\n" +"__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" +"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numBodies)\n" +" {\n" +" if (gBodies[i].m_invMass)\n" +" {\n" +" int bodyOffset = offsetSplitBodies[i];\n" +" int count = bodyCount[i];\n" +" float factor = 1.f/((float)count);\n" +" float4 averageLinVel = make_float4(0.f);\n" +" float4 averageAngVel = make_float4(0.f);\n" +" \n" +" for (int j=0;j<count;j++)\n" +" {\n" +" averageLinVel += deltaLinearVelocities[bodyOffset+j]*factor;\n" +" averageAngVel += deltaAngularVelocities[bodyOffset+j]*factor;\n" +" }\n" +" \n" +" for (int j=0;j<count;j++)\n" +" {\n" +" deltaLinearVelocities[bodyOffset+j] = averageLinVel;\n" +" deltaAngularVelocities[bodyOffset+j] = averageAngVel;\n" +" }\n" +" \n" +" }//bodies[i].m_invMass\n" +" }//i<numBodies\n" +"}\n" +"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n" +"{\n" +" *linear = make_float4(n.xyz,0.f);\n" +" *angular0 = cross3(r0, n);\n" +" *angular1 = -cross3(r1, n);\n" +"}\n" +"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n" +"{\n" +" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n" +"}\n" +"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n" +" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n" +"{\n" +" // linear0,1 are normlized\n" +" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n" +" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n" +" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n" +" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n" +" return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n" +"}\n" +"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n" +" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n" +"{\n" +" if (fabs(n.z) > 0.70710678f) {\n" +" // choose p in y-z plane\n" +" float a = n.y*n.y + n.z*n.z;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = 0;\n" +" p[0].y = -n.z*k;\n" +" p[0].z = n.y*k;\n" +" // set q = n x p\n" +" q[0].x = a*k;\n" +" q[0].y = -n.x*p[0].z;\n" +" q[0].z = n.x*p[0].y;\n" +" }\n" +" else {\n" +" // choose p in x-y plane\n" +" float a = n.x*n.x + n.y*n.y;\n" +" float k = 1.f/sqrt(a);\n" +" p[0].x = -n.y*k;\n" +" p[0].y = n.x*k;\n" +" p[0].z = 0;\n" +" // set q = n x p\n" +" q[0].x = -n.z*p[0].y;\n" +" q[0].y = n.z*p[0].x;\n" +" q[0].z = a*k;\n" +" }\n" +"}\n" +"void solveContact(__global Constraint4* cs,\n" +" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n" +" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n" +" float4* dLinVelA, float4* dAngVelA, float4* dLinVelB, float4* dAngVelB)\n" +"{\n" +" float minRambdaDt = 0;\n" +" float maxRambdaDt = FLT_MAX;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = cs->m_worldPos[ic] - posA;\n" +" float4 r1 = cs->m_worldPos[ic] - posB;\n" +" setLinearAndAngular( cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n" +" \n" +" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n" +" *linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n" +" rambdaDt *= cs->m_jacCoeffInv[ic];\n" +" \n" +" {\n" +" float prevSum = cs->m_appliedRambdaDt[ic];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt );\n" +" updated = min2( updated, maxRambdaDt );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_appliedRambdaDt[ic] = updated;\n" +" }\n" +" \n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" \n" +" if (invMassA)\n" +" {\n" +" *dLinVelA += linImp0;\n" +" *dAngVelA += angImp0;\n" +" }\n" +" if (invMassB)\n" +" {\n" +" *dLinVelB += linImp1;\n" +" *dAngVelB += angImp1;\n" +" }\n" +" }\n" +"}\n" +"// solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" +"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n" +"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" +"__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" +"{\n" +" //float frictionCoeff = ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" \n" +" float4 dLinVelA = make_float4(0,0,0,0);\n" +" float4 dAngVelA = make_float4(0,0,0,0);\n" +" float4 dLinVelB = make_float4(0,0,0,0);\n" +" float4 dAngVelB = make_float4(0,0,0,0);\n" +" \n" +" int bodyOffsetA = offsetSplitBodies[aIdx];\n" +" int constraintOffsetA = contactConstraintOffsets[0].x;\n" +" int splitIndexA = bodyOffsetA+constraintOffsetA;\n" +" \n" +" if (invMassA)\n" +" {\n" +" dLinVelA = deltaLinearVelocities[splitIndexA];\n" +" dAngVelA = deltaAngularVelocities[splitIndexA];\n" +" }\n" +" int bodyOffsetB = offsetSplitBodies[bIdx];\n" +" int constraintOffsetB = contactConstraintOffsets[0].y;\n" +" int splitIndexB= bodyOffsetB+constraintOffsetB;\n" +" if (invMassB)\n" +" {\n" +" dLinVelB = deltaLinearVelocities[splitIndexB];\n" +" dAngVelB = deltaAngularVelocities[splitIndexB];\n" +" }\n" +" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +" posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n" +" if (invMassA)\n" +" {\n" +" deltaLinearVelocities[splitIndexA] = dLinVelA;\n" +" deltaAngularVelocities[splitIndexA] = dAngVelA;\n" +" } \n" +" if (invMassB)\n" +" {\n" +" deltaLinearVelocities[splitIndexB] = dLinVelB;\n" +" deltaAngularVelocities[splitIndexB] = dAngVelB;\n" +" }\n" +"}\n" +"__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" +"__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" +"float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" +")\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numManifolds)\n" +" {\n" +" solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" +" }\n" +"}\n" +"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n" +" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" +" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n" +"{\n" +" float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n" +" int aIdx = ldsCs[0].m_bodyA;\n" +" int bIdx = ldsCs[0].m_bodyB;\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" \n" +" float4 dLinVelA = make_float4(0,0,0,0);\n" +" float4 dAngVelA = make_float4(0,0,0,0);\n" +" float4 dLinVelB = make_float4(0,0,0,0);\n" +" float4 dAngVelB = make_float4(0,0,0,0);\n" +" \n" +" int bodyOffsetA = offsetSplitBodies[aIdx];\n" +" int constraintOffsetA = contactConstraintOffsets[0].x;\n" +" int splitIndexA = bodyOffsetA+constraintOffsetA;\n" +" \n" +" if (invMassA)\n" +" {\n" +" dLinVelA = deltaLinearVelocities[splitIndexA];\n" +" dAngVelA = deltaAngularVelocities[splitIndexA];\n" +" }\n" +" int bodyOffsetB = offsetSplitBodies[bIdx];\n" +" int constraintOffsetB = contactConstraintOffsets[0].y;\n" +" int splitIndexB= bodyOffsetB+constraintOffsetB;\n" +" if (invMassB)\n" +" {\n" +" dLinVelB = deltaLinearVelocities[splitIndexB];\n" +" dAngVelB = deltaAngularVelocities[splitIndexB];\n" +" }\n" +" {\n" +" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n" +" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n" +" float sum = 0;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" sum +=ldsCs[0].m_appliedRambdaDt[j];\n" +" }\n" +" frictionCoeff = 0.7f;\n" +" for(int j=0; j<4; j++)\n" +" {\n" +" maxRambdaDt[j] = frictionCoeff*sum;\n" +" minRambdaDt[j] = -maxRambdaDt[j];\n" +" }\n" +" \n" +"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n" +"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n" +" \n" +" \n" +" {\n" +" \n" +" __global Constraint4* cs = ldsCs;\n" +" \n" +" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n" +" const float4 center = cs->m_center;\n" +" \n" +" float4 n = -cs->m_linear;\n" +" \n" +" float4 tangent[2];\n" +" btPlaneSpace1(n,&tangent[0],&tangent[1]);\n" +" float4 angular0, angular1, linear;\n" +" float4 r0 = center - posA;\n" +" float4 r1 = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n" +" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA+dLinVelA, angVelA+dAngVelA, linVelB+dLinVelB, angVelB+dAngVelB );\n" +" rambdaDt *= cs->m_fJacCoeffInv[i];\n" +" \n" +" {\n" +" float prevSum = cs->m_fAppliedRambdaDt[i];\n" +" float updated = prevSum;\n" +" updated += rambdaDt;\n" +" updated = max2( updated, minRambdaDt[i] );\n" +" updated = min2( updated, maxRambdaDt[i] );\n" +" rambdaDt = updated - prevSum;\n" +" cs->m_fAppliedRambdaDt[i] = updated;\n" +" }\n" +" \n" +" float4 linImp0 = invMassA*linear*rambdaDt;\n" +" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n" +" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n" +" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n" +" \n" +" dLinVelA += linImp0;\n" +" dAngVelA += angImp0;\n" +" dLinVelB += linImp1;\n" +" dAngVelB += angImp1;\n" +" }\n" +" { // angular damping for point constraint\n" +" float4 ab = normalize3( posB - posA );\n" +" float4 ac = normalize3( center - posA );\n" +" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n" +" {\n" +" float angNA = dot3F4( n, angVelA );\n" +" float angNB = dot3F4( n, angVelB );\n" +" \n" +" dAngVelA -= (angNA*0.1f)*n;\n" +" dAngVelB -= (angNB*0.1f)*n;\n" +" }\n" +" }\n" +" }\n" +" \n" +" \n" +" }\n" +" if (invMassA)\n" +" {\n" +" deltaLinearVelocities[splitIndexA] = dLinVelA;\n" +" deltaAngularVelocities[splitIndexA] = dAngVelA;\n" +" } \n" +" if (invMassB)\n" +" {\n" +" deltaLinearVelocities[splitIndexB] = dLinVelB;\n" +" deltaAngularVelocities[splitIndexB] = dAngVelB;\n" +" }\n" +" \n" +"}\n" +"__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n" +" __global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n" +" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n" +" float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n" +")\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numManifolds)\n" +" {\n" +" solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n" +" }\n" +"}\n" +"__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n" +" __global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n" +"{\n" +" int i = GET_GLOBAL_IDX;\n" +" if (i<numBodies)\n" +" {\n" +" if (gBodies[i].m_invMass)\n" +" {\n" +" int bodyOffset = offsetSplitBodies[i];\n" +" int count = bodyCount[i];\n" +" if (count)\n" +" {\n" +" gBodies[i].m_linVel += deltaLinearVelocities[bodyOffset];\n" +" gBodies[i].m_angVel += deltaAngularVelocities[bodyOffset];\n" +" }\n" +" }\n" +" }\n" +"}\n" +"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n" +" const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n" +" __global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n" +" Constraint4* dstC )\n" +"{\n" +" dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n" +" dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n" +" float dtInv = 1.f/dt;\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n" +" dstC->m_linear = src->m_worldNormalOnB;\n" +" dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n" +" for(int ic=0; ic<4; ic++)\n" +" {\n" +" float4 r0 = src->m_worldPosB[ic] - posA;\n" +" float4 r1 = src->m_worldPosB[ic] - posB;\n" +" if( ic >= src->m_worldNormalOnB.w )//npoints\n" +" {\n" +" dstC->m_jacCoeffInv[ic] = 0.f;\n" +" continue;\n" +" }\n" +" float relVelN;\n" +" {\n" +" float4 linear, angular0, angular1;\n" +" setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n" +" dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n" +" relVelN = calcRelVel(linear, -linear, angular0, angular1,\n" +" linVelA, angVelA, linVelB, angVelB);\n" +" float e = 0.f;//src->getRestituitionCoeff();\n" +" if( relVelN*relVelN < 0.004f ) e = 0.f;\n" +" dstC->m_b[ic] = e*relVelN;\n" +" //float penetration = src->m_worldPosB[ic].w;\n" +" dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n" +" dstC->m_appliedRambdaDt[ic] = 0.f;\n" +" }\n" +" }\n" +" if( src->m_worldNormalOnB.w > 0 )//npoints\n" +" { // prepare friction\n" +" float4 center = make_float4(0.f);\n" +" for(int i=0; i<src->m_worldNormalOnB.w; i++) \n" +" center += src->m_worldPosB[i];\n" +" center /= (float)src->m_worldNormalOnB.w;\n" +" float4 tangent[2];\n" +" btPlaneSpace1(-src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n" +" \n" +" float4 r[2];\n" +" r[0] = center - posA;\n" +" r[1] = center - posB;\n" +" for(int i=0; i<2; i++)\n" +" {\n" +" float4 linear, angular0, angular1;\n" +" setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n" +" dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n" +" invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n" +" dstC->m_fAppliedRambdaDt[i] = 0.f;\n" +" }\n" +" dstC->m_center = center;\n" +" }\n" +" for(int i=0; i<4; i++)\n" +" {\n" +" if( i<src->m_worldNormalOnB.w )\n" +" {\n" +" dstC->m_worldPos[i] = src->m_worldPosB[i];\n" +" }\n" +" else\n" +" {\n" +" dstC->m_worldPos[i] = make_float4(0.f);\n" +" }\n" +" }\n" +"}\n" +"__kernel\n" +"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n" +"void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n" +"__global const unsigned int* bodyCount,\n" +"int nContacts,\n" +"float dt,\n" +"float positionDrift,\n" +"float positionConstraintCoeff\n" +")\n" +"{\n" +" int gIdx = GET_GLOBAL_IDX;\n" +" \n" +" if( gIdx < nContacts )\n" +" {\n" +" int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n" +" int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n" +" float4 posA = gBodies[aIdx].m_pos;\n" +" float4 linVelA = gBodies[aIdx].m_linVel;\n" +" float4 angVelA = gBodies[aIdx].m_angVel;\n" +" float invMassA = gBodies[aIdx].m_invMass;\n" +" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n" +" float4 posB = gBodies[bIdx].m_pos;\n" +" float4 linVelB = gBodies[bIdx].m_linVel;\n" +" float4 angVelB = gBodies[bIdx].m_angVel;\n" +" float invMassB = gBodies[bIdx].m_invMass;\n" +" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n" +" Constraint4 cs;\n" +" float countA = invMassA != 0.f ? (float)bodyCount[aIdx] : 1;\n" +" float countB = invMassB != 0.f ? (float)bodyCount[bIdx] : 1;\n" +" setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n" +" &gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n" +" &cs );\n" +" \n" +" cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n" +" gConstraintOut[gIdx] = cs;\n" +" }\n" +"}\n" +; diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl new file mode 100644 index 0000000000..ba8ba735d0 --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl @@ -0,0 +1,22 @@ + + +#include "Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h" + + +__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB) +{ + int nodeID = get_global_id(0); + if( nodeID < numNodes ) + { + b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB); + } +} + +__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs) +{ + int pairId = get_global_id(0); + if( pairId< numPairs ) + { + pairs[pairId].z = 0xffffffff; + } +}
\ No newline at end of file diff --git a/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h new file mode 100644 index 0000000000..d70e74017a --- /dev/null +++ b/thirdparty/bullet/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h @@ -0,0 +1,483 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* updateAabbsKernelCL= \ +"#ifndef B3_UPDATE_AABBS_H\n" +"#define B3_UPDATE_AABBS_H\n" +"#ifndef B3_AABB_H\n" +"#define B3_AABB_H\n" +"#ifndef B3_FLOAT4_H\n" +"#define B3_FLOAT4_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#define B3_PLATFORM_DEFINITIONS_H\n" +"struct MyTest\n" +"{\n" +" int bla;\n" +"};\n" +"#ifdef __cplusplus\n" +"#else\n" +"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n" +"#define B3_LARGE_FLOAT 1e18f\n" +"#define B3_INFINITY 1e18f\n" +"#define b3Assert(a)\n" +"#define b3ConstArray(a) __global const a*\n" +"#define b3AtomicInc atomic_inc\n" +"#define b3AtomicAdd atomic_add\n" +"#define b3Fabs fabs\n" +"#define b3Sqrt native_sqrt\n" +"#define b3Sin native_sin\n" +"#define b3Cos native_cos\n" +"#define B3_STATIC\n" +"#endif\n" +"#endif\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Float4;\n" +" #define b3Float4ConstArg const b3Float4\n" +" #define b3MakeFloat4 (float4)\n" +" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return dot(a1, b1);\n" +" }\n" +" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n" +" {\n" +" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n" +" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n" +" return cross(a1, b1);\n" +" }\n" +" #define b3MinFloat4 min\n" +" #define b3MaxFloat4 max\n" +" #define b3Normalized(a) normalize(a)\n" +"#endif \n" +" \n" +"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n" +"{\n" +" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n" +" return false;\n" +" return true;\n" +"}\n" +"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n" +"{\n" +" float maxDot = -B3_INFINITY;\n" +" int i = 0;\n" +" int ptIndex = -1;\n" +" for( i = 0; i < vecLen; i++ )\n" +" {\n" +" float dot = b3Dot3F4(vecArray[i],vec);\n" +" \n" +" if( dot > maxDot )\n" +" {\n" +" maxDot = dot;\n" +" ptIndex = i;\n" +" }\n" +" }\n" +" b3Assert(ptIndex>=0);\n" +" if (ptIndex<0)\n" +" {\n" +" ptIndex = 0;\n" +" }\n" +" *dotOut = maxDot;\n" +" return ptIndex;\n" +"}\n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_MAT3x3_H\n" +"#define B3_MAT3x3_H\n" +"#ifndef B3_QUAT_H\n" +"#define B3_QUAT_H\n" +"#ifndef B3_PLATFORM_DEFINITIONS_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +" typedef float4 b3Quat;\n" +" #define b3QuatConstArg const b3Quat\n" +" \n" +" \n" +"inline float4 b3FastNormalize4(float4 v)\n" +"{\n" +" v = (float4)(v.xyz,0.f);\n" +" return fast_normalize(v);\n" +"}\n" +" \n" +"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n" +"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n" +"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n" +"{\n" +" b3Quat ans;\n" +" ans = b3Cross3( a, b );\n" +" ans += a.w*b+b.w*a;\n" +"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n" +" ans.w = a.w*b.w - b3Dot3F4(a, b);\n" +" return ans;\n" +"}\n" +"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n" +"{\n" +" b3Quat q;\n" +" q=in;\n" +" //return b3FastNormalize4(in);\n" +" float len = native_sqrt(dot(q, q));\n" +" if(len > 0.f)\n" +" {\n" +" q *= 1.f / len;\n" +" }\n" +" else\n" +" {\n" +" q.x = q.y = q.z = 0.f;\n" +" q.w = 1.f;\n" +" }\n" +" return q;\n" +"}\n" +"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" b3Quat qInv = b3QuatInvert( q );\n" +" float4 vcpy = vec;\n" +" vcpy.w = 0.f;\n" +" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n" +" return out;\n" +"}\n" +"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n" +"{\n" +" return (b3Quat)(-q.xyz, q.w);\n" +"}\n" +"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n" +"{\n" +" return b3QuatRotate( b3QuatInvert( q ), vec );\n" +"}\n" +"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n" +"{\n" +" return b3QuatRotate( orientation, point ) + (translation);\n" +"}\n" +" \n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"typedef struct\n" +"{\n" +" b3Float4 m_row[3];\n" +"}b3Mat3x3;\n" +"#define b3Mat3x3ConstArg const b3Mat3x3\n" +"#define b3GetRow(m,row) (m.m_row[row])\n" +"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n" +"{\n" +" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n" +" b3Mat3x3 out;\n" +" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n" +" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n" +" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n" +" out.m_row[0].w = 0.f;\n" +" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n" +" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n" +" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n" +" out.m_row[1].w = 0.f;\n" +" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n" +" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n" +" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n" +" out.m_row[2].w = 0.f;\n" +" return out;\n" +"}\n" +"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = fabs(matIn.m_row[0]);\n" +" out.m_row[1] = fabs(matIn.m_row[1]);\n" +" out.m_row[2] = fabs(matIn.m_row[2]);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtZero();\n" +"__inline\n" +"b3Mat3x3 mtIdentity();\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n" +"__inline\n" +"b3Mat3x3 mtZero()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(0.f);\n" +" m.m_row[1] = (b3Float4)(0.f);\n" +" m.m_row[2] = (b3Float4)(0.f);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtIdentity()\n" +"{\n" +" b3Mat3x3 m;\n" +" m.m_row[0] = (b3Float4)(1,0,0,0);\n" +" m.m_row[1] = (b3Float4)(0,1,0,0);\n" +" m.m_row[2] = (b3Float4)(0,0,1,0);\n" +" return m;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n" +"{\n" +" b3Mat3x3 out;\n" +" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n" +" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n" +" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n" +" return out;\n" +"}\n" +"__inline\n" +"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n" +"{\n" +" b3Mat3x3 transB;\n" +" transB = mtTranspose( b );\n" +" b3Mat3x3 ans;\n" +" // why this doesn't run when 0ing in the for{}\n" +" a.m_row[0].w = 0.f;\n" +" a.m_row[1].w = 0.f;\n" +" a.m_row[2].w = 0.f;\n" +" for(int i=0; i<3; i++)\n" +" {\n" +"// a.m_row[i].w = 0.f;\n" +" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n" +" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n" +" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n" +" ans.m_row[i].w = 0.f;\n" +" }\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n" +"{\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a.m_row[0], b );\n" +" ans.y = b3Dot3F4( a.m_row[1], b );\n" +" ans.z = b3Dot3F4( a.m_row[2], b );\n" +" ans.w = 0.f;\n" +" return ans;\n" +"}\n" +"__inline\n" +"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n" +"{\n" +" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n" +" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n" +" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n" +" b3Float4 ans;\n" +" ans.x = b3Dot3F4( a, colx );\n" +" ans.y = b3Dot3F4( a, coly );\n" +" ans.z = b3Dot3F4( a, colz );\n" +" return ans;\n" +"}\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3Aabb b3Aabb_t;\n" +"struct b3Aabb\n" +"{\n" +" union\n" +" {\n" +" float m_min[4];\n" +" b3Float4 m_minVec;\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float m_max[4];\n" +" b3Float4 m_maxVec;\n" +" int m_signedMaxIndices[4];\n" +" };\n" +"};\n" +"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n" +" b3Float4ConstArg pos,\n" +" b3QuatConstArg orn,\n" +" b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n" +"{\n" +" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n" +" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n" +" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n" +" b3Mat3x3 m;\n" +" m = b3QuatGetRotationMatrix(orn);\n" +" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n" +" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n" +" \n" +" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n" +" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n" +" 0.f);\n" +" *aabbMinOut = center-extent;\n" +" *aabbMaxOut = center+extent;\n" +"}\n" +"/// conservative test for overlap between two aabbs\n" +"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n" +" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n" +" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n" +" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"#endif //B3_AABB_H\n" +"#ifndef B3_COLLIDABLE_H\n" +"#define B3_COLLIDABLE_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"enum b3ShapeTypes\n" +"{\n" +" SHAPE_HEIGHT_FIELD=1,\n" +" SHAPE_CONVEX_HULL=3,\n" +" SHAPE_PLANE=4,\n" +" SHAPE_CONCAVE_TRIMESH=5,\n" +" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n" +" SHAPE_SPHERE=7,\n" +" MAX_NUM_SHAPE_TYPES,\n" +"};\n" +"typedef struct b3Collidable b3Collidable_t;\n" +"struct b3Collidable\n" +"{\n" +" union {\n" +" int m_numChildShapes;\n" +" int m_bvhIndex;\n" +" };\n" +" union\n" +" {\n" +" float m_radius;\n" +" int m_compoundBvhIndex;\n" +" };\n" +" int m_shapeType;\n" +" union\n" +" {\n" +" int m_shapeIndex;\n" +" float m_height;\n" +" };\n" +"};\n" +"typedef struct b3GpuChildShape b3GpuChildShape_t;\n" +"struct b3GpuChildShape\n" +"{\n" +" b3Float4 m_childPosition;\n" +" b3Quat m_childOrientation;\n" +" union\n" +" {\n" +" int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n" +" int m_capsuleAxis;\n" +" };\n" +" union \n" +" {\n" +" float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n" +" int m_numChildShapes;//used for compound shape\n" +" };\n" +" union \n" +" {\n" +" float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n" +" int m_collidableShapeIndex;\n" +" };\n" +" int m_shapeType;\n" +"};\n" +"struct b3CompoundOverlappingPair\n" +"{\n" +" int m_bodyIndexA;\n" +" int m_bodyIndexB;\n" +"// int m_pairType;\n" +" int m_childShapeIndexA;\n" +" int m_childShapeIndexB;\n" +"};\n" +"#endif //B3_COLLIDABLE_H\n" +"#ifndef B3_RIGIDBODY_DATA_H\n" +"#define B3_RIGIDBODY_DATA_H\n" +"#ifndef B3_FLOAT4_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_FLOAT4_H\n" +"#ifndef B3_QUAT_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif \n" +"#endif //B3_QUAT_H\n" +"#ifndef B3_MAT3x3_H\n" +"#ifdef __cplusplus\n" +"#else\n" +"#endif\n" +"#endif //B3_MAT3x3_H\n" +"typedef struct b3RigidBodyData b3RigidBodyData_t;\n" +"struct b3RigidBodyData\n" +"{\n" +" b3Float4 m_pos;\n" +" b3Quat m_quat;\n" +" b3Float4 m_linVel;\n" +" b3Float4 m_angVel;\n" +" int m_collidableIdx;\n" +" float m_invMass;\n" +" float m_restituitionCoeff;\n" +" float m_frictionCoeff;\n" +"};\n" +"typedef struct b3InertiaData b3InertiaData_t;\n" +"struct b3InertiaData\n" +"{\n" +" b3Mat3x3 m_invInertiaWorld;\n" +" b3Mat3x3 m_initInvInertia;\n" +"};\n" +"#endif //B3_RIGIDBODY_DATA_H\n" +" \n" +"void b3ComputeWorldAabb( int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n" +"{\n" +" __global const b3RigidBodyData_t* body = &bodies[bodyId];\n" +" b3Float4 position = body->m_pos;\n" +" b3Quat orientation = body->m_quat;\n" +" \n" +" int collidableIndex = body->m_collidableIdx;\n" +" int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n" +" \n" +" if (shapeIndex>=0)\n" +" {\n" +" \n" +" b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n" +" b3Aabb_t worldAabb;\n" +" \n" +" b3Float4 aabbAMinOut,aabbAMaxOut; \n" +" float margin = 0.f;\n" +" b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n" +" \n" +" worldAabb.m_minVec =aabbAMinOut;\n" +" worldAabb.m_minIndices[3] = bodyId;\n" +" worldAabb.m_maxVec = aabbAMaxOut;\n" +" worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 0 : 1;\n" +" worldAabbs[bodyId] = worldAabb;\n" +" }\n" +"}\n" +"#endif //B3_UPDATE_AABBS_H\n" +"__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n" +"{\n" +" int nodeID = get_global_id(0);\n" +" if( nodeID < numNodes )\n" +" {\n" +" b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n" +" }\n" +"}\n" +"__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs)\n" +"{\n" +" int pairId = get_global_id(0);\n" +" if( pairId< numPairs )\n" +" {\n" +" pairs[pairId].z = 0xffffffff;\n" +" }\n" +"}\n" +; |