/***
 * ---------------------------------
 * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>
 *
 *  This file was ported from mpr.c file, part of libccd.
 *  The Minkoski Portal Refinement implementation was ported 
 *  to OpenCL by Erwin Coumans for the Bullet 3 Physics library.
 *  at http://github.com/erwincoumans/bullet3
 *
 *  Distributed under the OSI-approved BSD License (the "License");
 *  see <http://www.opensource.org/licenses/bsd-license.php>.
 *  This software is distributed WITHOUT ANY WARRANTY; without even the
 *  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *  See the License for more information.
 */




#ifndef B3_MPR_PENETRATION_H
#define B3_MPR_PENETRATION_H

#include "Bullet3Common/shared/b3PlatformDefinitions.h"
#include "Bullet3Common/shared/b3Float4.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"




#ifdef __cplusplus
#define B3_MPR_SQRT sqrtf
#else
#define B3_MPR_SQRT sqrt
#endif
#define B3_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))
#define B3_MPR_FABS fabs

#define B3_MPR_TOLERANCE 1E-6f
#define B3_MPR_MAX_ITERATIONS 1000

struct _b3MprSupport_t 
{
    b3Float4 v;  //!< Support point in minkowski sum
    b3Float4 v1; //!< Support point in obj1
    b3Float4 v2; //!< Support point in obj2
};
typedef struct _b3MprSupport_t b3MprSupport_t;

struct _b3MprSimplex_t 
{
    b3MprSupport_t ps[4];
    int last; //!< index of last added point
};
typedef struct _b3MprSimplex_t b3MprSimplex_t;

inline b3MprSupport_t* b3MprSimplexPointW(b3MprSimplex_t *s, int idx)
{
    return &s->ps[idx];
}

inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size)
{
    s->last = size - 1;
}


inline int b3MprSimplexSize(const b3MprSimplex_t *s)
{
    return s->last + 1;
}


inline const b3MprSupport_t* b3MprSimplexPoint(const b3MprSimplex_t* s, int idx)
{
    // here is no check on boundaries
    return &s->ps[idx];
}

inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s)
{
    *d = *s;
}

inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a)
{
    b3MprSupportCopy(s->ps + pos, a);
}


inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2)
{
    b3MprSupport_t supp;

    b3MprSupportCopy(&supp, &s->ps[pos1]);
    b3MprSupportCopy(&s->ps[pos1], &s->ps[pos2]);
    b3MprSupportCopy(&s->ps[pos2], &supp);
}


inline int b3MprIsZero(float val)
{
    return B3_MPR_FABS(val) < FLT_EPSILON;
}



inline int b3MprEq(float _a, float _b)
{
    float ab;
    float a, b;

    ab = B3_MPR_FABS(_a - _b);
    if (B3_MPR_FABS(ab) < FLT_EPSILON)
        return 1;

    a = B3_MPR_FABS(_a);
    b = B3_MPR_FABS(_b);
    if (b > a){
        return ab < FLT_EPSILON * b;
    }else{
        return ab < FLT_EPSILON * a;
    }
}


inline int b3MprVec3Eq(const b3Float4* a, const b3Float4 *b)
{
    return b3MprEq((*a).x, (*b).x)
            && b3MprEq((*a).y, (*b).y)
            && b3MprEq((*a).z, (*b).z);
}



inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec,__global const b3ConvexPolyhedronData_t* hull, 	b3ConstArray(b3Float4) verticesA)
{
	b3Float4 supVec = b3MakeFloat4(0,0,0,0);
	float maxDot = -B3_LARGE_FLOAT;

    if( 0 < hull->m_numVertices )
    {
        const b3Float4 scaled = supportVec;
		int index = b3MaxDot(scaled, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &maxDot);
        return verticesA[hull->m_vertexOffset+index];
    }

    return supVec;

}


B3_STATIC void b3MprConvexSupport(int pairIndex,int bodyIndex,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, 
													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
													b3ConstArray(b3Collidable_t)				cpuCollidables,
													b3ConstArray(b3Float4)					cpuVertices,
													__global b3Float4* sepAxis,
														const b3Float4* _dir, b3Float4* outp, int logme)
{
	//dir is in worldspace, move to local space
	
	b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;
	b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;
	
	b3Float4 dir = b3MakeFloat4((*_dir).x,(*_dir).y,(*_dir).z,0.f);
	
	const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn),dir);
	

	//find local support vertex
	int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;
	
	b3Assert(cpuCollidables[colIndex].m_shapeType==SHAPE_CONVEX_HULL);
	__global const b3ConvexPolyhedronData_t* hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];
	
	b3Float4 pInA;
	if (logme)
	{


	//	b3Float4 supVec = b3MakeFloat4(0,0,0,0);
		float maxDot = -B3_LARGE_FLOAT;

		if( 0 < hull->m_numVertices )
		{
			const b3Float4 scaled = localDir;
			int index = b3MaxDot(scaled, &cpuVertices[hull->m_vertexOffset], hull->m_numVertices, &maxDot);
			pInA = cpuVertices[hull->m_vertexOffset+index];
			
		}


	} else
	{
		pInA = b3LocalGetSupportVertex(localDir,hull,cpuVertices);
	}

	//move vertex to world space
	*outp = b3TransformPoint(pInA,pos,orn);
	
}

inline void b3MprSupport(int pairIndex,int bodyIndexA, int bodyIndexB,   b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, 
													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
													b3ConstArray(b3Collidable_t)				cpuCollidables,
													b3ConstArray(b3Float4)					cpuVertices,
													__global b3Float4* sepAxis,
													const b3Float4* _dir, b3MprSupport_t *supp)
{
    b3Float4 dir;
	dir = *_dir;
	b3MprConvexSupport(pairIndex,bodyIndexA,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v1,0);
    dir = *_dir*-1.f;
	b3MprConvexSupport(pairIndex,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,&dir, &supp->v2,0);
    supp->v = supp->v1 - supp->v2;
}









inline void b3FindOrigin(int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center)
{

    center->v1 = cpuBodyBuf[bodyIndexA].m_pos;
	center->v2 = cpuBodyBuf[bodyIndexB].m_pos;
    center->v = center->v1 - center->v2;
}

inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z)
{
	(*v).x = x;
	(*v).y = y;
	(*v).z = z;
	(*v).w = 0.f;
}

inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w)
{
    (*v).x += (*w).x;
    (*v).y += (*w).y;
    (*v).z += (*w).z;
}

inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w)
{
    *v = *w;
}

inline void b3MprVec3Scale(b3Float4 *d, float k)
{
    *d *= k;
}

inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b)
{
    float dot;

	dot = b3Dot3F4(*a,*b);
    return dot;
}


inline float b3MprVec3Len2(const b3Float4 *v)
{
    return b3MprVec3Dot(v, v);
}

inline void b3MprVec3Normalize(b3Float4 *d)
{
    float k = 1.f / B3_MPR_SQRT(b3MprVec3Len2(d));
    b3MprVec3Scale(d, k);
}

inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b)
{
	*d = b3Cross3(*a,*b);
	
}


inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w)
{
	*d = *v - *w;
}

inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir)
{
    b3Float4 v2v1, v3v1;

    b3MprVec3Sub2(&v2v1, &b3MprSimplexPoint(portal, 2)->v,
                       &b3MprSimplexPoint(portal, 1)->v);
    b3MprVec3Sub2(&v3v1, &b3MprSimplexPoint(portal, 3)->v,
                       &b3MprSimplexPoint(portal, 1)->v);
    b3MprVec3Cross(dir, &v2v1, &v3v1);
    b3MprVec3Normalize(dir);
}


inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal,
                                       const b3Float4 *dir)
{
    float dot;
    dot = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);
    return b3MprIsZero(dot) || dot > 0.f;
}

inline int portalReachTolerance(const b3MprSimplex_t *portal,
                                     const b3MprSupport_t *v4,
                                     const b3Float4 *dir)
{
    float dv1, dv2, dv3, dv4;
    float dot1, dot2, dot3;

    // find the smallest dot product of dir and {v1-v4, v2-v4, v3-v4}

    dv1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);
    dv2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);
    dv3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);
    dv4 = b3MprVec3Dot(&v4->v, dir);

    dot1 = dv4 - dv1;
    dot2 = dv4 - dv2;
    dot3 = dv4 - dv3;

    dot1 = B3_MPR_FMIN(dot1, dot2);
    dot1 = B3_MPR_FMIN(dot1, dot3);

    return b3MprEq(dot1, B3_MPR_TOLERANCE) || dot1 < B3_MPR_TOLERANCE;
}

inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal,   
                                         const b3MprSupport_t *v4,
                                         const b3Float4 *dir)
{
    float dot;
    dot = b3MprVec3Dot(&v4->v, dir);
    return b3MprIsZero(dot) || dot > 0.f;
}

inline void b3ExpandPortal(b3MprSimplex_t *portal,
                              const b3MprSupport_t *v4)
{
    float dot;
    b3Float4 v4v0;

    b3MprVec3Cross(&v4v0, &v4->v, &b3MprSimplexPoint(portal, 0)->v);
    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &v4v0);
    if (dot > 0.f){
        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &v4v0);
        if (dot > 0.f){
            b3MprSimplexSet(portal, 1, v4);
        }else{
            b3MprSimplexSet(portal, 3, v4);
        }
    }else{
        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &v4v0);
        if (dot > 0.f){
            b3MprSimplexSet(portal, 2, v4);
        }else{
            b3MprSimplexSet(portal, 1, v4);
        }
    }
}



B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, 
													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
													b3ConstArray(b3Collidable_t)				cpuCollidables,
													b3ConstArray(b3Float4)					cpuVertices,
													__global b3Float4* sepAxis,
													__global int*	hasSepAxis,
													b3MprSimplex_t *portal)
{
    b3Float4 dir, va, vb;
    float dot;
    int cont;
	
	

    // vertex 0 is center of portal
    b3FindOrigin(bodyIndexA,bodyIndexB,cpuBodyBuf, b3MprSimplexPointW(portal, 0));
    // vertex 0 is center of portal
    b3MprSimplexSetSize(portal, 1);
	


	b3Float4 zero = b3MakeFloat4(0,0,0,0);
	b3Float4* b3mpr_vec3_origin = &zero;

    if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin)){
        // Portal's center lies on origin (0,0,0) => we know that objects
        // intersect but we would need to know penetration info.
        // So move center little bit...
        b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);
        b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);
    }


    // vertex 1 = support in direction of origin
    b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);
    b3MprVec3Scale(&dir, -1.f);
    b3MprVec3Normalize(&dir);


    b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 1));

    b3MprSimplexSetSize(portal, 2);

    // test if origin isn't outside of v1
    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);
	

    if (b3MprIsZero(dot) || dot < 0.f)
        return -1;


    // vertex 2
    b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,
                       &b3MprSimplexPoint(portal, 1)->v);
    if (b3MprIsZero(b3MprVec3Len2(&dir))){
        if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin)){
            // origin lies on v1
            return 1;
        }else{
            // origin lies on v0-v1 segment
            return 2;
        }
    }

    b3MprVec3Normalize(&dir);
	 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 2));
    
    dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);
    if (b3MprIsZero(dot) || dot < 0.f)
        return -1;

    b3MprSimplexSetSize(portal, 3);

    // vertex 3 direction
    b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,
                     &b3MprSimplexPoint(portal, 0)->v);
    b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,
                     &b3MprSimplexPoint(portal, 0)->v);
    b3MprVec3Cross(&dir, &va, &vb);
    b3MprVec3Normalize(&dir);

    // it is better to form portal faces to be oriented "outside" origin
    dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);
    if (dot > 0.f){
        b3MprSimplexSwap(portal, 1, 2);
        b3MprVec3Scale(&dir, -1.f);
    }

    while (b3MprSimplexSize(portal) < 4){
		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, b3MprSimplexPointW(portal, 3));
        
        dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);
        if (b3MprIsZero(dot) || dot < 0.f)
            return -1;

        cont = 0;

        // test if origin is outside (v1, v0, v3) - set v2 as v3 and
        // continue
        b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,
                          &b3MprSimplexPoint(portal, 3)->v);
        dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);
        if (dot < 0.f && !b3MprIsZero(dot)){
            b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));
            cont = 1;
        }

        if (!cont){
            // test if origin is outside (v3, v0, v2) - set v1 as v3 and
            // continue
            b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,
                              &b3MprSimplexPoint(portal, 2)->v);
            dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);
            if (dot < 0.f && !b3MprIsZero(dot)){
                b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));
                cont = 1;
            }
        }

        if (cont){
            b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,
                             &b3MprSimplexPoint(portal, 0)->v);
            b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,
                             &b3MprSimplexPoint(portal, 0)->v);
            b3MprVec3Cross(&dir, &va, &vb);
            b3MprVec3Normalize(&dir);
        }else{
            b3MprSimplexSetSize(portal, 4);
        }
    }

    return 0;
}


B3_STATIC int b3RefinePortal(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, 
													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
													b3ConstArray(b3Collidable_t)				cpuCollidables,
													b3ConstArray(b3Float4)					cpuVertices,
													__global b3Float4* sepAxis,
													b3MprSimplex_t *portal)
{
    b3Float4 dir;
    b3MprSupport_t v4;

	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)
    //while (1)
	{
        // compute direction outside the portal (from v0 throught v1,v2,v3
        // face)
        b3PortalDir(portal, &dir);

        // test if origin is inside the portal
        if (portalEncapsulesOrigin(portal, &dir))
            return 0;

        // get next support point
        
		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);


        // test if v4 can expand portal to contain origin and if portal
        // expanding doesn't reach given tolerance
        if (!portalCanEncapsuleOrigin(portal, &v4, &dir)
                || portalReachTolerance(portal, &v4, &dir))
		{
            return -1;
        }

        // v1-v2-v3 triangle must be rearranged to face outside Minkowski
        // difference (direction from v0).
        b3ExpandPortal(portal, &v4);
    }

    return -1;
}

B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos)
{

	b3Float4 zero = b3MakeFloat4(0,0,0,0);
	b3Float4* b3mpr_vec3_origin = &zero;

    b3Float4 dir;
    size_t i;
    float b[4], sum, inv;
    b3Float4 vec, p1, p2;

    b3PortalDir(portal, &dir);

    // use barycentric coordinates of tetrahedron to find origin
    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,
                       &b3MprSimplexPoint(portal, 2)->v);
    b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);

    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,
                       &b3MprSimplexPoint(portal, 2)->v);
    b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);

    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,
                       &b3MprSimplexPoint(portal, 1)->v);
    b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);

    b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,
                       &b3MprSimplexPoint(portal, 1)->v);
    b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);

	sum = b[0] + b[1] + b[2] + b[3];

    if (b3MprIsZero(sum) || sum < 0.f){
		b[0] = 0.f;

        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,
                           &b3MprSimplexPoint(portal, 3)->v);
        b[1] = b3MprVec3Dot(&vec, &dir);
        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,
                           &b3MprSimplexPoint(portal, 1)->v);
        b[2] = b3MprVec3Dot(&vec, &dir);
        b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,
                           &b3MprSimplexPoint(portal, 2)->v);
        b[3] = b3MprVec3Dot(&vec, &dir);

		sum = b[1] + b[2] + b[3];
	}

	inv = 1.f / sum;

    b3MprVec3Copy(&p1, b3mpr_vec3_origin);
    b3MprVec3Copy(&p2, b3mpr_vec3_origin);
    for (i = 0; i < 4; i++){
        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);
        b3MprVec3Scale(&vec, b[i]);
        b3MprVec3Add(&p1, &vec);

        b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);
        b3MprVec3Scale(&vec, b[i]);
        b3MprVec3Add(&p2, &vec);
    }
    b3MprVec3Scale(&p1, inv);
    b3MprVec3Scale(&p2, inv);

    b3MprVec3Copy(pos, &p1);
    b3MprVec3Add(pos, &p2);
    b3MprVec3Scale(pos, 0.5);
}

inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b)
{
    b3Float4 ab;
    b3MprVec3Sub2(&ab, a, b);
    return b3MprVec3Len2(&ab);
}

inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P,
                                                  const b3Float4 *x0,
                                                  const b3Float4 *b,
                                                  b3Float4 *witness)
{
    // The computation comes from solving equation of segment:
    //      S(t) = x0 + t.d
    //          where - x0 is initial point of segment
    //                - d is direction of segment from x0 (|d| > 0)
    //                - t belongs to <0, 1> interval
    // 
    // Than, distance from a segment to some point P can be expressed:
    //      D(t) = |x0 + t.d - P|^2
    //          which is distance from any point on segment. Minimization
    //          of this function brings distance from P to segment.
    // Minimization of D(t) leads to simple quadratic equation that's
    // solving is straightforward.
    //
    // Bonus of this method is witness point for free.

    float dist, t;
    b3Float4 d, a;

    // direction of segment
    b3MprVec3Sub2(&d, b, x0);

    // precompute vector from P to x0
    b3MprVec3Sub2(&a, x0, P);

    t  = -1.f * b3MprVec3Dot(&a, &d);
    t /= b3MprVec3Len2(&d);

    if (t < 0.f || b3MprIsZero(t)){
        dist = b3MprVec3Dist2(x0, P);
        if (witness)
            b3MprVec3Copy(witness, x0);
    }else if (t > 1.f || b3MprEq(t, 1.f)){
        dist = b3MprVec3Dist2(b, P);
        if (witness)
            b3MprVec3Copy(witness, b);
    }else{
        if (witness){
            b3MprVec3Copy(witness, &d);
            b3MprVec3Scale(witness, t);
            b3MprVec3Add(witness, x0);
            dist = b3MprVec3Dist2(witness, P);
        }else{
            // recycling variables
            b3MprVec3Scale(&d, t);
            b3MprVec3Add(&d, &a);
            dist = b3MprVec3Len2(&d);
        }
    }

    return dist;
}


inline float b3MprVec3PointTriDist2(const b3Float4 *P,
                                const b3Float4 *x0, const b3Float4 *B,
                                const b3Float4 *C,
                                b3Float4 *witness)
{
    // Computation comes from analytic expression for triangle (x0, B, C)
    //      T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and
    // Then equation for distance is:
    //      D(s, t) = | T(s, t) - P |^2
    // This leads to minimization of quadratic function of two variables.
    // The solution from is taken only if s is between 0 and 1, t is
    // between 0 and 1 and t + s < 1, otherwise distance from segment is
    // computed.

    b3Float4 d1, d2, a;
    float u, v, w, p, q, r;
    float s, t, dist, dist2;
    b3Float4 witness2;

    b3MprVec3Sub2(&d1, B, x0);
    b3MprVec3Sub2(&d2, C, x0);
    b3MprVec3Sub2(&a, x0, P);

    u = b3MprVec3Dot(&a, &a);
    v = b3MprVec3Dot(&d1, &d1);
    w = b3MprVec3Dot(&d2, &d2);
    p = b3MprVec3Dot(&a, &d1);
    q = b3MprVec3Dot(&a, &d2);
    r = b3MprVec3Dot(&d1, &d2);

    s = (q * r - w * p) / (w * v - r * r);
    t = (-s * r - q) / w;

    if ((b3MprIsZero(s) || s > 0.f)
            && (b3MprEq(s, 1.f) || s < 1.f)
            && (b3MprIsZero(t) || t > 0.f)
            && (b3MprEq(t, 1.f) || t < 1.f)
            && (b3MprEq(t + s, 1.f) || t + s < 1.f)){

        if (witness){
            b3MprVec3Scale(&d1, s);
            b3MprVec3Scale(&d2, t);
            b3MprVec3Copy(witness, x0);
            b3MprVec3Add(witness, &d1);
            b3MprVec3Add(witness, &d2);

            dist = b3MprVec3Dist2(witness, P);
        }else{
            dist  = s * s * v;
            dist += t * t * w;
            dist += 2.f * s * t * r;
            dist += 2.f * s * p;
            dist += 2.f * t * q;
            dist += u;
        }
    }else{
        dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness);

        dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);
        if (dist2 < dist){
            dist = dist2;
            if (witness)
                b3MprVec3Copy(witness, &witness2);
        }

        dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);
        if (dist2 < dist){
            dist = dist2;
            if (witness)
                b3MprVec3Copy(witness, &witness2);
        }
    }

    return dist;
}


B3_STATIC void b3FindPenetr(int pairIndex,int bodyIndexA, int bodyIndexB,  b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, 
													b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
													b3ConstArray(b3Collidable_t)				cpuCollidables,
													b3ConstArray(b3Float4)					cpuVertices,
													__global b3Float4* sepAxis,
                       b3MprSimplex_t *portal,
                       float *depth, b3Float4 *pdir, b3Float4 *pos)
{
    b3Float4 dir;
    b3MprSupport_t v4;
    unsigned long iterations;

	b3Float4 zero = b3MakeFloat4(0,0,0,0);
	b3Float4* b3mpr_vec3_origin = &zero;


    iterations = 1UL;
	for (int i=0;i<B3_MPR_MAX_ITERATIONS;i++)
    //while (1)
	{
        // compute portal direction and obtain next support point
        b3PortalDir(portal, &dir);
        
		 b3MprSupport(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&dir, &v4);


        // reached tolerance -> find penetration info
        if (portalReachTolerance(portal, &v4, &dir)
                || iterations ==B3_MPR_MAX_ITERATIONS)
		{
            *depth = b3MprVec3PointTriDist2(b3mpr_vec3_origin,&b3MprSimplexPoint(portal, 1)->v,&b3MprSimplexPoint(portal, 2)->v,&b3MprSimplexPoint(portal, 3)->v,pdir);
            *depth = B3_MPR_SQRT(*depth);
			
			if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))
			{
				
				*pdir = dir;
			} 
			b3MprVec3Normalize(pdir);
			
            // barycentric coordinates:
            b3FindPos(portal, pos);


            return;
        }

        b3ExpandPortal(portal, &v4);

        iterations++;
    }
}

B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal,float *depth, b3Float4 *dir, b3Float4 *pos)
{
    // Touching contact on portal's v1 - so depth is zero and direction
    // is unimportant and pos can be guessed
    *depth = 0.f;
    b3Float4 zero = b3MakeFloat4(0,0,0,0);
	b3Float4* b3mpr_vec3_origin = &zero;


	b3MprVec3Copy(dir, b3mpr_vec3_origin);

    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);
    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);
    b3MprVec3Scale(pos, 0.5);
}

B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal,
                              float *depth, b3Float4 *dir, b3Float4 *pos)
{
    
    // Origin lies on v0-v1 segment.
    // Depth is distance to v1, direction also and position must be
    // computed

    b3MprVec3Copy(pos, &b3MprSimplexPoint(portal, 1)->v1);
    b3MprVec3Add(pos, &b3MprSimplexPoint(portal, 1)->v2);
    b3MprVec3Scale(pos, 0.5f);

    
    b3MprVec3Copy(dir, &b3MprSimplexPoint(portal, 1)->v);
    *depth = B3_MPR_SQRT(b3MprVec3Len2(dir));
    b3MprVec3Normalize(dir);
}



inline int b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB,
					b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
					b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData, 
					b3ConstArray(b3Collidable_t)	cpuCollidables,
					b3ConstArray(b3Float4)	cpuVertices,
					__global b3Float4* sepAxis,
					__global int*	hasSepAxis,
					float *depthOut, b3Float4* dirOut, b3Float4* posOut)
{
	
	 b3MprSimplex_t portal;

	 
//	if (!hasSepAxis[pairIndex])
	//	return -1;
	
	hasSepAxis[pairIndex] = 0;
	 int res;

    // Phase 1: Portal discovery
    res = b3DiscoverPortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices,sepAxis,hasSepAxis, &portal);
	
	  
	//sepAxis[pairIndex] = *pdir;//or -dir?

	switch (res)
	{
	case 0:
		{
			// Phase 2: Portal refinement
		
			res = b3RefinePortal(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal);
			if (res < 0)
				return -1;

			// Phase 3. Penetration info
			b3FindPenetr(pairIndex,bodyIndexA,bodyIndexB,cpuBodyBuf,cpuConvexData,cpuCollidables,cpuVertices, sepAxis,&portal, depthOut, dirOut, posOut);
			hasSepAxis[pairIndex] = 1;
			sepAxis[pairIndex] = -*dirOut;
			break;
		}
	case 1:
		{
			 // Touching contact on portal's v1.
			b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);
			break;
		}
	case 2:
		{
			
			b3FindPenetrSegment( &portal, depthOut, dirOut, posOut);
			break;
		}
	default:
		{
			hasSepAxis[pairIndex]=0;
			//if (res < 0)
			//{
				// Origin isn't inside portal - no collision.
				return -1;
			//}
		}
	};
	
	return 0;
};



#endif //B3_MPR_PENETRATION_H