134 files changed, 17464 insertions, 8500 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index b2707e7f7c..73a62458c3 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -40,11 +40,9 @@ Files extracted from upstream source:
 ## bullet
 
 - Upstream: https://github.com/bulletphysics/bullet3
-- Version: git pre-2.90 (cd8cf7521cbb8b7808126a6adebd47bb83ea166a, 2020)
+- Version: 3.07 (e32fc59c88a3908876949c6f2665e8d091d987fa, 2020)
 - License: zlib
 
-Important: Synced with a pre-release version of bullet 2.90 from the master branch.
-
 Files extracted from upstream source:
 
 - src/* apart from CMakeLists.txt and premake4.lua files
@@ -341,7 +339,7 @@ changes are marked with `// -- GODOT --` comments.
 ## mbedtls
 
 - Upstream: https://tls.mbed.org/
-- Version: 2.16.8 (2020)
+- Version: 2.16.9 (2020)
 - License: Apache 2.0
 
 File extracted from upstream release tarball:
@@ -358,6 +356,21 @@ File extracted from upstream release tarball:
   for light bundling with core.
 
 
+## meshoptimizer
+
+- Upstream: https://github.com/zeux/meshoptimizer
+- Version: 0.15 (2020)
+- License: MIT
+
+Files extracted from upstream release tarball:
+
+- All files in `src/`.
+
+Important: Some files have Godot-made changes.
+They can be applied with the patch in the `patches` folder, but are meant to be superseded
+by upstream API changes.
+
+
 ## miniupnpc
 
 - Upstream: https://github.com/miniupnp/miniupnp/tree/master/miniupnpc
diff --git a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
index 6f2c5251a0..4938fa17af 100644
--- a/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
+++ b/thirdparty/bullet/Bullet3OpenCL/NarrowphaseCollision/b3OptimizedBvh.cpp
@@ -285,7 +285,6 @@ void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface, int
 				meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numfaces, indicestype, nodeSubPart);
 
 				curNodeSubPart = nodeSubPart;
-				b3Assert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
 			}
 			//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
 
@@ -293,7 +292,13 @@ void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface, int
 
 			for (int j = 2; j >= 0; j--)
 			{
-				int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
+				int graphicsindex = 0;  // defined fallback: the default case below only asserts, which may be compiled out
+				switch (indicestype) {  // read index j with the element width selected by indicestype
+					case PHY_INTEGER: graphicsindex = gfxbase[j]; break;
+					case PHY_SHORT: graphicsindex = ((unsigned short*)gfxbase)[j]; break;
+					case PHY_UCHAR: graphicsindex = ((unsigned char*)gfxbase)[j]; break;
+					default: b3Assert(0); break;  // unsupported index type
+				}
 				if (type == PHY_FLOAT)
 				{
 					float* graphicsbase = (float*)(vertexbase + graphicsindex * stride);
diff --git a/thirdparty/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.cpp b/thirdparty/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.cpp
index 145de62db3..f6c779a919 100644
--- a/thirdparty/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.cpp
+++ b/thirdparty/bullet/Bullet3Serialize/Bullet2FileLoader/b3File.cpp
@@ -851,12 +851,12 @@ void bFile::swapData(char *data, short type, int arraySize, bool ignoreEndianFla
 
 void bFile::safeSwapPtr(char *dst, const char *src)
 {
+	if (!src || !dst)
+		return;
+
 	int ptrFile = mFileDNA->getPointerSize();
 	int ptrMem = mMemoryDNA->getPointerSize();
 
-	if (!src && !dst)
-		return;
-
 	if (ptrFile == ptrMem)
 	{
 		memcpy(dst, src, ptrMem);
diff --git a/thirdparty/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp b/thirdparty/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp
index 4954e773e2..19f1737b73 100644
--- a/thirdparty/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp
+++ b/thirdparty/bullet/BulletCollision/BroadphaseCollision/btQuantizedBvh.cpp
@@ -346,8 +346,6 @@ void btQuantizedBvh::reportAabbOverlappingNodex(btNodeOverlapCallback* nodeCallb
 	}
 }
 
-int maxIterations = 0;
-
 void btQuantizedBvh::walkStacklessTree(btNodeOverlapCallback* nodeCallback, const btVector3& aabbMin, const btVector3& aabbMax) const
 {
 	btAssert(!m_useQuantization);
@@ -387,8 +385,6 @@ void btQuantizedBvh::walkStacklessTree(btNodeOverlapCallback* nodeCallback, cons
 			curIndex += escapeIndex;
 		}
 	}
-	if (maxIterations < walkIterations)
-		maxIterations = walkIterations;
 }
 
 /*
@@ -529,8 +525,6 @@ void btQuantizedBvh::walkStacklessTreeAgainstRay(btNodeOverlapCallback* nodeCall
 			curIndex += escapeIndex;
 		}
 	}
-	if (maxIterations < walkIterations)
-		maxIterations = walkIterations;
 }
 
 void btQuantizedBvh::walkStacklessQuantizedTreeAgainstRay(btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget, const btVector3& aabbMin, const btVector3& aabbMax, int startNodeIndex, int endNodeIndex) const
@@ -654,8 +648,6 @@ void btQuantizedBvh::walkStacklessQuantizedTreeAgainstRay(btNodeOverlapCallback*
 			curIndex += escapeIndex;
 		}
 	}
-	if (maxIterations < walkIterations)
-		maxIterations = walkIterations;
 }
 
 void btQuantizedBvh::walkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax, int startNodeIndex, int endNodeIndex) const
@@ -718,8 +710,6 @@ void btQuantizedBvh::walkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallb
 			curIndex += escapeIndex;
 		}
 	}
-	if (maxIterations < walkIterations)
-		maxIterations = walkIterations;
 }
 
 //This traversal can be called from Playstation 3 SPU
diff --git a/thirdparty/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h b/thirdparty/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h
index 85dc488c8c..e085c40892 100644
--- a/thirdparty/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h
+++ b/thirdparty/bullet/BulletCollision/CollisionDispatch/btCollisionObject.h
@@ -127,6 +127,7 @@ public:
 
 	enum CollisionFlags
 	{
+		CF_DYNAMIC_OBJECT = 0,
 		CF_STATIC_OBJECT = 1,
 		CF_KINEMATIC_OBJECT = 2,
 		CF_NO_CONTACT_RESPONSE = 4,
@@ -251,6 +252,16 @@ public:
 		m_checkCollideWith = m_objectsWithoutCollisionCheck.size() > 0;
 	}
 
+        int getNumObjectsWithoutCollision() const  // returns the size of m_objectsWithoutCollisionCheck
+	{
+		return m_objectsWithoutCollisionCheck.size();
+	}
+
+	const btCollisionObject* getObjectWithoutCollision(int index) const  // const-qualified like getNumObjectsWithoutCollision(); index is passed straight to the array
+	{
+		return m_objectsWithoutCollisionCheck[index];
+	}
+
 	virtual bool checkCollideWithOverride(const btCollisionObject* co) const
 	{
 		int index = m_objectsWithoutCollisionCheck.findLinearSearch(co);
diff --git a/thirdparty/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp b/thirdparty/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
index a4252c296a..a71700f58a 100644
--- a/thirdparty/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
+++ b/thirdparty/bullet/BulletCollision/CollisionDispatch/btInternalEdgeUtility.cpp
@@ -361,7 +361,13 @@ void btGenerateInternalEdgeInfo(btBvhTriangleMeshShape* trimeshShape, btTriangle
 
 			for (int j = 2; j >= 0; j--)
 			{
-				int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
+				int graphicsindex = 0;  // defined fallback: the default case below only asserts, which may be compiled out
+				switch (indicestype) {  // read index j with the element width selected by indicestype
+					case PHY_INTEGER: graphicsindex = gfxbase[j]; break;
+					case PHY_SHORT: graphicsindex = ((unsigned short*)gfxbase)[j]; break;
+					case PHY_UCHAR: graphicsindex = ((unsigned char*)gfxbase)[j]; break;
+					default: btAssert(0); break;  // unsupported index type
+				}
 				if (type == PHY_FLOAT)
 				{
 					float* graphicsbase = (float*)(vertexbase + graphicsindex * stride);
diff --git a/thirdparty/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp b/thirdparty/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp
index d663b3d6d6..c66ce58e3e 100644
--- a/thirdparty/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp
+++ b/thirdparty/bullet/BulletCollision/CollisionShapes/btBvhTriangleMeshShape.cpp
@@ -124,12 +124,17 @@ void btBvhTriangleMeshShape::performRaycast(btTriangleCallback* callback, const
 				nodeSubPart);
 
 			unsigned int* gfxbase = (unsigned int*)(indexbase + nodeTriangleIndex * indexstride);
-			btAssert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
 
 			const btVector3& meshScaling = m_meshInterface->getScaling();
 			for (int j = 2; j >= 0; j--)
 			{
-				int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
+				int graphicsindex = 0;  // defined fallback: the default case below only asserts, which may be compiled out
+				switch (indicestype) {  // read index j with the element width selected by indicestype
+					case PHY_INTEGER: graphicsindex = gfxbase[j]; break;
+					case PHY_SHORT: graphicsindex = ((unsigned short*)gfxbase)[j]; break;
+					case PHY_UCHAR: graphicsindex = ((unsigned char*)gfxbase)[j]; break;
+					default: btAssert(0); break;  // unsupported index type
+				}
 
 				if (type == PHY_FLOAT)
 				{
@@ -193,12 +198,17 @@ void btBvhTriangleMeshShape::performConvexcast(btTriangleCallback* callback, con
 				nodeSubPart);
 
 			unsigned int* gfxbase = (unsigned int*)(indexbase + nodeTriangleIndex * indexstride);
-			btAssert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
 
 			const btVector3& meshScaling = m_meshInterface->getScaling();
 			for (int j = 2; j >= 0; j--)
 			{
-				int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
+				int graphicsindex = 0;  // defined fallback: the default case below only asserts, which may be compiled out
+				switch (indicestype) {  // read index j with the element width selected by indicestype
+					case PHY_INTEGER: graphicsindex = gfxbase[j]; break;
+					case PHY_SHORT: graphicsindex = ((unsigned short*)gfxbase)[j]; break;
+					case PHY_UCHAR: graphicsindex = ((unsigned char*)gfxbase)[j]; break;
+					default: btAssert(0); break;  // unsupported index type
+				}
 
 				if (type == PHY_FLOAT)
 				{
diff --git a/thirdparty/bullet/BulletCollision/CollisionShapes/btCollisionShape.h b/thirdparty/bullet/BulletCollision/CollisionShapes/btCollisionShape.h
index c80e105a4d..16f9e0c77a 100644
--- a/thirdparty/bullet/BulletCollision/CollisionShapes/btCollisionShape.h
+++ b/thirdparty/bullet/BulletCollision/CollisionShapes/btCollisionShape.h
@@ -30,11 +30,12 @@ protected:
 	int m_shapeType;
 	void* m_userPointer;
 	int m_userIndex;
+	int m_userIndex2;
 
 public:
 	BT_DECLARE_ALIGNED_ALLOCATOR();
 
-	btCollisionShape() : m_shapeType(INVALID_SHAPE_PROXYTYPE), m_userPointer(0), m_userIndex(-1)
+	btCollisionShape() : m_shapeType(INVALID_SHAPE_PROXYTYPE), m_userPointer(0), m_userIndex(-1), m_userIndex2(-1)
 	{
 	}
 
@@ -137,6 +138,16 @@ public:
 		return m_userIndex;
 	}
 
+	void setUserIndex2(int index)  // stores index in m_userIndex2 (initialized to -1 by the constructor)
+	{
+		m_userIndex2 = index;
+	}
+
+	int getUserIndex2() const  // returns m_userIndex2; -1 unless setUserIndex2() was called
+	{
+		return m_userIndex2;
+	}
+
 	virtual int calculateSerializeBufferSize() const;
 
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
diff --git a/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp b/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
index 34e7926f17..cab6980b65 100644
--- a/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
+++ b/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.cpp
@@ -21,8 +21,7 @@ btHeightfieldTerrainShape::btHeightfieldTerrainShape(
 	int heightStickWidth, int heightStickLength, const void* heightfieldData,
 	btScalar heightScale, btScalar minHeight, btScalar maxHeight, int upAxis,
 	PHY_ScalarType hdt, bool flipQuadEdges)
-	:m_userIndex2(-1),
-	m_userValue3(0),
+	:m_userValue3(0),
 	m_triangleInfoMap(0)
 {
 	initialize(heightStickWidth, heightStickLength, heightfieldData,
@@ -31,8 +30,7 @@ btHeightfieldTerrainShape::btHeightfieldTerrainShape(
 }
 
 btHeightfieldTerrainShape::btHeightfieldTerrainShape(int heightStickWidth, int heightStickLength, const void* heightfieldData, btScalar maxHeight, int upAxis, bool useFloatData, bool flipQuadEdges)
-	:m_userIndex2(-1),
-	m_userValue3(0),
+	:	m_userValue3(0),
 	m_triangleInfoMap(0)
 {
 	// legacy constructor: support only float or unsigned char,
diff --git a/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h b/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
index 8dea98fc6b..2cf3c00721 100644
--- a/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
+++ b/thirdparty/bullet/BulletCollision/CollisionShapes/btHeightfieldTerrainShape.h
@@ -114,7 +114,7 @@ protected:
 	int m_vboundsGridLength;
 	int m_vboundsChunkSize;
 
-	int m_userIndex2;
+	
 	btScalar m_userValue3;
 
 	struct btTriangleInfoMap* m_triangleInfoMap;
@@ -192,14 +192,6 @@ public:
 	virtual const char* getName() const { return "HEIGHTFIELD"; }
 
 	
-	void setUserIndex2(int index)
-	{
-		m_userIndex2 = index;
-	}
-	int getUserIndex2() const
-	{
-		return m_userIndex2;
-	}
 	void setUserValue3(btScalar value)
 	{
 		m_userValue3 = value;
diff --git a/thirdparty/bullet/BulletCollision/CollisionShapes/btOptimizedBvh.cpp b/thirdparty/bullet/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
index 687399e0a9..863ea6d6ac 100644
--- a/thirdparty/bullet/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
+++ b/thirdparty/bullet/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
@@ -286,7 +286,6 @@ void btOptimizedBvh::updateBvhNodes(btStridingMeshInterface* meshInterface, int
 				meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numfaces, indicestype, nodeSubPart);
 
 				curNodeSubPart = nodeSubPart;
-				btAssert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
 			}
 			//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
 
@@ -294,7 +293,13 @@ void btOptimizedBvh::updateBvhNodes(btStridingMeshInterface* meshInterface, int
 
 			for (int j = 2; j >= 0; j--)
 			{
-				int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
+				int graphicsindex = 0;  // defined fallback: the default case below only asserts, which may be compiled out
+				switch (indicestype) {  // read index j with the element width selected by indicestype
+					case PHY_INTEGER: graphicsindex = gfxbase[j]; break;
+					case PHY_SHORT: graphicsindex = ((unsigned short*)gfxbase)[j]; break;
+					case PHY_UCHAR: graphicsindex = ((unsigned char*)gfxbase)[j]; break;
+					default: btAssert(0); break;  // unsupported index type
+				}
 				if (type == PHY_FLOAT)
 				{
 					float* graphicsbase = (float*)(vertexbase + graphicsindex * stride);
diff --git a/thirdparty/bullet/BulletCollision/CollisionShapes/btSdfCollisionShape.cpp b/thirdparty/bullet/BulletCollision/CollisionShapes/btSdfCollisionShape.cpp
index 4a95dbea4f..23c95ad3ff 100644
--- a/thirdparty/bullet/BulletCollision/CollisionShapes/btSdfCollisionShape.cpp
+++ b/thirdparty/bullet/BulletCollision/CollisionShapes/btSdfCollisionShape.cpp
@@ -2,8 +2,11 @@
 #include "btMiniSDF.h"
 #include "LinearMath/btAabbUtil2.h"
 
-struct btSdfCollisionShapeInternalData
+ATTRIBUTE_ALIGNED16(struct)
+btSdfCollisionShapeInternalData
 {
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
 	btVector3 m_localScaling;
 	btScalar m_margin;
 	btMiniSDF m_sdf;
diff --git a/thirdparty/bullet/BulletCollision/Gimpact/btGImpactShape.h b/thirdparty/bullet/BulletCollision/Gimpact/btGImpactShape.h
index 5b85e87041..cc91079579 100644
--- a/thirdparty/bullet/BulletCollision/Gimpact/btGImpactShape.h
+++ b/thirdparty/bullet/BulletCollision/Gimpact/btGImpactShape.h
@@ -623,13 +623,21 @@ public:
 				i1 = s_indices[1];
 				i2 = s_indices[2];
 			}
-			else
+			else if (indicestype == PHY_INTEGER)
 			{
 				unsigned int* i_indices = (unsigned int*)(indexbase + face_index * indexstride);
 				i0 = i_indices[0];
 				i1 = i_indices[1];
 				i2 = i_indices[2];
 			}
+			else
+			{
+				btAssert(indicestype == PHY_UCHAR);
+				unsigned char* i_indices = (unsigned char*)(indexbase + face_index * indexstride);
+				i0 = i_indices[0];
+				i1 = i_indices[1];
+				i2 = i_indices[2];
+			}
 		}
 
 		SIMD_FORCE_INLINE void get_vertex(unsigned int vertex_index, btVector3& vertex) const
diff --git a/thirdparty/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp b/thirdparty/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp
index 45d1817135..7d53f8624a 100644
--- a/thirdparty/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp
+++ b/thirdparty/bullet/BulletCollision/NarrowPhaseCollision/btGjkEpa2.cpp
@@ -1049,7 +1049,8 @@ btScalar btGjkEpaSolver2::SignedDistance(const btVector3& position,
 		const btScalar length = delta.length();
 		results.normal = delta / length;
 		results.witnesses[0] += results.normal * margin;
-		return (length - margin);
+		results.distance = length - margin;
+		return results.distance;
 	}
 	else
 	{
diff --git a/thirdparty/bullet/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp b/thirdparty/bullet/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
index 27f76b8425..0f5ed1c2ce 100644
--- a/thirdparty/bullet/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
+++ b/thirdparty/bullet/BulletDynamics/ConstraintSolver/btBatchedConstraints.cpp
@@ -852,7 +852,7 @@ static void setupSpatialGridBatchesMt(
 		memHelper.addChunk((void**)&constraintRowBatchIds, sizeof(int) * numConstraintRows);
 		size_t scratchSize = memHelper.getSizeToAllocate();
 		// if we need to reallocate
-		if (scratchMemory->capacity() < scratchSize)
+		if (static_cast<size_t>(scratchMemory->capacity()) < scratchSize)
 		{
 			// allocate 6.25% extra to avoid repeated reallocs
 			scratchMemory->reserve(scratchSize + scratchSize / 16);
diff --git a/thirdparty/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h b/thirdparty/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h
index 4356c12abf..3316403a87 100644
--- a/thirdparty/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h
+++ b/thirdparty/bullet/BulletDynamics/ConstraintSolver/btContactSolverInfo.h
@@ -47,6 +47,8 @@ struct btContactSolverInfoData
 	btScalar m_erp;          //error reduction for non-contact constraints
 	btScalar m_erp2;         //error reduction for contact constraints
 	btScalar m_deformable_erp;          //error reduction for deformable constraints
+	btScalar m_deformable_cfm;          //constraint force mixing for deformable constraints
+	btScalar m_deformable_maxErrorReduction; // maxErrorReduction for deformable contact
 	btScalar m_globalCfm;    //constraint force mixing for contacts and non-contacts
 	btScalar m_frictionERP;  //error reduction for friction constraints
 	btScalar m_frictionCFM;  //constraint force mixing for friction constraints
@@ -83,7 +85,9 @@ struct btContactSolverInfo : public btContactSolverInfoData
 		m_numIterations = 10;
 		m_erp = btScalar(0.2);
 		m_erp2 = btScalar(0.2);
-		m_deformable_erp = btScalar(0.1);
+		m_deformable_erp = btScalar(0.06);
+		m_deformable_cfm = btScalar(0.01);
+		m_deformable_maxErrorReduction = btScalar(0.1);
 		m_globalCfm = btScalar(0.);
 		m_frictionERP = btScalar(0.2);  //positional friction 'anchors' are disabled by default
 		m_frictionCFM = btScalar(0.);
diff --git a/thirdparty/bullet/BulletDynamics/Dynamics/btRigidBody.h b/thirdparty/bullet/BulletDynamics/Dynamics/btRigidBody.h
index 943d724cce..7442dd1e6a 100644
--- a/thirdparty/bullet/BulletDynamics/Dynamics/btRigidBody.h
+++ b/thirdparty/bullet/BulletDynamics/Dynamics/btRigidBody.h
@@ -356,12 +356,12 @@ public:
         }
     }
     
-    btVector3 getPushVelocity()
+    btVector3 getPushVelocity() const
     {
         return m_pushVelocity;
     }
     
-    btVector3 getTurnVelocity()
+    btVector3 getTurnVelocity() const
     {
         return m_turnVelocity;
     }
@@ -465,6 +465,12 @@ public:
 		//for kinematic objects, we could also use use:
 		//		return 	(m_worldTransform(rel_pos) - m_interpolationWorldTransform(rel_pos)) / m_kinematicTimeStep;
 	}
+    
+    btVector3 getPushVelocityInLocalPoint(const btVector3& rel_pos) const
+    {
+        //push velocity at offset rel_pos: m_pushVelocity plus m_turnVelocity crossed with rel_pos
+        return m_pushVelocity + m_turnVelocity.cross(rel_pos);
+    }
 
 	void translate(const btVector3& v)
 	{
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.cpp
index a1d5bb9ca8..bec8c6530d 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.cpp
@@ -344,6 +344,8 @@ void btMultiBody::finalizeMultiDof()
 {
 	m_deltaV.resize(0);
 	m_deltaV.resize(6 + m_dofCount);
+    m_splitV.resize(0);
+    m_splitV.resize(6 + m_dofCount);
 	m_realBuf.resize(6 + m_dofCount + m_dofCount * m_dofCount + 6 + m_dofCount);  //m_dofCount for joint-space vels + m_dofCount^2 for "D" matrices + delta-pos vector (6 base "vels" + joint "vels")
 	m_vectorBuf.resize(2 * m_dofCount);                                           //two 3-vectors (i.e. one six-vector) for each system dof	("h" matrices)
 	m_matrixBuf.resize(m_links.size() + 1);
@@ -671,6 +673,30 @@ btScalar *btMultiBody::getJointTorqueMultiDof(int i)
 	return &m_links[i].m_jointTorque[0];
 }
 
+bool btMultiBody::hasFixedBase() const  // true if m_fixedBase is set, or a base collider exists and reports isStaticObject()
+{
+	return m_fixedBase || (getBaseCollider() && getBaseCollider()->isStaticObject());
+}
+
+bool btMultiBody::isBaseStaticOrKinematic() const  // true if m_fixedBase is set, or a base collider exists and reports isStaticOrKinematicObject()
+{
+	return m_fixedBase || (getBaseCollider() && getBaseCollider()->isStaticOrKinematicObject());
+}
+
+bool btMultiBody::isBaseKinematic() const  // true only when a base collider exists and reports isKinematicObject(); m_fixedBase is not consulted
+{
+	return getBaseCollider() && getBaseCollider()->isKinematicObject();
+}
+
+void btMultiBody::setBaseDynamicType(int dynamicType)  // rewrites the base collider's static/kinematic flag bits; does nothing when there is no base collider
+{
+	if(getBaseCollider()) {
+		int oldFlags = getBaseCollider()->getCollisionFlags();
+		oldFlags &= ~(btCollisionObject::CF_STATIC_OBJECT | btCollisionObject::CF_KINEMATIC_OBJECT);  // clear the static and kinematic bits, keep all others
+		getBaseCollider()->setCollisionFlags(oldFlags | dynamicType);  // dynamicType is OR-ed in unmasked; presumably one of CF_DYNAMIC_OBJECT / CF_STATIC_OBJECT / CF_KINEMATIC_OBJECT - confirm with callers
+	}
+}
+
 inline btMatrix3x3 outerProduct(const btVector3 &v0, const btVector3 &v1)  //renamed it from vecMulVecTranspose (http://en.wikipedia.org/wiki/Outer_product); maybe it should be moved to btVector3 like dot and cross?
 {
 	btVector3 row0 = btVector3(
@@ -796,7 +822,7 @@ void btMultiBody::computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar
 	//create the vector of spatial velocity of the base by transforming global-coor linear and angular velocities into base-local coordinates
 	spatVel[0].setVector(rot_from_parent[0] * base_omega, rot_from_parent[0] * base_vel);
 
-	if (m_fixedBase)
+	if (isBaseStaticOrKinematic())
 	{
 		zeroAccSpatFrc[0].setZero();
 	}
@@ -872,31 +898,53 @@ void btMultiBody::computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar
 
 		// calculate zhat_i^A
 		//
-		//external forces
-		btVector3 linkAppliedForce = isConstraintPass ? m_links[i].m_appliedConstraintForce : m_links[i].m_appliedForce;
-		btVector3 linkAppliedTorque = isConstraintPass ? m_links[i].m_appliedConstraintTorque : m_links[i].m_appliedTorque;
+		if (isLinkAndAllAncestorsKinematic(i))
+		{
+			zeroAccSpatFrc[i].setZero();  // NOTE(review): the else branch addresses link i as zeroAccSpatFrc[i + 1] - confirm that index i (not i + 1) is intended here
+		}
+		else{
+			//external forces
+			btVector3 linkAppliedForce = isConstraintPass ? m_links[i].m_appliedConstraintForce : m_links[i].m_appliedForce;
+			btVector3 linkAppliedTorque = isConstraintPass ? m_links[i].m_appliedConstraintTorque : m_links[i].m_appliedTorque;
 
-		zeroAccSpatFrc[i + 1].setVector(-(rot_from_world[i + 1] * linkAppliedTorque), -(rot_from_world[i + 1] * linkAppliedForce));
+			zeroAccSpatFrc[i + 1].setVector(-(rot_from_world[i + 1] * linkAppliedTorque), -(rot_from_world[i + 1] * linkAppliedForce));
 
 #if 0	
-		{
+			{
 
-			b3Printf("stepVelocitiesMultiDof zeroAccSpatFrc[%d] linear:%f,%f,%f, angular:%f,%f,%f",
-			i+1,
-			zeroAccSpatFrc[i+1].m_topVec[0],
-			zeroAccSpatFrc[i+1].m_topVec[1],
-			zeroAccSpatFrc[i+1].m_topVec[2],
+				b3Printf("stepVelocitiesMultiDof zeroAccSpatFrc[%d] linear:%f,%f,%f, angular:%f,%f,%f",
+				i+1,
+				zeroAccSpatFrc[i+1].m_topVec[0],
+				zeroAccSpatFrc[i+1].m_topVec[1],
+				zeroAccSpatFrc[i+1].m_topVec[2],
 
-			zeroAccSpatFrc[i+1].m_bottomVec[0],
-			zeroAccSpatFrc[i+1].m_bottomVec[1],
-			zeroAccSpatFrc[i+1].m_bottomVec[2]);
-		}
+				zeroAccSpatFrc[i+1].m_bottomVec[0],
+				zeroAccSpatFrc[i+1].m_bottomVec[1],
+				zeroAccSpatFrc[i+1].m_bottomVec[2]);
+			}
 #endif
-		//
-		//adding damping terms (only)
-		btScalar linDampMult = 1., angDampMult = 1.;
-		zeroAccSpatFrc[i + 1].addVector(angDampMult * m_links[i].m_inertiaLocal * spatVel[i + 1].getAngular() * (DAMPING_K1_ANGULAR + DAMPING_K2_ANGULAR * spatVel[i + 1].getAngular().safeNorm()),
-										linDampMult * m_links[i].m_mass * spatVel[i + 1].getLinear() * (DAMPING_K1_LINEAR + DAMPING_K2_LINEAR * spatVel[i + 1].getLinear().safeNorm()));
+			//
+			//adding damping terms (only)
+			btScalar linDampMult = 1., angDampMult = 1.;
+			zeroAccSpatFrc[i + 1].addVector(angDampMult * m_links[i].m_inertiaLocal * spatVel[i + 1].getAngular() * (DAMPING_K1_ANGULAR + DAMPING_K2_ANGULAR * spatVel[i + 1].getAngular().safeNorm()),
+											linDampMult * m_links[i].m_mass * spatVel[i + 1].getLinear() * (DAMPING_K1_LINEAR + DAMPING_K2_LINEAR * spatVel[i + 1].getLinear().safeNorm()));
+			//p += vhat x Ihat vhat - done in a simpler way
+			if (m_useGyroTerm)
+				zeroAccSpatFrc[i + 1].addAngular(spatVel[i + 1].getAngular().cross(m_links[i].m_inertiaLocal * spatVel[i + 1].getAngular()));
+			//
+			zeroAccSpatFrc[i + 1].addLinear(m_links[i].m_mass * spatVel[i + 1].getAngular().cross(spatVel[i + 1].getLinear()));
+			//
+			//btVector3 temp = m_links[i].m_mass * spatVel[i+1].getAngular().cross(spatVel[i+1].getLinear());
+			////clamp parent's omega
+			//btScalar parOmegaMod = temp.length();
+			//btScalar parOmegaModMax = 1000;
+			//if(parOmegaMod > parOmegaModMax)
+			//	temp *= parOmegaModMax / parOmegaMod;
+			//zeroAccSpatFrc[i+1].addLinear(temp);
+			//printf("|zeroAccSpatFrc[%d]| = %.4f\n", i+1, temp.length());
+			//temp = spatCoriolisAcc[i].getLinear();
+			//printf("|spatCoriolisAcc[%d]| = %.4f\n", i+1, temp.length());
+		}
 
 		// calculate Ihat_i^A
 		//init the spatial AB inertia (it has the simple form thanks to choosing local body frames origins at their COMs)
@@ -909,22 +957,6 @@ void btMultiBody::computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar
 									 btMatrix3x3(m_links[i].m_inertiaLocal[0], 0, 0,
 												 0, m_links[i].m_inertiaLocal[1], 0,
 												 0, 0, m_links[i].m_inertiaLocal[2]));
-		//
-		//p += vhat x Ihat vhat - done in a simpler way
-		if (m_useGyroTerm)
-			zeroAccSpatFrc[i + 1].addAngular(spatVel[i + 1].getAngular().cross(m_links[i].m_inertiaLocal * spatVel[i + 1].getAngular()));
-		//
-		zeroAccSpatFrc[i + 1].addLinear(m_links[i].m_mass * spatVel[i + 1].getAngular().cross(spatVel[i + 1].getLinear()));
-		//btVector3 temp = m_links[i].m_mass * spatVel[i+1].getAngular().cross(spatVel[i+1].getLinear());
-		////clamp parent's omega
-		//btScalar parOmegaMod = temp.length();
-		//btScalar parOmegaModMax = 1000;
-		//if(parOmegaMod > parOmegaModMax)
-		//	temp *= parOmegaModMax / parOmegaMod;
-		//zeroAccSpatFrc[i+1].addLinear(temp);
-		//printf("|zeroAccSpatFrc[%d]| = %.4f\n", i+1, temp.length());
-		//temp = spatCoriolisAcc[i].getLinear();
-		//printf("|spatCoriolisAcc[%d]| = %.4f\n", i+1, temp.length());
 
 		//printf("w[%d] = [%.4f %.4f %.4f]\n", i, vel_top_angular[i+1].x(), vel_top_angular[i+1].y(), vel_top_angular[i+1].z());
 		//printf("v[%d] = [%.4f %.4f %.4f]\n", i, vel_bottom_linear[i+1].x(), vel_bottom_linear[i+1].y(), vel_bottom_linear[i+1].z());
@@ -935,6 +967,8 @@ void btMultiBody::computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar
 	// (part of TreeForwardDynamics in Mirtich.)
 	for (int i = num_links - 1; i >= 0; --i)
 	{
+		if(isLinkAndAllAncestorsKinematic(i))
+			continue;
 		const int parent = m_links[i].m_parent;
 		fromParent.m_rotMat = rot_from_parent[i + 1];
 		fromParent.m_trnVec = m_links[i].m_cachedRVector;
@@ -1047,7 +1081,7 @@ void btMultiBody::computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar
 	// Second 'upward' loop
 	// (part of TreeForwardDynamics in Mirtich)
 
-	if (m_fixedBase)
+	if (isBaseStaticOrKinematic())
 	{
 		spatAcc[0].setZero();
 	}
@@ -1081,21 +1115,23 @@ void btMultiBody::computeAccelerationsArticulatedBodyAlgorithmMultiDof(btScalar
 
 		fromParent.transform(spatAcc[parent + 1], spatAcc[i + 1]);
 
-		for (int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+		if(!isLinkAndAllAncestorsKinematic(i))
 		{
-			const btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
-			//
-			Y_minus_hT_a[dof] = Y[m_links[i].m_dofOffset + dof] - spatAcc[i + 1].dot(hDof);
-		}
-
-		btScalar *invDi = &invD[m_links[i].m_dofOffset * m_links[i].m_dofOffset];
-		//D^{-1} * (Y - h^{T}*apar)
-		mulMatrix(invDi, Y_minus_hT_a, m_links[i].m_dofCount, m_links[i].m_dofCount, m_links[i].m_dofCount, 1, &joint_accel[m_links[i].m_dofOffset]);
+			for (int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+			{
+				const btSpatialForceVector &hDof = h[m_links[i].m_dofOffset + dof];
+				//
+				Y_minus_hT_a[dof] = Y[m_links[i].m_dofOffset + dof] - spatAcc[i + 1].dot(hDof);
+			}
+			btScalar *invDi = &invD[m_links[i].m_dofOffset * m_links[i].m_dofOffset];
+			//D^{-1} * (Y - h^{T}*apar)
+			mulMatrix(invDi, Y_minus_hT_a, m_links[i].m_dofCount, m_links[i].m_dofCount, m_links[i].m_dofCount, 1, &joint_accel[m_links[i].m_dofOffset]);
 
-		spatAcc[i + 1] += spatCoriolisAcc[i];
+			spatAcc[i + 1] += spatCoriolisAcc[i];
 
-		for (int dof = 0; dof < m_links[i].m_dofCount; ++dof)
-			spatAcc[i + 1] += m_links[i].m_axes[dof] * joint_accel[m_links[i].m_dofOffset + dof];
+			for (int dof = 0; dof < m_links[i].m_dofCount; ++dof)
+				spatAcc[i + 1] += m_links[i].m_axes[dof] * joint_accel[m_links[i].m_dofOffset + dof];
+		}
 
 		if (m_links[i].m_jointFeedback)
 		{
@@ -1432,7 +1468,7 @@ void btMultiBody::calcAccelerationDeltasMultiDof(const btScalar *force, btScalar
 
 	// Fill in zero_acc
 	// -- set to force/torque on the base, zero otherwise
-	if (m_fixedBase)
+	if (isBaseStaticOrKinematic())
 	{
 		zeroAccSpatFrc[0].setZero();
 	}
@@ -1451,6 +1487,8 @@ void btMultiBody::calcAccelerationDeltasMultiDof(const btScalar *force, btScalar
 	// (part of TreeForwardDynamics in Mirtich.)
 	for (int i = num_links - 1; i >= 0; --i)
 	{
+		if(isLinkAndAllAncestorsKinematic(i))
+			continue;
 		const int parent = m_links[i].m_parent;
 		fromParent.m_rotMat = rot_from_parent[i + 1];
 		fromParent.m_trnVec = m_links[i].m_cachedRVector;
@@ -1494,7 +1532,7 @@ void btMultiBody::calcAccelerationDeltasMultiDof(const btScalar *force, btScalar
 	// Second 'upward' loop
 	// (part of TreeForwardDynamics in Mirtich)
 
-	if (m_fixedBase)
+	if (isBaseStaticOrKinematic())
 	{
 		spatAcc[0].setZero();
 	}
@@ -1507,6 +1545,8 @@ void btMultiBody::calcAccelerationDeltasMultiDof(const btScalar *force, btScalar
 	// now do the loop over the m_links
 	for (int i = 0; i < num_links; ++i)
 	{
+		if(isLinkAndAllAncestorsKinematic(i))
+			continue;
 		const int parent = m_links[i].m_parent;
 		fromParent.m_rotMat = rot_from_parent[i + 1];
 		fromParent.m_trnVec = m_links[i].m_cachedRVector;
@@ -1550,23 +1590,26 @@ void btMultiBody::calcAccelerationDeltasMultiDof(const btScalar *force, btScalar
 void btMultiBody::predictPositionsMultiDof(btScalar dt)
 {
     int num_links = getNumLinks();
-    // step position by adding dt * velocity
-    //btVector3 v = getBaseVel();
-    //m_basePos += dt * v;
-    //
-    btScalar *pBasePos;
-    btScalar *pBaseVel = &m_realBuf[3];  //note: the !pqd case assumes m_realBuf holds with base velocity at 3,4,5 (should be wrapped for safety)
-    
-    // reset to current position
-    for (int i = 0; i < 3; ++i)
-    {
-        m_basePos_interpolate[i] = m_basePos[i];
-    }
-    pBasePos = m_basePos_interpolate;
+		if(!isBaseKinematic())
+		{
+      // step position by adding dt * velocity
+      //btVector3 v = getBaseVel();
+      //m_basePos += dt * v;
+      //
+      btScalar *pBasePos;
+      btScalar *pBaseVel = &m_realBuf[3];  //note: the !pqd case assumes m_realBuf holds with base velocity at 3,4,5 (should be wrapped for safety)
     
-    pBasePos[0] += dt * pBaseVel[0];
-    pBasePos[1] += dt * pBaseVel[1];
-    pBasePos[2] += dt * pBaseVel[2];
+    	// reset to current position
+    	for (int i = 0; i < 3; ++i)
+    	{
+    	    m_basePos_interpolate[i] = m_basePos[i];
+    	}
+    	pBasePos = m_basePos_interpolate;
+    	
+    	pBasePos[0] += dt * pBaseVel[0];
+    	pBasePos[1] += dt * pBaseVel[1];
+    	pBasePos[2] += dt * pBaseVel[2];
+		}
     
     ///////////////////////////////
     //local functor for quaternion integration (to avoid error prone redundancy)
@@ -1617,26 +1660,29 @@ void btMultiBody::predictPositionsMultiDof(btScalar dt)
     
     //pQuatUpdateFun(getBaseOmega(), m_baseQuat, true, dt);
     //
-    btScalar *pBaseQuat;
-
-    // reset to current orientation
-    for (int i = 0; i < 4; ++i)
-    {
-        m_baseQuat_interpolate[i] = m_baseQuat[i];
-    }
-    pBaseQuat = m_baseQuat_interpolate;
+		if(!isBaseKinematic())
+		{
+        btScalar *pBaseQuat;
 
-    btScalar *pBaseOmega = &m_realBuf[0];  //note: the !pqd case assumes m_realBuf starts with base omega (should be wrapped for safety)
-    //
-    btQuaternion baseQuat;
-    baseQuat.setValue(pBaseQuat[0], pBaseQuat[1], pBaseQuat[2], pBaseQuat[3]);
-    btVector3 baseOmega;
-    baseOmega.setValue(pBaseOmega[0], pBaseOmega[1], pBaseOmega[2]);
-    pQuatUpdateFun(baseOmega, baseQuat, true, dt);
-    pBaseQuat[0] = baseQuat.x();
-    pBaseQuat[1] = baseQuat.y();
-    pBaseQuat[2] = baseQuat.z();
-    pBaseQuat[3] = baseQuat.w();
+        // reset to current orientation
+        for (int i = 0; i < 4; ++i)
+        {
+            m_baseQuat_interpolate[i] = m_baseQuat[i];
+        }
+        pBaseQuat = m_baseQuat_interpolate;
+
+        btScalar *pBaseOmega = &m_realBuf[0];  //note: the !pqd case assumes m_realBuf starts with base omega (should be wrapped for safety)
+        //
+        btQuaternion baseQuat;
+        baseQuat.setValue(pBaseQuat[0], pBaseQuat[1], pBaseQuat[2], pBaseQuat[3]);
+        btVector3 baseOmega;
+        baseOmega.setValue(pBaseOmega[0], pBaseOmega[1], pBaseOmega[2]);
+        pQuatUpdateFun(baseOmega, baseQuat, true, dt);
+        pBaseQuat[0] = baseQuat.x();
+        pBaseQuat[1] = baseQuat.y();
+        pBaseQuat[2] = baseQuat.z();
+        pBaseQuat[3] = baseQuat.w();
+		}
 
     // Finally we can update m_jointPos for each of the m_links
     for (int i = 0; i < num_links; ++i)
@@ -1644,55 +1690,88 @@ void btMultiBody::predictPositionsMultiDof(btScalar dt)
         btScalar *pJointPos;
         pJointPos = &m_links[i].m_jointPos_interpolate[0];
         
-        btScalar *pJointVel = getJointVelMultiDof(i);
-        
-        switch (m_links[i].m_jointType)
-        {
-            case btMultibodyLink::ePrismatic:
-            case btMultibodyLink::eRevolute:
-            {
-                //reset to current pos
-                pJointPos[0] = m_links[i].m_jointPos[0];
-                btScalar jointVel = pJointVel[0];
-                pJointPos[0] += dt * jointVel;
-                break;
-            }
-            case btMultibodyLink::eSpherical:
-            {
-                //reset to current pos
-
-                for (int j = 0; j < 4; ++j)
+        if (m_links[i].m_collider && m_links[i].m_collider->isStaticOrKinematic()) 
+		{
+            switch (m_links[i].m_jointType) 
+						{
+                case btMultibodyLink::ePrismatic:
+                case btMultibodyLink::eRevolute:
                 {
-                    pJointPos[j] = m_links[i].m_jointPos[j];
+                    pJointPos[0] = m_links[i].m_jointPos[0];
+                    break;
                 }
-                
-                btVector3 jointVel;
-                jointVel.setValue(pJointVel[0], pJointVel[1], pJointVel[2]);
-                btQuaternion jointOri;
-                jointOri.setValue(pJointPos[0], pJointPos[1], pJointPos[2], pJointPos[3]);
-                pQuatUpdateFun(jointVel, jointOri, false, dt);
-                pJointPos[0] = jointOri.x();
-                pJointPos[1] = jointOri.y();
-                pJointPos[2] = jointOri.z();
-                pJointPos[3] = jointOri.w();
-                break;
-            }
-            case btMultibodyLink::ePlanar:
-            {
-                for (int j = 0; j < 3; ++j)
+                case btMultibodyLink::eSpherical:
                 {
-                    pJointPos[j] = m_links[i].m_jointPos[j];
+                    for (int j = 0; j < 4; ++j)
+                    {
+                        pJointPos[j] = m_links[i].m_jointPos[j];
+                    }
+                    break;
                 }
-                pJointPos[0] += dt * getJointVelMultiDof(i)[0];
-                
-                btVector3 q0_coors_qd1qd2 = getJointVelMultiDof(i)[1] * m_links[i].getAxisBottom(1) + getJointVelMultiDof(i)[2] * m_links[i].getAxisBottom(2);
-                btVector3 no_q0_coors_qd1qd2 = quatRotate(btQuaternion(m_links[i].getAxisTop(0), pJointPos[0]), q0_coors_qd1qd2);
-                pJointPos[1] += m_links[i].getAxisBottom(1).dot(no_q0_coors_qd1qd2) * dt;
-                pJointPos[2] += m_links[i].getAxisBottom(2).dot(no_q0_coors_qd1qd2) * dt;
-                break;
+                case btMultibodyLink::ePlanar:
+                {
+                    for (int j = 0; j < 3; ++j)
+                    {
+                        pJointPos[j] = m_links[i].m_jointPos[j];
+                    }
+                    break;
+                }
+                default:
+                   break;
             }
-            default:
+        }
+        else
+        {
+            btScalar *pJointVel = getJointVelMultiDof(i); 
+
+            switch (m_links[i].m_jointType)
             {
+                case btMultibodyLink::ePrismatic:
+                case btMultibodyLink::eRevolute:
+                {
+                    //reset to current pos
+                    pJointPos[0] = m_links[i].m_jointPos[0];
+                    btScalar jointVel = pJointVel[0];
+                    pJointPos[0] += dt * jointVel;
+                    break;
+                }
+                case btMultibodyLink::eSpherical:
+                {
+                    //reset to current pos
+
+                    for (int j = 0; j < 4; ++j)
+                    {
+                        pJointPos[j] = m_links[i].m_jointPos[j];
+                    }
+                    
+                    btVector3 jointVel;
+                    jointVel.setValue(pJointVel[0], pJointVel[1], pJointVel[2]);
+                    btQuaternion jointOri;
+                    jointOri.setValue(pJointPos[0], pJointPos[1], pJointPos[2], pJointPos[3]);
+                    pQuatUpdateFun(jointVel, jointOri, false, dt);
+                    pJointPos[0] = jointOri.x();
+                    pJointPos[1] = jointOri.y();
+                    pJointPos[2] = jointOri.z();
+                    pJointPos[3] = jointOri.w();
+                    break;
+                }
+                case btMultibodyLink::ePlanar:
+                {
+                    for (int j = 0; j < 3; ++j)
+                    {
+                        pJointPos[j] = m_links[i].m_jointPos[j];
+                    }
+                    pJointPos[0] += dt * getJointVelMultiDof(i)[0];
+                    
+                    btVector3 q0_coors_qd1qd2 = getJointVelMultiDof(i)[1] * m_links[i].getAxisBottom(1) + getJointVelMultiDof(i)[2] * m_links[i].getAxisBottom(2);
+                    btVector3 no_q0_coors_qd1qd2 = quatRotate(btQuaternion(m_links[i].getAxisTop(0), pJointPos[0]), q0_coors_qd1qd2);
+                    pJointPos[1] += m_links[i].getAxisBottom(1).dot(no_q0_coors_qd1qd2) * dt;
+                    pJointPos[2] += m_links[i].getAxisBottom(2).dot(no_q0_coors_qd1qd2) * dt;
+                    break;
+                }
+                default:
+                {
+                }
             }
         }
         
@@ -1703,16 +1782,19 @@ void btMultiBody::predictPositionsMultiDof(btScalar dt)
 void btMultiBody::stepPositionsMultiDof(btScalar dt, btScalar *pq, btScalar *pqd)
 {
 	int num_links = getNumLinks();
-	// step position by adding dt * velocity
-	//btVector3 v = getBaseVel();
-	//m_basePos += dt * v;
-	//
-    btScalar *pBasePos = (pq ? &pq[4] : m_basePos);
-    btScalar *pBaseVel = (pqd ? &pqd[3] : &m_realBuf[3]);  //note: the !pqd case assumes m_realBuf holds with base velocity at 3,4,5 (should be wrapped for safety)
-    
-	pBasePos[0] += dt * pBaseVel[0];
-	pBasePos[1] += dt * pBaseVel[1];
-	pBasePos[2] += dt * pBaseVel[2];
+	if(!isBaseKinematic())
+	{
+		// step position by adding dt * velocity
+		//btVector3 v = getBaseVel();
+		//m_basePos += dt * v;
+		//
+  	  btScalar *pBasePos = (pq ? &pq[4] : m_basePos);
+  	  btScalar *pBaseVel = (pqd ? &pqd[3] : &m_realBuf[3]);  //note: the !pqd case assumes m_realBuf holds with base velocity at 3,4,5 (should be wrapped for safety)
+  	  
+		pBasePos[0] += dt * pBaseVel[0];
+		pBasePos[1] += dt * pBaseVel[1];
+		pBasePos[2] += dt * pBaseVel[2];
+	}
 
 	///////////////////////////////
 	//local functor for quaternion integration (to avoid error prone redundancy)
@@ -1763,22 +1845,25 @@ void btMultiBody::stepPositionsMultiDof(btScalar dt, btScalar *pq, btScalar *pqd
 
 	//pQuatUpdateFun(getBaseOmega(), m_baseQuat, true, dt);
 	//
-    btScalar *pBaseQuat = pq ? pq : m_baseQuat;
-	btScalar *pBaseOmega = pqd ? pqd : &m_realBuf[0];  //note: the !pqd case assumes m_realBuf starts with base omega (should be wrapped for safety)
-	//
-	btQuaternion baseQuat;
-	baseQuat.setValue(pBaseQuat[0], pBaseQuat[1], pBaseQuat[2], pBaseQuat[3]);
-	btVector3 baseOmega;
-	baseOmega.setValue(pBaseOmega[0], pBaseOmega[1], pBaseOmega[2]);
-	pQuatUpdateFun(baseOmega, baseQuat, true, dt);
-	pBaseQuat[0] = baseQuat.x();
-	pBaseQuat[1] = baseQuat.y();
-	pBaseQuat[2] = baseQuat.z();
-	pBaseQuat[3] = baseQuat.w();
-
-	//printf("pBaseOmega = %.4f %.4f %.4f\n", pBaseOmega->x(), pBaseOmega->y(), pBaseOmega->z());
-	//printf("pBaseVel = %.4f %.4f %.4f\n", pBaseVel->x(), pBaseVel->y(), pBaseVel->z());
-	//printf("baseQuat = %.4f %.4f %.4f %.4f\n", pBaseQuat->x(), pBaseQuat->y(), pBaseQuat->z(), pBaseQuat->w());
+	if(!isBaseKinematic())
+	{
+		btScalar *pBaseQuat = pq ? pq : m_baseQuat;
+		btScalar *pBaseOmega = pqd ? pqd : &m_realBuf[0];  //note: the !pqd case assumes m_realBuf starts with base omega (should be wrapped for safety)
+		//
+		btQuaternion baseQuat;
+		baseQuat.setValue(pBaseQuat[0], pBaseQuat[1], pBaseQuat[2], pBaseQuat[3]);
+		btVector3 baseOmega;
+		baseOmega.setValue(pBaseOmega[0], pBaseOmega[1], pBaseOmega[2]);
+		pQuatUpdateFun(baseOmega, baseQuat, true, dt);
+		pBaseQuat[0] = baseQuat.x();
+		pBaseQuat[1] = baseQuat.y();
+		pBaseQuat[2] = baseQuat.z();
+		pBaseQuat[3] = baseQuat.w();
+
+		//printf("pBaseOmega = %.4f %.4f %.4f\n", pBaseOmega->x(), pBaseOmega->y(), pBaseOmega->z());
+		//printf("pBaseVel = %.4f %.4f %.4f\n", pBaseVel->x(), pBaseVel->y(), pBaseVel->z());
+		//printf("baseQuat = %.4f %.4f %.4f %.4f\n", pBaseQuat->x(), pBaseQuat->y(), pBaseQuat->z(), pBaseQuat->w());
+	}
 
 	if (pq)
 		pq += 7;
@@ -1788,48 +1873,51 @@ void btMultiBody::stepPositionsMultiDof(btScalar dt, btScalar *pq, btScalar *pqd
 	// Finally we can update m_jointPos for each of the m_links
 	for (int i = 0; i < num_links; ++i)
 	{
-        btScalar *pJointPos;
-        pJointPos= (pq ? pq : &m_links[i].m_jointPos[0]);
-        
-		btScalar *pJointVel = (pqd ? pqd : getJointVelMultiDof(i));
-
-		switch (m_links[i].m_jointType)
+		if (!(m_links[i].m_collider && m_links[i].m_collider->isStaticOrKinematic()))
 		{
-			case btMultibodyLink::ePrismatic:
-			case btMultibodyLink::eRevolute:
-			{
-                //reset to current pos
-				btScalar jointVel = pJointVel[0];
-				pJointPos[0] += dt * jointVel;
-				break;
-			}
-			case btMultibodyLink::eSpherical:
-			{
-                //reset to current pos
-				btVector3 jointVel;
-				jointVel.setValue(pJointVel[0], pJointVel[1], pJointVel[2]);
-				btQuaternion jointOri;
-				jointOri.setValue(pJointPos[0], pJointPos[1], pJointPos[2], pJointPos[3]);
-				pQuatUpdateFun(jointVel, jointOri, false, dt);
-				pJointPos[0] = jointOri.x();
-				pJointPos[1] = jointOri.y();
-				pJointPos[2] = jointOri.z();
-				pJointPos[3] = jointOri.w();
-				break;
-			}
-			case btMultibodyLink::ePlanar:
+			btScalar *pJointPos;
+			pJointPos= (pq ? pq : &m_links[i].m_jointPos[0]);
+		
+			btScalar *pJointVel = (pqd ? pqd : getJointVelMultiDof(i));
+
+			switch (m_links[i].m_jointType)
 			{
-				pJointPos[0] += dt * getJointVelMultiDof(i)[0];
+				case btMultibodyLink::ePrismatic:
+				case btMultibodyLink::eRevolute:
+				{
+    	            //reset to current pos
+					btScalar jointVel = pJointVel[0];
+					pJointPos[0] += dt * jointVel;
+					break;
+				}
+				case btMultibodyLink::eSpherical:
+				{
+    	            //reset to current pos
+					btVector3 jointVel;
+					jointVel.setValue(pJointVel[0], pJointVel[1], pJointVel[2]);
+					btQuaternion jointOri;
+					jointOri.setValue(pJointPos[0], pJointPos[1], pJointPos[2], pJointPos[3]);
+					pQuatUpdateFun(jointVel, jointOri, false, dt);
+					pJointPos[0] = jointOri.x();
+					pJointPos[1] = jointOri.y();
+					pJointPos[2] = jointOri.z();
+					pJointPos[3] = jointOri.w();
+					break;
+				}
+				case btMultibodyLink::ePlanar:
+				{
+					pJointPos[0] += dt * getJointVelMultiDof(i)[0];
 
-				btVector3 q0_coors_qd1qd2 = getJointVelMultiDof(i)[1] * m_links[i].getAxisBottom(1) + getJointVelMultiDof(i)[2] * m_links[i].getAxisBottom(2);
-				btVector3 no_q0_coors_qd1qd2 = quatRotate(btQuaternion(m_links[i].getAxisTop(0), pJointPos[0]), q0_coors_qd1qd2);
-				pJointPos[1] += m_links[i].getAxisBottom(1).dot(no_q0_coors_qd1qd2) * dt;
-				pJointPos[2] += m_links[i].getAxisBottom(2).dot(no_q0_coors_qd1qd2) * dt;
+					btVector3 q0_coors_qd1qd2 = getJointVelMultiDof(i)[1] * m_links[i].getAxisBottom(1) + getJointVelMultiDof(i)[2] * m_links[i].getAxisBottom(2);
+					btVector3 no_q0_coors_qd1qd2 = quatRotate(btQuaternion(m_links[i].getAxisTop(0), pJointPos[0]), q0_coors_qd1qd2);
+					pJointPos[1] += m_links[i].getAxisBottom(1).dot(no_q0_coors_qd1qd2) * dt;
+					pJointPos[2] += m_links[i].getAxisBottom(2).dot(no_q0_coors_qd1qd2) * dt;
 
-				break;
-			}
-			default:
-			{
+					break;
+				}
+				default:
+				{
+				}
 			}
 		}
 
@@ -2135,8 +2223,15 @@ void btMultiBody::updateCollisionObjectInterpolationWorldTransforms(btAlignedObj
     world_to_local.resize(getNumLinks() + 1);
     local_origin.resize(getNumLinks() + 1);
     
-    world_to_local[0] = getInterpolateWorldToBaseRot();
-    local_origin[0] = getInterpolateBasePos();
+		if(isBaseKinematic()){
+        world_to_local[0] = getWorldToBaseRot();
+        local_origin[0] = getBasePos();
+		}
+		else
+		{
+        world_to_local[0] = getInterpolateWorldToBaseRot();
+        local_origin[0] = getInterpolateBasePos();
+		}
     
     if (getBaseCollider())
     {
@@ -2282,3 +2377,81 @@ const char *btMultiBody::serialize(void *dataBuffer, class btSerializer *seriali
 
 	return btMultiBodyDataName;
 }
+
+void btMultiBody::saveKinematicState(btScalar timeStep)
+{
+	//todo: clamp to some (user definable) safe minimum timestep, to limit maximum angular/linear velocities
+	if (timeStep == btScalar(0.))
+		return;  // zero-length step: velocities and the interpolation transform stay as they are
+	// Base velocities come from the motion between the interpolation transform and the current base transform.
+	btVector3 baseLinVel, baseAngVel;
+	btTransformUtil::calculateVelocity(getInterpolateBaseWorldTransform(), getBaseWorldTransform(), timeStep, baseLinVel, baseAngVel);
+	setBaseVel(baseLinVel);
+	setBaseOmega(baseAngVel);
+	setInterpolateBaseWorldTransform(getBaseWorldTransform());  // current pose becomes the reference for the next call
+}
+
+void btMultiBody::setLinkDynamicType(const int i, int type)
+{
+	// Index -1 addresses the base instead of a link.
+	if (i == -1)
+	{
+		setBaseDynamicType(type);
+		return;
+	}
+	// Any other index must name an existing link that owns a collider;
+	// out-of-range indices and collider-less links are ignored.
+	const bool indexInRange = (i >= 0) && (i < getNumLinks());
+	if (indexInRange && m_links[i].m_collider)
+		m_links[i].m_collider->setDynamicType(type);
+}
+
+// Reports whether link i (or the base, for i == -1) is flagged static or kinematic.
+bool btMultiBody::isLinkStaticOrKinematic(const int i) const
+{
+	// Index -1 addresses the base.
+	if (i == -1)
+		return isBaseStaticOrKinematic();
+	// Reject indices outside [0, getNumLinks()) rather than indexing m_links out of
+	// bounds; this is the same range check setLinkDynamicType applies.
+	if (i < 0 || i >= getNumLinks())
+		return false;
+	// A link without a collider has no collision flags to query, so it reports false.
+	return m_links[i].m_collider && m_links[i].m_collider->isStaticOrKinematic();
+}
+
+// Reports whether link i (or the base, for i == -1) is flagged kinematic.
+bool btMultiBody::isLinkKinematic(const int i) const
+{
+	// Index -1 addresses the base.
+	if (i == -1)
+		return isBaseKinematic();
+	// Reject indices outside [0, getNumLinks()) rather than indexing m_links out of
+	// bounds; this is the same range check setLinkDynamicType applies.
+	if (i < 0 || i >= getNumLinks())
+		return false;
+	// A link without a collider has no collision flags to query, so it reports false.
+	return m_links[i].m_collider && m_links[i].m_collider->isKinematic();
+}
+
+bool btMultiBody::isLinkAndAllAncestorsStaticOrKinematic(const int i) const
+{
+	// Climb the parent chain starting at link i (index -1 denotes the base);
+	// the first link that is neither static nor kinematic makes the answer false.
+	for (int current = i; current != -1; current = m_links[current].m_parent)
+		if (!isLinkStaticOrKinematic(current))
+			return false;
+	// Every link on the path qualified, so the base decides the result.
+	return isBaseStaticOrKinematic();
+}
+
+bool btMultiBody::isLinkAndAllAncestorsKinematic(const int i) const
+{
+	// Climb the parent chain starting at link i (index -1 denotes the base);
+	// the first link that is not kinematic makes the answer false.
+	for (int current = i; current != -1; current = m_links[current].m_parent)
+		if (!isLinkKinematic(current))
+			return false;
+	// Every link on the path qualified, so the base decides the result.
+	return isBaseKinematic();
+}
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.h b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.h
index be795633fd..25112a6805 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.h
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBody.h
@@ -210,7 +210,13 @@ public:
 	void setBasePos(const btVector3 &pos)
 	{
 		m_basePos = pos;
-        m_basePos_interpolate = pos;
+		if(!isBaseKinematic())  // a kinematic base keeps its interpolation position here; setInterpolateBasePos changes it explicitly
+			m_basePos_interpolate = pos;
+	}
+
+	void setInterpolateBasePos(const btVector3 &pos)  // writes only the interpolation copy of the base position
+	{
+		m_basePos_interpolate = pos;  // m_basePos itself is not touched
 	}
 
 	void setBaseWorldTransform(const btTransform &tr)
@@ -227,17 +233,39 @@ public:
 		return tr;
 	}
 
+	void setInterpolateBaseWorldTransform(const btTransform &tr)  // sets the interpolation pose of the base from a world transform
+	{
+		setInterpolateBasePos(tr.getOrigin());
+		setInterpolateWorldToBaseRot(tr.getRotation().inverse());  // stored as world-to-base, hence the inverse of tr's rotation
+	}
+
+	btTransform getInterpolateBaseWorldTransform() const
+	{
+		// The stored rotation is world-to-base, so invert it to obtain the transform's rotation.
+		const btQuaternion baseToWorld = getInterpolateWorldToBaseRot().inverse();
+		btTransform interpolated(baseToWorld, getInterpolateBasePos());
+		return interpolated;
+	}
+
 	void setBaseVel(const btVector3 &vel)
 	{
 		m_realBuf[3] = vel[0];
 		m_realBuf[4] = vel[1];
 		m_realBuf[5] = vel[2];
 	}
+
 	void setWorldToBaseRot(const btQuaternion &rot)
 	{
 		m_baseQuat = rot;  //m_baseQuat asumed to ba alias!?
-        m_baseQuat_interpolate = rot;
+		if(!isBaseKinematic())  // a kinematic base keeps its interpolation rotation here; setInterpolateWorldToBaseRot changes it explicitly
+			m_baseQuat_interpolate = rot;
+	}
+
+	void setInterpolateWorldToBaseRot(const btQuaternion &rot)  // writes only the interpolation copy of the world-to-base rotation
+	{
+		m_baseQuat_interpolate = rot;  // m_baseQuat itself is not touched
 	}
+
 	void setBaseOmega(const btVector3 &omega)
 	{
 		m_realBuf[0] = omega[0];
@@ -245,6 +273,8 @@ public:
 		m_realBuf[2] = omega[2];
 	}
 
+	void saveKinematicState(btScalar timeStep);
+
 	//
 	// get/set pos/vel for child m_links (i = 0 to num_links-1)
 	//
@@ -278,6 +308,11 @@ public:
     {
         return &m_deltaV[0];
     }
+    
+    const btScalar *getSplitVelocityVector() const  // read-only access to the split-velocity buffer
+    {
+        return &m_splitV[0];  // pointer to the first element of m_splitV
+    }
 	/*    btScalar * getVelocityVector() 
 	{ 
 		return &real_buf[0]; 
@@ -397,6 +432,26 @@ public:
 			m_deltaV[dof] += delta_vee[dof] * multiplier;
 		}
 	}
+    void applyDeltaSplitVeeMultiDof(const btScalar *delta_vee, btScalar multiplier)
+    {
+        // Accumulate multiplier * delta_vee into the first 6 + getNumDofs() entries of m_splitV.
+        const int entryCount = 6 + getNumDofs();
+        for (int idx = 0; idx < entryCount; ++idx)
+            m_splitV[idx] += delta_vee[idx] * multiplier;
+    }
+    void addSplitV()  // counterpart of substractSplitV()
+    {
+        applyDeltaVeeMultiDof(&m_splitV[0], 1);  // feeds m_splitV through applyDeltaVeeMultiDof with a multiplier of +1
+    }
+    void substractSplitV()
+    {
+        // Feed m_splitV through applyDeltaVeeMultiDof with a multiplier of -1 (counterpart of addSplitV) ...
+        applyDeltaVeeMultiDof(&m_splitV[0], -1);
+        // ... then clear the first 6 + getNumDofs() entries so the next accumulation starts from zero.
+        const int entryCount = 6 + getNumDofs();
+        for (int idx = 0; idx < entryCount; ++idx)
+            m_splitV[idx] = 0.f;
+    }
 	void processDeltaVeeMultiDof2()
 	{
 		applyDeltaVeeMultiDof(&m_deltaV[0], 1);
@@ -495,14 +550,22 @@ public:
 	void goToSleep();
 	void checkMotionAndSleepIfRequired(btScalar timestep);
 
-	bool hasFixedBase() const
-	{
-		return m_fixedBase;
-	}
+	bool hasFixedBase() const;
+
+	bool isBaseKinematic() const;
+
+	bool isBaseStaticOrKinematic() const;
+
+	// set the dynamic type in the base's collision flags.
+	void setBaseDynamicType(int dynamicType);
 
 	void setFixedBase(bool fixedBase)
 	{
 		m_fixedBase = fixedBase;
+		if(m_fixedBase)  // keep the base's dynamic type in step with the flag
+			setBaseDynamicType(btCollisionObject::CF_STATIC_OBJECT);  // fixed base -> static
+		else
+			setBaseDynamicType(btCollisionObject::CF_DYNAMIC_OBJECT);  // free base -> dynamic
 	}
 
 	int getCompanionId() const
@@ -653,7 +716,15 @@ public:
 		btVector3 &top_out,         // top part of output vector
 		btVector3 &bottom_out);      // bottom part of output vector
 
+	void setLinkDynamicType(const int i, int type);
+
+	bool isLinkStaticOrKinematic(const int i) const;
+
+	bool isLinkKinematic(const int i) const;
+
+	bool isLinkAndAllAncestorsStaticOrKinematic(const int i) const;
 
+	bool isLinkAndAllAncestorsKinematic(const int i) const;
 
 private:
 	btMultiBody(const btMultiBody &);     // not implemented
@@ -711,6 +782,7 @@ private:
 	//  offset         size         array
 	//   0              num_links+1  rot_from_parent
 	//
+    btAlignedObjectArray<btScalar> m_splitV;
 	btAlignedObjectArray<btScalar> m_deltaV;
 	btAlignedObjectArray<btScalar> m_realBuf;
 	btAlignedObjectArray<btVector3> m_vectorBuf;
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.cpp
index d7ed05ce57..1ba5861145 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.cpp
@@ -2,11 +2,12 @@
 #include "BulletDynamics/Dynamics/btRigidBody.h"
 #include "btMultiBodyPoint2Point.h"  //for testing (BTMBP2PCONSTRAINT_BLOCK_ANGULAR_MOTION_TEST macro)
 
-btMultiBodyConstraint::btMultiBodyConstraint(btMultiBody* bodyA, btMultiBody* bodyB, int linkA, int linkB, int numRows, bool isUnilateral)
+btMultiBodyConstraint::btMultiBodyConstraint(btMultiBody* bodyA, btMultiBody* bodyB, int linkA, int linkB, int numRows, bool isUnilateral, int type)
 	: m_bodyA(bodyA),
 	  m_bodyB(bodyB),
 	  m_linkA(linkA),
 	  m_linkB(linkB),
+	  m_type(type),
 	  m_numRows(numRows),
 	  m_jacSizeA(0),
 	  m_jacSizeBoth(0),
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.h b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.h
index 5c15f3e851..4a6007ee3e 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.h
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyConstraint.h
@@ -20,6 +20,21 @@ subject to the following restrictions:
 #include "LinearMath/btAlignedObjectArray.h"
 #include "btMultiBody.h"
 
+
+//Don't change any of the existing enum values, so add enum types at the end for serialization compatibility
+enum btTypedMultiBodyConstraintType
+{
+	MULTIBODY_CONSTRAINT_LIMIT=3,  // numbering starts at 3; passed by btMultiBodyJointLimitConstraint
+	MULTIBODY_CONSTRAINT_1DOF_JOINT_MOTOR,  // passed by btMultiBodyJointMotor
+	MULTIBODY_CONSTRAINT_GEAR,  // passed by btMultiBodyGearConstraint
+	MULTIBODY_CONSTRAINT_POINT_TO_POINT,
+	MULTIBODY_CONSTRAINT_SLIDER,
+	MULTIBODY_CONSTRAINT_SPHERICAL_MOTOR,
+	MULTIBODY_CONSTRAINT_FIXED,  // passed by btMultiBodyFixedConstraint
+	
+	MAX_MULTIBODY_CONSTRAINT_TYPE,  // one past the last constraint type; new types go right before this entry
+};
+
 class btMultiBody;
 struct btSolverInfo;
 
@@ -46,6 +61,8 @@ protected:
 	int m_linkA;
 	int m_linkB;
 
+	int m_type; //btTypedMultiBodyConstraintType
+
 	int m_numRows;
 	int m_jacSizeA;
 	int m_jacSizeBoth;
@@ -82,12 +99,16 @@ protected:
 public:
 	BT_DECLARE_ALIGNED_ALLOCATOR();
 
-	btMultiBodyConstraint(btMultiBody * bodyA, btMultiBody * bodyB, int linkA, int linkB, int numRows, bool isUnilateral);
+	btMultiBodyConstraint(btMultiBody * bodyA, btMultiBody * bodyB, int linkA, int linkB, int numRows, bool isUnilateral, int type);
 	virtual ~btMultiBodyConstraint();
 
 	void updateJacobianSizes();
 	void allocateJacobiansMultiDof();
 
+	int getConstraintType() const  // type tag handed to the constructor (see btTypedMultiBodyConstraintType)
+	{
+		return m_type;
+	}
 	//many constraints have setFrameInB/setPivotInB. Will use 'getConstraintType' later.
 	virtual void setFrameInB(const btMatrix3x3& frameInB) {}
 	virtual void setPivotInB(const btVector3& pivotInB) {}
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp
index cd1bad089e..fef95f0c4e 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.cpp
@@ -592,6 +592,7 @@ void btMultiBodyDynamicsWorld::integrateMultiBodyTransforms(btScalar timeStep)
 
 			if (!isSleeping)
 			{
+				bod->addSplitV();
 				int nLinks = bod->getNumLinks();
 
 				///base + num m_links
@@ -610,6 +611,7 @@ void btMultiBodyDynamicsWorld::integrateMultiBodyTransforms(btScalar timeStep)
 				m_scratch_world_to_local.resize(nLinks + 1);
 				m_scratch_local_origin.resize(nLinks + 1);
                 bod->updateCollisionObjectWorldTransforms(m_scratch_world_to_local, m_scratch_local_origin);
+				bod->substractSplitV();
 			}
 			else
 			{
@@ -867,6 +869,18 @@ void btMultiBodyDynamicsWorld::serializeMultiBodies(btSerializer* serializer)
 		}
 	}
 }
+
+void btMultiBodyDynamicsWorld::saveKinematicState(btScalar timeStep)
+{
+	// Run the btDiscreteDynamicsWorld implementation first.
+	btDiscreteDynamicsWorld::saveKinematicState(timeStep);
+	// Then let every multibody with a kinematic base save its own state;
+	// multibodies whose base is not kinematic are left alone.
+	for (int bodyIdx = 0; bodyIdx < m_multiBodies.size(); ++bodyIdx)
+		if (m_multiBodies[bodyIdx]->isBaseKinematic())
+			m_multiBodies[bodyIdx]->saveKinematicState(timeStep);
+}
+
 //
 //void btMultiBodyDynamicsWorld::setSplitIslands(bool split)
 //{
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h
index 9ac46f4b64..d2d76c8b92 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyDynamicsWorld.h
@@ -120,5 +120,7 @@ public:
     virtual void solveExternalForces(btContactSolverInfo& solverInfo);
     virtual void solveInternalConstraints(btContactSolverInfo& solverInfo);
     void buildIslands();
+
+	virtual void saveKinematicState(btScalar timeStep);
 };
 #endif  //BT_MULTIBODY_DYNAMICS_WORLD_H
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp
index 5ef9444c2f..df2abbe97a 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyFixedConstraint.cpp
@@ -24,7 +24,7 @@ subject to the following restrictions:
 #define BTMBFIXEDCONSTRAINT_DIM 6
 
 btMultiBodyFixedConstraint::btMultiBodyFixedConstraint(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB)
-	: btMultiBodyConstraint(body, 0, link, -1, BTMBFIXEDCONSTRAINT_DIM, false),
+	: btMultiBodyConstraint(body, 0, link, -1, BTMBFIXEDCONSTRAINT_DIM, false, MULTIBODY_CONSTRAINT_FIXED),
 	  m_rigidBodyA(0),
 	  m_rigidBodyB(bodyB),
 	  m_pivotInA(pivotInA),
@@ -36,7 +36,7 @@ btMultiBodyFixedConstraint::btMultiBodyFixedConstraint(btMultiBody* body, int li
 }
 
 btMultiBodyFixedConstraint::btMultiBodyFixedConstraint(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB)
-	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, BTMBFIXEDCONSTRAINT_DIM, false),
+	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, BTMBFIXEDCONSTRAINT_DIM, false, MULTIBODY_CONSTRAINT_FIXED),
 	  m_rigidBodyA(0),
 	  m_rigidBodyB(0),
 	  m_pivotInA(pivotInA),
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyGearConstraint.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyGearConstraint.cpp
index bf6b811d26..ee02cf9b07 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyGearConstraint.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyGearConstraint.cpp
@@ -21,7 +21,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 
 btMultiBodyGearConstraint::btMultiBodyGearConstraint(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB)
-	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, 1, false),
+	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, 1, false, MULTIBODY_CONSTRAINT_GEAR),
 	  m_gearRatio(1),
 	  m_gearAuxLink(-1),
 	  m_erp(0),
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp
index 8791ad2868..94b36ac108 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.cpp
@@ -22,7 +22,7 @@ subject to the following restrictions:
 
 btMultiBodyJointLimitConstraint::btMultiBodyJointLimitConstraint(btMultiBody* body, int link, btScalar lower, btScalar upper)
 	//:btMultiBodyConstraint(body,0,link,-1,2,true),
-	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 2, true),
+	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 2, true, MULTIBODY_CONSTRAINT_LIMIT),
 	  m_lowerBound(lower),
 	  m_upperBound(upper)
 {
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h
index 6716ba490f..b810692b4c 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointLimitConstraint.h
@@ -42,6 +42,22 @@ public:
 	{
 		//todo(erwincoumans)
 	}
+	btScalar getLowerBound() const  // current lower bound of the joint limit
+	{
+		return m_lowerBound;
+	}
+	btScalar getUpperBound() const  // current upper bound of the joint limit
+	{
+		return m_upperBound;
+	}
+	void setLowerBound(btScalar lower)  // replaces the lower bound; no check against the upper bound is made here
+	{
+		m_lowerBound = lower;
+	}
+	void setUpperBound(btScalar upper)  // replaces the upper bound; no check against the lower bound is made here
+	{
+		m_upperBound = upper;
+	}
 };
 
 #endif  //BT_MULTIBODY_JOINT_LIMIT_CONSTRAINT_H
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
index 5c816c4987..fec9b03213 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
@@ -21,7 +21,7 @@ subject to the following restrictions:
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 
 btMultiBodyJointMotor::btMultiBodyJointMotor(btMultiBody* body, int link, btScalar desiredVelocity, btScalar maxMotorImpulse)
-	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 1, true),
+	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 1, true, MULTIBODY_CONSTRAINT_1DOF_JOINT_MOTOR),
 	  m_desiredVelocity(desiredVelocity),
 	  m_desiredPosition(0),
 	  m_kd(1.),
@@ -51,7 +51,7 @@ void btMultiBodyJointMotor::finalizeMultiDof()
 
 btMultiBodyJointMotor::btMultiBodyJointMotor(btMultiBody* body, int link, int linkDoF, btScalar desiredVelocity, btScalar maxMotorImpulse)
 	//:btMultiBodyConstraint(body,0,link,-1,1,true),
-	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 1, true),
+	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 1, true, MULTIBODY_CONSTRAINT_1DOF_JOINT_MOTOR),
 	  m_desiredVelocity(desiredVelocity),
 	  m_desiredPosition(0),
 	  m_kd(1.),
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLink.h b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLink.h
index 01d5583c2f..5a1429340f 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLink.h
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLink.h
@@ -295,6 +295,9 @@ struct btMultibodyLink
             }
         }
     }
+
+ 
+
 };
 
 #endif  //BT_MULTIBODY_LINK_H
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLinkCollider.h b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLinkCollider.h
index bc909990c2..3dc35a5814 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLinkCollider.h
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyLinkCollider.h
@@ -130,6 +130,23 @@ public:
 		return true;
 	}
 
+	bool isStaticOrKinematic() const  // Thin wrapper: returns whatever isStaticOrKinematicObject() reports.
+	{
+		return isStaticOrKinematicObject();
+	}
+
+	bool isKinematic() const  // Thin wrapper: returns whatever isKinematicObject() reports.
+	{
+		return isKinematicObject();
+	}
+
+	void setDynamicType(int dynamicType)  // Replaces the static/kinematic bits of the collision flags with dynamicType.
+	{
+		int oldFlags = getCollisionFlags();
+		oldFlags &= ~(btCollisionObject::CF_STATIC_OBJECT | btCollisionObject::CF_KINEMATIC_OBJECT);  // clear both bits, keep every other flag
+		setCollisionFlags(oldFlags | dynamicType);  // dynamicType is OR-ed in unmasked, so every bit it carries ends up set
+	}
+
 	virtual int calculateSerializeBufferSize() const;
 
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp
index 37d3aede37..f51e69deb1 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyPoint2Point.cpp
@@ -27,7 +27,7 @@ subject to the following restrictions:
 #endif
 
 btMultiBodyPoint2Point::btMultiBodyPoint2Point(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB)
-	: btMultiBodyConstraint(body, 0, link, -1, BTMBP2PCONSTRAINT_DIM, false),
+	: btMultiBodyConstraint(body, 0, link, -1, BTMBP2PCONSTRAINT_DIM, false, MULTIBODY_CONSTRAINT_POINT_TO_POINT),
 	  m_rigidBodyA(0),
 	  m_rigidBodyB(bodyB),
 	  m_pivotInA(pivotInA),
@@ -37,7 +37,7 @@ btMultiBodyPoint2Point::btMultiBodyPoint2Point(btMultiBody* body, int link, btRi
 }
 
 btMultiBodyPoint2Point::btMultiBodyPoint2Point(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB)
-	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, BTMBP2PCONSTRAINT_DIM, false),
+	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, BTMBP2PCONSTRAINT_DIM, false, MULTIBODY_CONSTRAINT_POINT_TO_POINT),
 	  m_rigidBodyA(0),
 	  m_rigidBodyB(0),
 	  m_pivotInA(pivotInA),
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp
index e025302ce6..48ec1d5af2 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySliderConstraint.cpp
@@ -25,7 +25,7 @@ subject to the following restrictions:
 #define EPSILON 0.000001
 
 btMultiBodySliderConstraint::btMultiBodySliderConstraint(btMultiBody* body, int link, btRigidBody* bodyB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB, const btVector3& jointAxis)
-	: btMultiBodyConstraint(body, 0, link, -1, BTMBSLIDERCONSTRAINT_DIM, false),
+	: btMultiBodyConstraint(body, 0, link, -1, BTMBSLIDERCONSTRAINT_DIM, false, MULTIBODY_CONSTRAINT_SLIDER),
 	  m_rigidBodyA(0),
 	  m_rigidBodyB(bodyB),
 	  m_pivotInA(pivotInA),
@@ -38,7 +38,7 @@ btMultiBodySliderConstraint::btMultiBodySliderConstraint(btMultiBody* body, int
 }
 
 btMultiBodySliderConstraint::btMultiBodySliderConstraint(btMultiBody* bodyA, int linkA, btMultiBody* bodyB, int linkB, const btVector3& pivotInA, const btVector3& pivotInB, const btMatrix3x3& frameInA, const btMatrix3x3& frameInB, const btVector3& jointAxis)
-	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, BTMBSLIDERCONSTRAINT_DIM, false),
+	: btMultiBodyConstraint(bodyA, bodyB, linkA, linkB, BTMBSLIDERCONSTRAINT_DIM, false, MULTIBODY_CONSTRAINT_SLIDER),
 	  m_rigidBodyA(0),
 	  m_rigidBodyB(0),
 	  m_pivotInA(pivotInA),
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp
index 3e5aa30f28..25ddd539bf 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp
@@ -23,7 +23,7 @@ subject to the following restrictions:
 #include "BulletDynamics/ConstraintSolver/btGeneric6DofSpring2Constraint.h"
 
 btMultiBodySphericalJointMotor::btMultiBodySphericalJointMotor(btMultiBody* body, int link, btScalar maxMotorImpulse)
-	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 3, true),
+	: btMultiBodyConstraint(body, body, link, body->getLink(link).m_parent, 3, true, MULTIBODY_CONSTRAINT_SPHERICAL_MOTOR),
 	m_desiredVelocity(0, 0, 0),
 	m_desiredPosition(0,0,0,1),
 	m_kd(1.),
diff --git a/thirdparty/bullet/BulletSoftBody/DeformableBodyInplaceSolverIslandCallback.h b/thirdparty/bullet/BulletSoftBody/DeformableBodyInplaceSolverIslandCallback.h
index 7b225701f6..01c7e93a1b 100644
--- a/thirdparty/bullet/BulletSoftBody/DeformableBodyInplaceSolverIslandCallback.h
+++ b/thirdparty/bullet/BulletSoftBody/DeformableBodyInplaceSolverIslandCallback.h
@@ -13,13 +13,12 @@ struct DeformableBodyInplaceSolverIslandCallback : public MultiBodyInplaceSolver
 	btDeformableMultiBodyConstraintSolver* m_deformableSolver;
 
 	DeformableBodyInplaceSolverIslandCallback(btDeformableMultiBodyConstraintSolver* solver,
-		 btDispatcher* dispatcher)
-	: MultiBodyInplaceSolverIslandCallback(solver, dispatcher), m_deformableSolver(solver)
+											  btDispatcher* dispatcher)
+		: MultiBodyInplaceSolverIslandCallback(solver, dispatcher), m_deformableSolver(solver)
 	{
 	}
 
-
-	virtual void processConstraints(int islandId=-1)
+	virtual void processConstraints(int islandId = -1)
 	{
 		btCollisionObject** bodies = m_bodies.size() ? &m_bodies[0] : 0;
 		btCollisionObject** softBodies = m_softBodies.size() ? &m_softBodies[0] : 0;
@@ -30,7 +29,7 @@ struct DeformableBodyInplaceSolverIslandCallback : public MultiBodyInplaceSolver
 		//printf("mb contacts = %d, mb constraints = %d\n", mbContacts, m_multiBodyConstraints.size());
 
 		m_deformableSolver->solveDeformableBodyGroup(bodies, m_bodies.size(), softBodies, m_softBodies.size(), manifold, m_manifolds.size(), constraints, m_constraints.size(), multiBodyConstraints, m_multiBodyConstraints.size(), *m_solverInfo, m_debugDrawer, m_dispatcher);
-		if (m_bodies.size() && (m_solverInfo->m_reportSolverAnalytics&1))
+		if (m_bodies.size() && (m_solverInfo->m_reportSolverAnalytics & 1))
 		{
 			m_deformableSolver->m_analyticsData.m_islandId = islandId;
 			m_islandAnalyticsData.push_back(m_solver->m_analyticsData);
diff --git a/thirdparty/bullet/BulletSoftBody/btCGProjection.h b/thirdparty/bullet/BulletSoftBody/btCGProjection.h
index d047e6d3d9..e05970664c 100644
--- a/thirdparty/bullet/BulletSoftBody/btCGProjection.h
+++ b/thirdparty/bullet/BulletSoftBody/btCGProjection.h
@@ -22,85 +22,83 @@
 
 struct DeformableContactConstraint
 {
-    const btSoftBody::Node* m_node;
-    btAlignedObjectArray<const btSoftBody::RContact*> m_contact;
-    btAlignedObjectArray<btVector3> m_total_normal_dv;
-    btAlignedObjectArray<btVector3> m_total_tangent_dv;
-    btAlignedObjectArray<bool> m_static;
-    btAlignedObjectArray<bool> m_can_be_dynamic;
-    
-    DeformableContactConstraint(const btSoftBody::RContact& rcontact): m_node(rcontact.m_node)
-    {
-        append(rcontact);
-    }
-    
-    DeformableContactConstraint(): m_node(NULL)
-    {
-        m_contact.push_back(NULL);
-    }
-    
-    void append(const btSoftBody::RContact& rcontact)
-    {
-        m_contact.push_back(&rcontact);
-        m_total_normal_dv.push_back(btVector3(0,0,0));
-        m_total_tangent_dv.push_back(btVector3(0,0,0));
-        m_static.push_back(false);
-        m_can_be_dynamic.push_back(true);
-    }
-
-    void replace(const btSoftBody::RContact& rcontact)
-    {
-        m_contact.clear();
-        m_total_normal_dv.clear();
-        m_total_tangent_dv.clear();
-        m_static.clear();
-        m_can_be_dynamic.clear();
-        append(rcontact);
-    }
-    
-    ~DeformableContactConstraint()
-    {
-    }
+	const btSoftBody::Node* m_node;  // node taken from the contact given at construction; NULL after the default constructor
+	btAlignedObjectArray<const btSoftBody::RContact*> m_contact;  // pointers to the contacts added through append(); never deleted by this struct
+	btAlignedObjectArray<btVector3> m_total_normal_dv;  // one entry per appended contact, starts at (0, 0, 0)
+	btAlignedObjectArray<btVector3> m_total_tangent_dv;  // one entry per appended contact, starts at (0, 0, 0)
+	btAlignedObjectArray<bool> m_static;  // one entry per appended contact, starts as false
+	btAlignedObjectArray<bool> m_can_be_dynamic;  // one entry per appended contact, starts as true
+
+	DeformableContactConstraint(const btSoftBody::RContact& rcontact) : m_node(rcontact.m_node)  // starts with exactly one contact
+	{
+		append(rcontact);
+	}
+
+	DeformableContactConstraint() : m_node(NULL)  // placeholder state with no node
+	{
+		m_contact.push_back(NULL);  // only m_contact receives an entry; the other four arrays stay empty
+	}
+
+	void append(const btSoftBody::RContact& rcontact)  // adds one contact together with its default per-contact state
+	{
+		m_contact.push_back(&rcontact);  // keeps the address of rcontact, not a copy
+		m_total_normal_dv.push_back(btVector3(0, 0, 0));
+		m_total_tangent_dv.push_back(btVector3(0, 0, 0));
+		m_static.push_back(false);
+		m_can_be_dynamic.push_back(true);
+	}
+
+	void replace(const btSoftBody::RContact& rcontact)  // discards all stored entries, then stores rcontact as the only one; m_node is left unchanged
+	{
+		m_contact.clear();
+		m_total_normal_dv.clear();
+		m_total_tangent_dv.clear();
+		m_static.clear();
+		m_can_be_dynamic.clear();
+		append(rcontact);
+	}
+
+	~DeformableContactConstraint()  // empty body: nothing is released explicitly
+	{
+	}
 };
 
 class btCGProjection
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    typedef btAlignedObjectArray<btAlignedObjectArray<btVector3> > TVArrayStack;
-    typedef btAlignedObjectArray<btAlignedObjectArray<btScalar> > TArrayStack;
-    btAlignedObjectArray<btSoftBody *>& m_softBodies;
-    const btScalar& m_dt;
-    // map from node indices to node pointers
-    const btAlignedObjectArray<btSoftBody::Node*>* m_nodes;
-    
-    btCGProjection(btAlignedObjectArray<btSoftBody *>& softBodies, const btScalar& dt)
-    : m_softBodies(softBodies)
-    , m_dt(dt)
-    {
-    }
-    
-    virtual ~btCGProjection()
-    {
-    }
-    
-    // apply the constraints
-    virtual void project(TVStack& x) = 0;
-    
-    virtual void setConstraints() = 0;
-    
-    // update the constraints
-    virtual btScalar update() = 0;
-    
-    virtual void reinitialize(bool nodeUpdated)
-    {
-    }
-    
-    virtual void setIndices(const btAlignedObjectArray<btSoftBody::Node*>* nodes)
-    {
-        m_nodes = nodes;
-    }
-};
+	typedef btAlignedObjectArray<btVector3> TVStack;  // array of btVector3
+	typedef btAlignedObjectArray<btAlignedObjectArray<btVector3> > TVArrayStack;  // array of btVector3 arrays
+	typedef btAlignedObjectArray<btAlignedObjectArray<btScalar> > TArrayStack;  // array of btScalar arrays
+	btAlignedObjectArray<btSoftBody*>& m_softBodies;  // reference to the array passed to the constructor, not a copy
+	const btScalar& m_dt;  // reference to the dt passed to the constructor, not a copy
+	// map from node indices to node pointers
+	const btAlignedObjectArray<btSoftBody::Node*>* m_nodes;  // assigned by setIndices(); the constructor does not initialise it
+
+	btCGProjection(btAlignedObjectArray<btSoftBody*>& softBodies, const btScalar& dt)  // binds the two reference members; m_nodes is left unset
+		: m_softBodies(softBodies), m_dt(dt)
+	{
+	}
 
+	virtual ~btCGProjection()  // virtual destructor with an empty body
+	{
+	}
+
+	// apply the constraints to x (modified in place through the non-const reference)
+	virtual void project(TVStack& x) = 0;  // pure virtual: a derived class must implement it
+
+	virtual void setConstraints() = 0;  // pure virtual: a derived class must implement it
+
+	// update the constraints
+	virtual btScalar update() = 0;  // pure virtual; the meaning of the returned scalar is not defined in this header
+
+	virtual void reinitialize(bool nodeUpdated)  // default implementation does nothing and ignores nodeUpdated
+	{
+	}
+
+	virtual void setIndices(const btAlignedObjectArray<btSoftBody::Node*>* nodes)  // stores the pointer only; the array itself is not copied
+	{
+		m_nodes = nodes;
+	}
+};
 
 #endif /* btCGProjection_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btConjugateGradient.h b/thirdparty/bullet/BulletSoftBody/btConjugateGradient.h
index bd51e584b9..bcd5e6b519 100644
--- a/thirdparty/bullet/BulletSoftBody/btConjugateGradient.h
+++ b/thirdparty/bullet/BulletSoftBody/btConjugateGradient.h
@@ -15,144 +15,103 @@
 
 #ifndef BT_CONJUGATE_GRADIENT_H
 #define BT_CONJUGATE_GRADIENT_H
-#include <iostream>
-#include <cmath>
-#include <limits>
-#include <LinearMath/btAlignedObjectArray.h>
-#include <LinearMath/btVector3.h>
-#include "LinearMath/btQuickprof.h"
+#include "btKrylovSolver.h"
 template <class MatrixX>
-class btConjugateGradient
+class btConjugateGradient : public btKrylovSolver<MatrixX>
 {
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    TVStack r,p,z,temp;
-    int max_iterations;
-    btScalar tolerance_squared;
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	typedef btKrylovSolver<MatrixX> Base;
+	TVStack r, p, z, temp;
+
 public:
-    btConjugateGradient(const int max_it_in)
-    : max_iterations(max_it_in)
-    {
-       tolerance_squared = 1e-5;
-    }
-    
-    virtual ~btConjugateGradient(){}
-    
-    // return the number of iterations taken
-    int solve(MatrixX& A, TVStack& x, const TVStack& b, bool verbose = false)
-    {
-        BT_PROFILE("CGSolve");
-        btAssert(x.size() == b.size());
-        reinitialize(b);
-        // r = b - A * x --with assigned dof zeroed out
-        A.multiply(x, temp);
-        r = sub(b, temp);
-        A.project(r);
-        // z = M^(-1) * r
-        A.precondition(r, z);
-        A.project(z);
-        btScalar r_dot_z = dot(z,r);
-        if (r_dot_z <= tolerance_squared) {
-            if (verbose)
-            {
-                std::cout << "Iteration = 0" << std::endl;
-                std::cout << "Two norm of the residual = " << r_dot_z << std::endl;
-            }
-            return 0;
-        }
-        p = z;
-        btScalar r_dot_z_new = r_dot_z;
-        for (int k = 1; k <= max_iterations; k++) {
-            // temp = A*p
-            A.multiply(p, temp);
-            A.project(temp);
-            if (dot(p,temp) < SIMD_EPSILON)
-            {
-                if (verbose)
-                    std::cout << "Encountered negative direction in CG!" << std::endl;
-                if (k == 1)
-                {
-                    x = b;
-                }
-              return k;
-            }
-            // alpha = r^T * z / (p^T * A * p)
-            btScalar alpha = r_dot_z_new / dot(p, temp);
-            //  x += alpha * p;
-            multAndAddTo(alpha, p, x);
-            //  r -= alpha * temp;
-            multAndAddTo(-alpha, temp, r);
-            // z = M^(-1) * r
-            A.precondition(r, z);
-            r_dot_z = r_dot_z_new;
-            r_dot_z_new = dot(r,z);
-            if (r_dot_z_new < tolerance_squared) {
-                if (verbose)
-                {
-                    std::cout << "ConjugateGradient iterations " << k << std::endl;
-                }
-                return k;
-            }
+	btConjugateGradient(const int max_it_in)  // max_it_in is forwarded to the btKrylovSolver base
+		: btKrylovSolver<MatrixX>(max_it_in, SIMD_EPSILON)  // second argument is presumably the tolerance read back as Base::m_tolerance -- TODO confirm in btKrylovSolver.h
+	{
+	}
+
+	virtual ~btConjugateGradient() {}  // virtual destructor with an empty body
+
+	// Preconditioned CG iteration on A, starting from the incoming x and updating x in place. Returns the number of iterations taken (0 when the initial residual already passes the tolerance test).
+	int solve(MatrixX& A, TVStack& x, const TVStack& b, bool verbose = false)
+	{
+		BT_PROFILE("CGSolve");
+		btAssert(x.size() == b.size());
+		reinitialize(b);  // size r, p, z and temp to b.size()
+		temp = b;
+		A.project(temp);
+		p = temp;  // p is only scratch here; it is reassigned to z before the loop
+		A.precondition(p, z);
+		btScalar d0 = this->dot(z, temp);  // reference magnitude built from the projected right-hand side
+		d0 = btMin(btScalar(1), d0);  // cap the scale at 1; both tolerance tests below compare against m_tolerance * d0
+		// r = b - A * x --with assigned dof zeroed out
+		A.multiply(x, temp);
+		r = this->sub(b, temp);
+		A.project(r);
+		// z = M^(-1) * r
+		A.precondition(r, z);
+		A.project(z);
+		btScalar r_dot_z = this->dot(z, r);
+		if (r_dot_z <= Base::m_tolerance * d0)  // initial guess is already good enough
+		{
+			if (verbose)
+			{
+				std::cout << "Iteration = 0" << std::endl;
+				std::cout << "Two norm of the residual = " << r_dot_z << std::endl;  // the value printed is dot(z, r)
+			}
+			return 0;
+		}
+		p = z;
+		btScalar r_dot_z_new = r_dot_z;
+		for (int k = 1; k <= Base::m_maxIterations; k++)
+		{
+			// temp = A*p
+			A.multiply(p, temp);
+			A.project(temp);
+			if (this->dot(p, temp) < 0)  // negative curvature along p: stop iterating
+			{
+				if (verbose)
+					std::cout << "Encountered negative direction in CG!" << std::endl;
+				if (k == 1)
+				{
+					x = b;  // on the very first iteration the result is replaced by b
+				}
+				return k;
+			}
+			// alpha = r^T * z / (p^T * A * p)
+			btScalar alpha = r_dot_z_new / this->dot(p, temp);  // NOTE(review): a p^T*A*p of exactly 0 passes the check above and reaches this division
+			// x += alpha * p;
+			this->multAndAddTo(alpha, p, x);
+			// r -= alpha * temp;
+			this->multAndAddTo(-alpha, temp, r);
+			// z = M^(-1) * r
+			A.precondition(r, z);
+			r_dot_z = r_dot_z_new;  // keep the previous value for the beta ratio
+			r_dot_z_new = this->dot(r, z);
+			if (r_dot_z_new < Base::m_tolerance * d0)  // converged
+			{
+				if (verbose)
+				{
+					std::cout << "ConjugateGradient iterations " << k << " residual = " << r_dot_z_new << std::endl;
+				}
+				return k;
+			}
+
+			btScalar beta = r_dot_z_new / r_dot_z;
+			p = this->multAndAdd(beta, p, z);  // per the original helper comment, multAndAdd(s, a, b) yields a*s + b, i.e. p = beta*p + z
+		}
+		if (verbose)
+		{
+			std::cout << "ConjugateGradient max iterations reached " << Base::m_maxIterations << " error = " << r_dot_z_new << std::endl;
+		}
+		return Base::m_maxIterations;  // iteration budget exhausted; x holds the last iterate
+	}
 
-            btScalar beta = r_dot_z_new/r_dot_z;
-            p = multAndAdd(beta, p, z);
-        }
-        if (verbose)
-        {
-            std::cout << "ConjugateGradient max iterations reached " << max_iterations << std::endl;
-        }
-        return max_iterations;
-    }
-    
-    void reinitialize(const TVStack& b)
-    {
-        r.resize(b.size());
-        p.resize(b.size());
-        z.resize(b.size());
-        temp.resize(b.size());
-    }
-    
-    TVStack sub(const TVStack& a, const TVStack& b)
-    {
-        // c = a-b
-        btAssert(a.size() == b.size());
-        TVStack c;
-        c.resize(a.size());
-        for (int i = 0; i < a.size(); ++i)
-        {
-            c[i] = a[i] - b[i];
-        }
-        return c;
-    }
-    
-    btScalar squaredNorm(const TVStack& a)
-    {
-        return dot(a,a);
-    }
-    
-    btScalar dot(const TVStack& a, const TVStack& b)
-    {
-        btScalar ans(0);
-        for (int i = 0; i < a.size(); ++i)
-            ans += a[i].dot(b[i]);
-        return ans;
-    }
-    
-    void multAndAddTo(btScalar s, const TVStack& a, TVStack& result)
-    {
-//        result += s*a
-        btAssert(a.size() == result.size());
-        for (int i = 0; i < a.size(); ++i)
-            result[i] += s * a[i];
-    }
-    
-    TVStack multAndAdd(btScalar s, const TVStack& a, const TVStack& b)
-    {
-        // result = a*s + b
-        TVStack result;
-        result.resize(a.size());
-        for (int i = 0; i < a.size(); ++i)
-            result[i] = s * a[i] + b[i];
-        return result;
-    }
+	void reinitialize(const TVStack& b)  // Resizes the four work vectors r, p, z and temp to b.size().
+	{
+		r.resize(b.size());
+		p.resize(b.size());
+		z.resize(b.size());
+		temp.resize(b.size());
+	}
 };
 #endif /* btConjugateGradient_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btConjugateResidual.h b/thirdparty/bullet/BulletSoftBody/btConjugateResidual.h
index 7b211c4172..6146120365 100644
--- a/thirdparty/bullet/BulletSoftBody/btConjugateResidual.h
+++ b/thirdparty/bullet/BulletSoftBody/btConjugateResidual.h
@@ -15,174 +15,98 @@
 
 #ifndef BT_CONJUGATE_RESIDUAL_H
 #define BT_CONJUGATE_RESIDUAL_H
-#include <iostream>
-#include <cmath>
-#include <limits>
-#include <LinearMath/btAlignedObjectArray.h>
-#include <LinearMath/btVector3.h>
-#include <LinearMath/btScalar.h>
-#include "LinearMath/btQuickprof.h"
+#include "btKrylovSolver.h"
+
 template <class MatrixX>
-class btConjugateResidual
+class btConjugateResidual : public btKrylovSolver<MatrixX>
 {
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    TVStack r,p,z,temp_p, temp_r, best_x;
-    // temp_r = A*r
-    // temp_p = A*p
-    // z = M^(-1) * temp_p = M^(-1) * A * p
-    int max_iterations;
-    btScalar tolerance_squared, best_r;
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	typedef btKrylovSolver<MatrixX> Base;
+	TVStack r, p, z, temp_p, temp_r, best_x;
+	// temp_r = A*r
+	// temp_p = A*p
+	// z = M^(-1) * temp_p = M^(-1) * A * p
+	btScalar best_r;
+
 public:
-    btConjugateResidual(const int max_it_in)
-    : max_iterations(max_it_in)
-    {
-        tolerance_squared = 1e-2;
-    }
-    
-    virtual ~btConjugateResidual(){}
-    
-    // return the number of iterations taken
-    int solve(MatrixX& A, TVStack& x, const TVStack& b, bool verbose = false)
-    {
-        BT_PROFILE("CRSolve");
-        btAssert(x.size() == b.size());
-        reinitialize(b);
-        // r = b - A * x --with assigned dof zeroed out
-        A.multiply(x, temp_r); // borrow temp_r here to store A*x
-        r = sub(b, temp_r);
-        // z = M^(-1) * r
-        A.precondition(r, z);  // borrow z to store preconditioned r
-        r = z;
-        btScalar residual_norm = norm(r);
-        if (residual_norm <= tolerance_squared) {
-            if (verbose)
-            {
-                std::cout << "Iteration = 0" << std::endl;
-                std::cout << "Two norm of the residual = " << residual_norm << std::endl;
-            }
-            return 0;
-        }
-        p = r;
-        btScalar r_dot_Ar, r_dot_Ar_new;
-        // temp_p = A*p
-        A.multiply(p, temp_p);
-        // temp_r = A*r
-        temp_r = temp_p;
-        r_dot_Ar = dot(r, temp_r);
-        for (int k = 1; k <= max_iterations; k++) {
-            // z = M^(-1) * Ap
-            A.precondition(temp_p, z);
-            // alpha = r^T * A * r / (Ap)^T * M^-1 * Ap)
-            btScalar alpha = r_dot_Ar / dot(temp_p, z);
-            //  x += alpha * p;
-            multAndAddTo(alpha, p, x);
-            //  r -= alpha * z;
-            multAndAddTo(-alpha, z, r);
-            btScalar norm_r = norm(r);
-            if (norm_r < best_r)
-            {
-                best_x = x;
-                best_r = norm_r;
-                if (norm_r < tolerance_squared) {
-                    if (verbose)
-                    {
-                        std::cout << "ConjugateResidual iterations " << k << std::endl;
-                    }
-                    return k;
-                }
-                else
-                {
-                    if (verbose)
-                    {
-                        std::cout << "ConjugateResidual iterations " << k << " has residual "<< norm_r << std::endl;
-                    }
-                }
-            }
-            // temp_r = A * r;
-            A.multiply(r, temp_r);
-            r_dot_Ar_new = dot(r, temp_r);
-            btScalar beta = r_dot_Ar_new/r_dot_Ar;
-            r_dot_Ar = r_dot_Ar_new;
-            // p = beta*p + r;
-            p = multAndAdd(beta, p, r);
-            // temp_p = beta*temp_p + temp_r;
-            temp_p = multAndAdd(beta, temp_p, temp_r);
-        }
-        if (verbose)
-        {
-            std::cout << "ConjugateResidual max iterations reached " << max_iterations << std::endl;
-        }
-        x = best_x;
-        return max_iterations;
-    }
-    
-    void reinitialize(const TVStack& b)
-    {
-        r.resize(b.size());
-        p.resize(b.size());
-        z.resize(b.size());
-        temp_p.resize(b.size());
-        temp_r.resize(b.size());
-        best_x.resize(b.size());
-        best_r = SIMD_INFINITY;
-    }
-    
-    TVStack sub(const TVStack& a, const TVStack& b)
-    {
-        // c = a-b
-        btAssert(a.size() == b.size());
-        TVStack c;
-        c.resize(a.size());
-        for (int i = 0; i < a.size(); ++i)
-        {
-            c[i] = a[i] - b[i];
-        }
-        return c;
-    }
-    
-    btScalar squaredNorm(const TVStack& a)
-    {
-        return dot(a,a);
-    }
-    
-    btScalar norm(const TVStack& a)
-    {
-        btScalar ret = 0;
-        for (int i = 0; i < a.size(); ++i)
-        {
-            for (int d = 0; d < 3; ++d)
-            {
-                ret = btMax(ret, btFabs(a[i][d]));
-            }
-        }
-        return ret;
-    }
-    
-    btScalar dot(const TVStack& a, const TVStack& b)
-    {
-        btScalar ans(0);
-        for (int i = 0; i < a.size(); ++i)
-            ans += a[i].dot(b[i]);
-        return ans;
-    }
-    
-    void multAndAddTo(btScalar s, const TVStack& a, TVStack& result)
-    {
-        //        result += s*a
-        btAssert(a.size() == result.size());
-        for (int i = 0; i < a.size(); ++i)
-            result[i] += s * a[i];
-    }
-    
-    TVStack multAndAdd(btScalar s, const TVStack& a, const TVStack& b)
-    {
-        // result = a*s + b
-        TVStack result;
-        result.resize(a.size());
-        for (int i = 0; i < a.size(); ++i)
-            result[i] = s * a[i] + b[i];
-        return result;
-    }
+	btConjugateResidual(const int max_it_in)  // max_it_in is forwarded to the btKrylovSolver base
+		: Base(max_it_in, 1e-8)  // second argument is presumably the tolerance read back as Base::m_tolerance -- TODO confirm in btKrylovSolver.h
+	{
+	}
+
+	virtual ~btConjugateResidual() {}  // virtual destructor with an empty body
+
+	// Preconditioned conjugate-residual iteration on A, starting from the incoming x. Returns the number of iterations taken (0 when the initial residual norm is within tolerance). verbose only affects the max-iterations message.
+	int solve(MatrixX& A, TVStack& x, const TVStack& b, bool verbose = false)
+	{
+		BT_PROFILE("CRSolve");
+		btAssert(x.size() == b.size());
+		reinitialize(b);  // size the work vectors and reset best_r to SIMD_INFINITY
+		// r = b - A * x --with assigned dof zeroed out
+		A.multiply(x, temp_r);  // borrow temp_r here to store A*x
+		r = this->sub(b, temp_r);
+		// z = M^(-1) * r
+		A.precondition(r, z);  // borrow z to store preconditioned r
+		r = z;  // from here on r holds the preconditioned residual
+		btScalar residual_norm = this->norm(r);
+		if (residual_norm <= Base::m_tolerance)  // initial guess is already good enough; x is left untouched
+		{
+			return 0;
+		}
+		p = r;
+		btScalar r_dot_Ar, r_dot_Ar_new;
+		// temp_p = A*p
+		A.multiply(p, temp_p);
+		// temp_r = A*r
+		temp_r = temp_p;  // valid because p == r at this point
+		r_dot_Ar = this->dot(r, temp_r);
+		for (int k = 1; k <= Base::m_maxIterations; k++)
+		{
+			// z = M^(-1) * Ap
+			A.precondition(temp_p, z);
+			// alpha = (r^T * A * r) / ((Ap)^T * M^-1 * Ap)
+			btScalar alpha = r_dot_Ar / this->dot(temp_p, z);  // NOTE(review): the denominator is not checked for zero
+			// x += alpha * p;
+			this->multAndAddTo(alpha, p, x);
+			// r -= alpha * z;
+			this->multAndAddTo(-alpha, z, r);
+			btScalar norm_r = this->norm(r);
+			if (norm_r < best_r)  // remember the iterate with the smallest residual norm seen so far
+			{
+				best_x = x;
+				best_r = norm_r;
+				if (norm_r < Base::m_tolerance)  // converged; x already equals best_x
+				{
+					return k;
+				}
+			}
+			// temp_r = A * r;
+			A.multiply(r, temp_r);
+			r_dot_Ar_new = this->dot(r, temp_r);
+			btScalar beta = r_dot_Ar_new / r_dot_Ar;
+			r_dot_Ar = r_dot_Ar_new;
+			// p = beta*p + r;
+			p = this->multAndAdd(beta, p, r);
+			// temp_p = beta*temp_p + temp_r;
+			temp_p = this->multAndAdd(beta, temp_p, temp_r);
+		}
+		if (verbose)
+		{
+			std::cout << "ConjugateResidual max iterations reached, residual = " << best_r << std::endl;
+		}
+		x = best_x;  // fall back to the best iterate recorded during the loop
+		return Base::m_maxIterations;
+	}
+
+	void reinitialize(const TVStack& b)  // Resizes every work vector to b.size() and resets the best-residual tracker.
+	{
+		r.resize(b.size());
+		p.resize(b.size());
+		z.resize(b.size());
+		temp_p.resize(b.size());
+		temp_r.resize(b.size());
+		best_x.resize(b.size());
+		best_r = SIMD_INFINITY;  // so the first residual norm computed in solve() compares as an improvement
+	}
 };
 #endif /* btConjugateResidual_h */
-
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.cpp b/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.cpp
index 5381ee6265..2455ed2138 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.cpp
@@ -17,211 +17,283 @@
 #include "btPreconditioner.h"
 #include "LinearMath/btQuickprof.h"
 
-btDeformableBackwardEulerObjective::btDeformableBackwardEulerObjective(btAlignedObjectArray<btSoftBody *>& softBodies, const TVStack& backup_v)
-: m_softBodies(softBodies)
-, m_projection(softBodies)
-, m_backupVelocity(backup_v)
-, m_implicit(false)
+btDeformableBackwardEulerObjective::btDeformableBackwardEulerObjective(btAlignedObjectArray<btSoftBody*>& softBodies, const TVStack& backup_v)
+	: m_softBodies(softBodies), m_projection(softBodies), m_backupVelocity(backup_v), m_implicit(false)
 {
-    m_massPreconditioner = new MassPreconditioner(m_softBodies);
-    m_KKTPreconditioner = new KKTPreconditioner(m_softBodies, m_projection, m_lf, m_dt, m_implicit);
-    m_preconditioner = m_KKTPreconditioner;
+	m_massPreconditioner = new MassPreconditioner(m_softBodies);  // owned; released in the destructor
+	m_KKTPreconditioner = new KKTPreconditioner(m_softBodies, m_projection, m_lf, m_dt, m_implicit);  // NOTE(review): m_dt is not yet assigned here; presumably taken by reference - confirm
+	m_preconditioner = m_KKTPreconditioner;  // the KKT preconditioner is the one used by default
 }
 
 btDeformableBackwardEulerObjective::~btDeformableBackwardEulerObjective()
 {
-    delete m_KKTPreconditioner;
-    delete m_massPreconditioner;
+	delete m_KKTPreconditioner;  // m_preconditioner only aliases one of the two preconditioners, so it is not deleted separately
+	delete m_massPreconditioner;
 }
 
 void btDeformableBackwardEulerObjective::reinitialize(bool nodeUpdated, btScalar dt)
 {
-    BT_PROFILE("reinitialize");
-    if (dt > 0)
-    {
-        setDt(dt);
-    }
-    if(nodeUpdated)
-    {
-        updateId();
-    }
-    for (int i = 0; i < m_lf.size(); ++i)
-    {
-        m_lf[i]->reinitialize(nodeUpdated);
-    }
-    m_projection.reinitialize(nodeUpdated);
-//    m_preconditioner->reinitialize(nodeUpdated);
+	BT_PROFILE("reinitialize");
+	if (dt > 0)  // a non-positive dt leaves the stored time step unchanged
+	{
+		setDt(dt);
+	}
+	if (nodeUpdated)  // node set changed: reassign global node/face indices
+	{
+		updateId();
+	}
+	for (int i = 0; i < m_lf.size(); ++i)
+	{
+		m_lf[i]->reinitialize(nodeUpdated);
+	}
+	btMatrix3x3 I;
+	I.setIdentity();
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			if (psb->m_nodes[j].m_im > 0)  // nodes with zero inverse mass keep their previous m_effectiveMass
+				psb->m_nodes[j].m_effectiveMass = I * (1.0 / psb->m_nodes[j].m_im);  // reset to mass * identity
+		}
+	}
+	m_projection.reinitialize(nodeUpdated);
+	//    m_preconditioner->reinitialize(nodeUpdated);
 }
 
 void btDeformableBackwardEulerObjective::setDt(btScalar dt)
 {
-    m_dt = dt;
+	m_dt = dt;  // time step used to scale the force terms in multiply() and applyExplicitForce()
 }
 
 void btDeformableBackwardEulerObjective::multiply(const TVStack& x, TVStack& b) const
 {
-    BT_PROFILE("multiply");
-    // add in the mass term
-    size_t counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            const btSoftBody::Node& node = psb->m_nodes[j];
-            b[counter] = (node.m_im == 0) ? btVector3(0,0,0) : x[counter] / node.m_im;
-            ++counter;
-        }
-    }
-
-    for (int i = 0; i < m_lf.size(); ++i)
-    {
-        // add damping matrix
-        m_lf[i]->addScaledDampingForceDifferential(-m_dt, x, b);
-        if (m_implicit)
-        {
-             m_lf[i]->addScaledElasticForceDifferential(-m_dt*m_dt, x, b);
-        }
-    }
-    int offset = m_nodes.size();
-    for (int i = offset; i < b.size(); ++i)
-    {
-        b[i].setZero();
-    }
-    // add in the lagrange multiplier terms
-    
-    for (int c = 0; c < m_projection.m_lagrangeMultipliers.size(); ++c)
-    {
-        // C^T * lambda
-        const LagrangeMultiplier& lm = m_projection.m_lagrangeMultipliers[c];
-        for (int i = 0; i < lm.m_num_nodes; ++i)
-        {
-            for (int j = 0; j < lm.m_num_constraints; ++j)
-            {
-                b[lm.m_indices[i]] += x[offset+c][j] * lm.m_weights[i] * lm.m_dirs[j];
-            }
-        }
-        // C * x
-        for (int d = 0; d < lm.m_num_constraints; ++d)
-        {
-            for (int i = 0; i < lm.m_num_nodes; ++i)
-            {
-                b[offset+c][d] += lm.m_weights[i] * x[lm.m_indices[i]].dot(lm.m_dirs[d]);
-            }
-        }
-    }
+	BT_PROFILE("multiply");
+	// add in the mass term
+	size_t counter = 0;  // running node index across all soft bodies
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			const btSoftBody::Node& node = psb->m_nodes[j];
+			b[counter] = (node.m_im == 0) ? btVector3(0, 0, 0) : x[counter] / node.m_im;  // zero row for nodes with zero inverse mass
+			++counter;
+		}
+	}
+
+	for (int i = 0; i < m_lf.size(); ++i)
+	{
+		// add damping matrix
+		m_lf[i]->addScaledDampingForceDifferential(-m_dt, x, b);
+        // Always integrate picking force implicitly for stability.
+        if (m_implicit || m_lf[i]->getForceType() == BT_MOUSE_PICKING_FORCE)
+		{
+			m_lf[i]->addScaledElasticForceDifferential(-m_dt * m_dt, x, b);
+		}
+	}
+	int offset = m_nodes.size();  // entries at and beyond this index belong to the Lagrange multipliers
+	for (int i = offset; i < b.size(); ++i)
+	{
+		b[i].setZero();  // multiplier rows are accumulated with += below, so clear them first
+	}
+	// add in the lagrange multiplier terms
+
+	for (int c = 0; c < m_projection.m_lagrangeMultipliers.size(); ++c)
+	{
+		// C^T * lambda
+		const LagrangeMultiplier& lm = m_projection.m_lagrangeMultipliers[c];
+		for (int i = 0; i < lm.m_num_nodes; ++i)
+		{
+			for (int j = 0; j < lm.m_num_constraints; ++j)
+			{
+				b[lm.m_indices[i]] += x[offset + c][j] * lm.m_weights[i] * lm.m_dirs[j];
+			}
+		}
+		// C * x
+		for (int d = 0; d < lm.m_num_constraints; ++d)
+		{
+			for (int i = 0; i < lm.m_num_nodes; ++i)
+			{
+				b[offset + c][d] += lm.m_weights[i] * x[lm.m_indices[i]].dot(lm.m_dirs[d]);
+			}
+		}
+	}
 }
 
 void btDeformableBackwardEulerObjective::updateVelocity(const TVStack& dv)
 {
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            btSoftBody::Node& node = psb->m_nodes[j];
-            node.m_v = m_backupVelocity[node.index] + dv[node.index];
-        }
-    }
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			btSoftBody::Node& node = psb->m_nodes[j];
+			node.m_v = m_backupVelocity[node.index] + dv[node.index];  // velocity = backed-up velocity + delta, both looked up by node.index
+		}
+	}
 }
 
 void btDeformableBackwardEulerObjective::applyForce(TVStack& force, bool setZero)
 {
-    size_t counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (!psb->isActive())
-        {
-            counter += psb->m_nodes.size();
-            continue;
-        }
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            btScalar one_over_mass = (psb->m_nodes[j].m_im == 0) ? 0 : psb->m_nodes[j].m_im;
-            psb->m_nodes[j].m_v += one_over_mass * force[counter++];
-        }
-    }
-    if (setZero)
-    {
-        for (int i = 0; i < force.size(); ++i)
-            force[i].setZero();
-    }
+	size_t counter = 0;  // index into force: one entry per node, across all soft bodies in order
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (!psb->isActive())
+		{
+			counter += psb->m_nodes.size();  // skip the force entries of an inactive body
+			continue;
+		}
+		if (m_implicit)
+		{
+			for (int j = 0; j < psb->m_nodes.size(); ++j, ++counter)  // advance once per node, even for nodes skipped below, so force stays aligned with the nodes
+			{
+				if (psb->m_nodes[j].m_im != 0)
+				{
+					psb->m_nodes[j].m_v += psb->m_nodes[j].m_effectiveMass_inv * force[counter];  // implicit mode uses the 3x3 inverse effective mass
+				}
+			}
+		}
+		else
+		{
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				btScalar one_over_mass = (psb->m_nodes[j].m_im == 0) ? 0 : psb->m_nodes[j].m_im;
+				psb->m_nodes[j].m_v += one_over_mass * force[counter++];
+			}
+		}
+	}
+	if (setZero)  // optionally clear the force buffer after it has been applied
+	{
+		for (int i = 0; i < force.size(); ++i)
+			force[i].setZero();
+	}
 }
 
-void btDeformableBackwardEulerObjective::computeResidual(btScalar dt, TVStack &residual)
+void btDeformableBackwardEulerObjective::computeResidual(btScalar dt, TVStack& residual)
 {
-    BT_PROFILE("computeResidual");
-    // add implicit force
-    for (int i = 0; i < m_lf.size(); ++i)
-    {
-        if (m_implicit)
-        {
-            m_lf[i]->addScaledForces(dt, residual);
-        }
-        else
-        {
-            m_lf[i]->addScaledDampingForce(dt, residual);
-        }
-    }
-//    m_projection.project(residual);
+	BT_PROFILE("computeResidual");
+	// add implicit force
+	for (int i = 0; i < m_lf.size(); ++i)
+	{
+        // Always integrate picking force implicitly for stability.
+		if (m_implicit || m_lf[i]->getForceType() == BT_MOUSE_PICKING_FORCE)
+		{
+			m_lf[i]->addScaledForces(dt, residual);  // contributions are accumulated into residual, scaled by dt
+		}
+		else
+		{
+			m_lf[i]->addScaledDampingForce(dt, residual);  // otherwise only the damping part is added here
+		}
+	}
+	//    m_projection.project(residual);
 }
 
 btScalar btDeformableBackwardEulerObjective::computeNorm(const TVStack& residual) const
 {
-    btScalar mag = 0;
-    for (int i = 0; i < residual.size(); ++i)
-    {
-        mag += residual[i].length2();
-    }
-    return std::sqrt(mag);
+	btScalar mag = 0;  // running sum of squared lengths
+	for (int i = 0; i < residual.size(); ++i)
+	{
+		mag += residual[i].length2();
+	}
+	return std::sqrt(mag);  // Euclidean norm over all entries; 0 for an empty stack
 }
 
 btScalar btDeformableBackwardEulerObjective::totalEnergy(btScalar dt)
 {
-    btScalar e = 0;
-    for (int i = 0; i < m_lf.size(); ++i)
-    {
-        e += m_lf[i]->totalEnergy(dt);
-    }
-    return e;
+	btScalar e = 0;
+	for (int i = 0; i < m_lf.size(); ++i)
+	{
+		e += m_lf[i]->totalEnergy(dt);  // sum the energy reported by every registered force
+	}
+	return e;
 }
 
 void btDeformableBackwardEulerObjective::applyExplicitForce(TVStack& force)
 {
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        m_softBodies[i]->advanceDeformation();
-    }
-    
-    for (int i = 0; i < m_lf.size(); ++i)
-    {
-        m_lf[i]->addScaledExplicitForce(m_dt, force);
-    }
-    applyForce(force, true);
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		m_softBodies[i]->advanceDeformation();
+	}
+	if (m_implicit)
+	{
+		// apply forces except gravity force
+		btVector3 gravity(0, 0, 0);  // stays zero when no gravity force is registered, so nothing undefined is added to the velocities below
+		for (int i = 0; i < m_lf.size(); ++i)
+		{
+			if (m_lf[i]->getForceType() == BT_GRAVITY_FORCE)
+			{
+				gravity = static_cast<btDeformableGravityForce*>(m_lf[i])->m_gravity;  // remembered here, applied directly to velocities below
+			}
+			else
+			{
+				m_lf[i]->addScaledForces(m_dt, force);
+			}
+		}
+		for (int i = 0; i < m_lf.size(); ++i)
+		{
+			m_lf[i]->addScaledHessian(m_dt);
+		}
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (psb->isActive())
+			{
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					// add gravity explicitly
+					psb->m_nodes[j].m_v += m_dt * psb->m_gravityFactor * gravity;
+				}
+			}
+		}
+	}
+	else
+	{
+		for (int i = 0; i < m_lf.size(); ++i)
+		{
+			m_lf[i]->addScaledExplicitForce(m_dt, force);
+		}
+	}
+	// calculate inverse mass matrix for all nodes
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (psb->isActive())
+		{
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				if (psb->m_nodes[j].m_im > 0)
+				{
+					psb->m_nodes[j].m_effectiveMass_inv = psb->m_nodes[j].m_effectiveMass.inverse();  // must happen before applyForce(), which reads m_effectiveMass_inv in implicit mode
+				}
+			}
+		}
+	}
+	applyForce(force, true);  // apply the accumulated force to the velocities and clear the buffer
 }
 
 void btDeformableBackwardEulerObjective::initialGuess(TVStack& dv, const TVStack& residual)
 {
-    size_t counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            dv[counter] = psb->m_nodes[j].m_im * residual[counter];
-            ++counter;
-        }
-    }
+	size_t counter = 0;  // running node index across all soft bodies
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			dv[counter] = psb->m_nodes[j].m_im * residual[counter];  // inverse mass times residual; zero for nodes with m_im == 0
+			++counter;
+		}
+	}
 }
 
 //set constraints as projections
 void btDeformableBackwardEulerObjective::setConstraints(const btContactSolverInfo& infoGlobal)
 {
-    m_projection.setConstraints(infoGlobal);
+	m_projection.setConstraints(infoGlobal);  // pure delegation to the contact projection
 }
 
 void btDeformableBackwardEulerObjective::applyDynamicFriction(TVStack& r)
 {
-     m_projection.applyDynamicFriction(r);
+	m_projection.applyDynamicFriction(r);  // pure delegation; r is modified in place by the projection
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.h b/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.h
index 86579e71ac..eb05b9f010 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableBackwardEulerObjective.h
@@ -31,143 +31,168 @@
 class btDeformableBackwardEulerObjective
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btScalar m_dt;
-    btAlignedObjectArray<btDeformableLagrangianForce*> m_lf;
-    btAlignedObjectArray<btSoftBody *>& m_softBodies;
-    Preconditioner* m_preconditioner;
-    btDeformableContactProjection m_projection;
-    const TVStack& m_backupVelocity;
-    btAlignedObjectArray<btSoftBody::Node* > m_nodes;
-    bool m_implicit;
-    MassPreconditioner* m_massPreconditioner;
-    KKTPreconditioner* m_KKTPreconditioner;
-
-    btDeformableBackwardEulerObjective(btAlignedObjectArray<btSoftBody *>& softBodies, const TVStack& backup_v);
-    
-    virtual ~btDeformableBackwardEulerObjective();
-    
-    void initialize(){}
-    
-    // compute the rhs for CG solve, i.e, add the dt scaled implicit force to residual
-    void computeResidual(btScalar dt, TVStack& residual);
-    
-    // add explicit force to the velocity
-    void applyExplicitForce(TVStack& force);
-    
-    // apply force to velocity and optionally reset the force to zero
-    void applyForce(TVStack& force, bool setZero);
-    
-    // compute the norm of the residual
-    btScalar computeNorm(const TVStack& residual) const;
-    
-    // compute one step of the solve (there is only one solve if the system is linear)
-    void computeStep(TVStack& dv, const TVStack& residual, const btScalar& dt);
-    
-    // perform A*x = b
-    void multiply(const TVStack& x, TVStack& b) const;
-    
-    // set initial guess for CG solve
-    void initialGuess(TVStack& dv, const TVStack& residual);
-    
-    // reset data structure and reset dt
-    void reinitialize(bool nodeUpdated, btScalar dt);
-    
-    void setDt(btScalar dt);
-    
-    // add friction force to residual
-    void applyDynamicFriction(TVStack& r);
-    
-    // add dv to velocity
-    void updateVelocity(const TVStack& dv);
-    
-    //set constraints as projections
-    void setConstraints(const btContactSolverInfo& infoGlobal);
-    
-    // update the projections and project the residual
-    void project(TVStack& r)
-    {
-        BT_PROFILE("project");
-        m_projection.project(r);
-    }
-    
-    // perform precondition M^(-1) x = b
-    void precondition(const TVStack& x, TVStack& b)
-    {
-        m_preconditioner->operator()(x,b);
-    }
-
-    // reindex all the vertices 
-    virtual void updateId()
-    {
-        size_t node_id = 0;
-        size_t face_id = 0;
-        m_nodes.clear();
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                psb->m_nodes[j].index = node_id;
-                m_nodes.push_back(&psb->m_nodes[j]);
-                ++node_id;
-            }
-            for (int j = 0; j < psb->m_faces.size(); ++j)
-            {
-                psb->m_faces[j].m_index = face_id;
-                ++face_id;
-            }
-        }
-    }
-    
-    const btAlignedObjectArray<btSoftBody::Node*>* getIndices() const
-    {
-        return &m_nodes;
-    }
-    
-    void setImplicit(bool implicit)
-    {
-        m_implicit = implicit;
-    }
-
-    // Calculate the total potential energy in the system
-    btScalar totalEnergy(btScalar dt);
-    
-    void addLagrangeMultiplier(const TVStack& vec, TVStack& extended_vec)
-    {
-        extended_vec.resize(vec.size() + m_projection.m_lagrangeMultipliers.size());
-        for (int i = 0; i < vec.size(); ++i)
-        {
-            extended_vec[i] = vec[i];
-        }
-        int offset = vec.size();
-        for (int i = 0; i < m_projection.m_lagrangeMultipliers.size(); ++i)
-        {
-            extended_vec[offset + i].setZero();
-        }
-    }
-    
-    void addLagrangeMultiplierRHS(const TVStack& residual, const TVStack& m_dv, TVStack& extended_residual)
-    {
-        extended_residual.resize(residual.size() + m_projection.m_lagrangeMultipliers.size());
-        for (int i = 0; i < residual.size(); ++i)
-        {
-            extended_residual[i] = residual[i];
-        }
-        int offset = residual.size();
-        for (int i = 0; i < m_projection.m_lagrangeMultipliers.size(); ++i)
-        {
-            const LagrangeMultiplier& lm = m_projection.m_lagrangeMultipliers[i];
-            extended_residual[offset + i].setZero();
-            for (int d = 0; d < lm.m_num_constraints; ++d)
-            {
-                for (int n = 0; n < lm.m_num_nodes; ++n)
-                {
-                    extended_residual[offset + i][d] += lm.m_weights[n] * m_dv[lm.m_indices[n]].dot(lm.m_dirs[d]);
-                }
-            }
-        }
-    }
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btScalar m_dt;  // time step, assigned through setDt()
+	btAlignedObjectArray<btDeformableLagrangianForce*> m_lf;  // forces whose contributions this objective accumulates
+	btAlignedObjectArray<btSoftBody*>& m_softBodies;  // reference to an array owned elsewhere
+	Preconditioner* m_preconditioner;  // the preconditioner applied by precondition()
+	btDeformableContactProjection m_projection;
+	const TVStack& m_backupVelocity;  // reference to a velocity backup owned elsewhere
+	btAlignedObjectArray<btSoftBody::Node*> m_nodes;  // rebuilt by updateId(), ordered by global node index
+	bool m_implicit;  // toggled through setImplicit()
+	MassPreconditioner* m_massPreconditioner;
+	KKTPreconditioner* m_KKTPreconditioner;
+
+	btDeformableBackwardEulerObjective(btAlignedObjectArray<btSoftBody*>& softBodies, const TVStack& backup_v);
+
+	virtual ~btDeformableBackwardEulerObjective();
+
+	void initialize() {}
+
+	// compute the rhs for CG solve, i.e, add the dt scaled implicit force to residual
+	void computeResidual(btScalar dt, TVStack& residual);
+
+	// add explicit force to the velocity
+	void applyExplicitForce(TVStack& force);
+
+	// apply force to velocity and optionally reset the force to zero
+	void applyForce(TVStack& force, bool setZero);
+
+	// compute the norm of the residual
+	btScalar computeNorm(const TVStack& residual) const;
+
+	// compute one step of the solve (there is only one solve if the system is linear)
+	void computeStep(TVStack& dv, const TVStack& residual, const btScalar& dt);
+
+	// perform A*x = b
+	void multiply(const TVStack& x, TVStack& b) const;
+
+	// set initial guess for CG solve
+	void initialGuess(TVStack& dv, const TVStack& residual);
+
+	// reset data structure and reset dt
+	void reinitialize(bool nodeUpdated, btScalar dt);
+
+	void setDt(btScalar dt);
+
+	// add friction force to residual
+	void applyDynamicFriction(TVStack& r);
+
+	// add dv to velocity
+	void updateVelocity(const TVStack& dv);
+
+	//set constraints as projections
+	void setConstraints(const btContactSolverInfo& infoGlobal);
+
+	// update the projections and project the residual
+	void project(TVStack& r)
+	{
+		BT_PROFILE("project");
+		m_projection.project(r);  // r is modified in place
+	}
+
+	// apply the active preconditioner: x is the input, b receives the result (b = M^(-1) x)
+	void precondition(const TVStack& x, TVStack& b)
+	{
+		m_preconditioner->operator()(x, b);
+	}
+
+	// reindex all the nodes and faces of every soft body and rebuild m_nodes
+	virtual void updateId()
+	{
+		size_t node_id = 0;  // indices keep counting across soft bodies
+		size_t face_id = 0;
+		m_nodes.clear();
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				psb->m_nodes[j].index = node_id;
+				m_nodes.push_back(&psb->m_nodes[j]);  // m_nodes[k] points at the node whose index is k
+				++node_id;
+			}
+			for (int j = 0; j < psb->m_faces.size(); ++j)
+			{
+				psb->m_faces[j].m_index = face_id;
+				++face_id;
+			}
+		}
+	}
+
+	const btAlignedObjectArray<btSoftBody::Node*>* getIndices() const  // read-only access to the node table built by updateId()
+	{
+		return &m_nodes;
+	}
+
+	void setImplicit(bool implicit)  // select implicit (true) or explicit (false) handling of the force terms
+	{
+		m_implicit = implicit;
+	}
+
+	// Calculate the total potential energy in the system
+	btScalar totalEnergy(btScalar dt);
+
+	void addLagrangeMultiplier(const TVStack& vec, TVStack& extended_vec)  // extended_vec = vec followed by one zero entry per Lagrange multiplier
+	{
+		extended_vec.resize(vec.size() + m_projection.m_lagrangeMultipliers.size());
+		for (int i = 0; i < vec.size(); ++i)
+		{
+			extended_vec[i] = vec[i];
+		}
+		int offset = vec.size();  // first multiplier slot
+		for (int i = 0; i < m_projection.m_lagrangeMultipliers.size(); ++i)
+		{
+			extended_vec[offset + i].setZero();
+		}
+	}
+
+	void addLagrangeMultiplierRHS(const TVStack& residual, const TVStack& m_dv, TVStack& extended_residual)  // extended_residual = residual followed by one constraint entry per Lagrange multiplier
+	{
+		extended_residual.resize(residual.size() + m_projection.m_lagrangeMultipliers.size());
+		for (int i = 0; i < residual.size(); ++i)
+		{
+			extended_residual[i] = residual[i];
+		}
+		int offset = residual.size();  // first multiplier slot
+		for (int i = 0; i < m_projection.m_lagrangeMultipliers.size(); ++i)
+		{
+			const LagrangeMultiplier& lm = m_projection.m_lagrangeMultipliers[i];
+			extended_residual[offset + i].setZero();  // cleared first because the loop below accumulates with +=
+			for (int d = 0; d < lm.m_num_constraints; ++d)
+			{
+				for (int n = 0; n < lm.m_num_nodes; ++n)
+				{
+					extended_residual[offset + i][d] += lm.m_weights[n] * m_dv[lm.m_indices[n]].dot(lm.m_dirs[d]);  // weighted projection of m_dv onto constraint direction d
+				}
+			}
+		}
+	}
+
+	void calculateContactForce(const TVStack& dv, const TVStack& rhs, TVStack& f)  // f = rhs - (mass term + damping term applied to dv); f is not resized here
+	{
+		size_t counter = 0;  // running node index across all soft bodies
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				const btSoftBody::Node& node = psb->m_nodes[j];
+				f[counter] = (node.m_im == 0) ? btVector3(0, 0, 0) : dv[counter] / node.m_im;  // mass * dv, zero for nodes with zero inverse mass
+				++counter;
+			}
+		}
+		for (int i = 0; i < m_lf.size(); ++i)
+		{
+			// add damping matrix
+			m_lf[i]->addScaledDampingForceDifferential(-m_dt, dv, f);
+		}
+		counter = 0;
+		for (; counter < f.size(); ++counter)  // NOTE(review): reads rhs[counter] for every entry of f; assumes rhs has at least f.size() entries - confirm at call sites
+		{
+			f[counter] = rhs[counter] - f[counter];
+		}
+	}
 };
 
 #endif /* btBackwardEulerObjective_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.cpp b/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.cpp
index 132699c54f..4b11fccecb 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.cpp
@@ -18,468 +18,489 @@
 #include "btDeformableBodySolver.h"
 #include "btSoftBodyInternals.h"
 #include "LinearMath/btQuickprof.h"
-static const int kMaxConjugateGradientIterations = 50;
+static const int kMaxConjugateGradientIterations = 300;  // iteration cap handed to both m_cg and m_cr in the constructor
 btDeformableBodySolver::btDeformableBodySolver()
-: m_numNodes(0)
-, m_cg(kMaxConjugateGradientIterations)
-, m_cr(kMaxConjugateGradientIterations)
-, m_maxNewtonIterations(5)
-, m_newtonTolerance(1e-4)
-, m_lineSearch(false)
-, m_useProjection(false)
+	: m_numNodes(0), m_cg(kMaxConjugateGradientIterations), m_cr(kMaxConjugateGradientIterations), m_maxNewtonIterations(1), m_newtonTolerance(1e-4), m_lineSearch(false), m_useProjection(false)
 {
-    m_objective = new btDeformableBackwardEulerObjective(m_softBodies, m_backupVelocity);
+	m_objective = new btDeformableBackwardEulerObjective(m_softBodies, m_backupVelocity);  // owned; released in the destructor
 }
 
 btDeformableBodySolver::~btDeformableBodySolver()
 {
-    delete m_objective;
+	delete m_objective;  // allocated with new in the constructor
 }
 
 void btDeformableBodySolver::solveDeformableConstraints(btScalar solverdt)
 {
-    BT_PROFILE("solveDeformableConstraints");
-    if (!m_implicit)
-    {
-        m_objective->computeResidual(solverdt, m_residual);
-        m_objective->applyDynamicFriction(m_residual);
-        if (m_useProjection)
-        {
-            computeStep(m_dv, m_residual);
-        }
-        else
-        {
-            TVStack rhs, x;
-            m_objective->addLagrangeMultiplierRHS(m_residual, m_dv, rhs);
-            m_objective->addLagrangeMultiplier(m_dv, x);
-            m_objective->m_preconditioner->reinitialize(true);
-            computeStep(x, rhs);
-            for (int i = 0; i<m_dv.size(); ++i)
-            {
-                    m_dv[i] = x[i];
-            }
-        }
-        updateVelocity();
-    }
-    else
-    {
-        for (int i = 0; i < m_maxNewtonIterations; ++i)
-        {
-            updateState();
-            // add the inertia term in the residual
-            int counter = 0;
-            for (int k = 0; k < m_softBodies.size(); ++k)
-            {
-                btSoftBody* psb = m_softBodies[k];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                {
-                    if (psb->m_nodes[j].m_im > 0)
-                    {
-                        m_residual[counter] = (-1./psb->m_nodes[j].m_im) *  m_dv[counter];
-                    }
-                    ++counter;
-                }
-            }
-
-            m_objective->computeResidual(solverdt, m_residual);
-            if (m_objective->computeNorm(m_residual) < m_newtonTolerance && i > 0)
-            {
-                break;
-            }
-            // todo xuchenhan@: this really only needs to be calculated once
-            m_objective->applyDynamicFriction(m_residual);
-            if (m_lineSearch)
-            {
-                btScalar inner_product = computeDescentStep(m_ddv,m_residual);
-                btScalar alpha = 0.01, beta = 0.5; // Boyd & Vandenberghe suggested alpha between 0.01 and 0.3, beta between 0.1 to 0.8
-                btScalar scale = 2;
-                btScalar f0 = m_objective->totalEnergy(solverdt)+kineticEnergy(), f1, f2;
-                backupDv();
-                do {
-                    scale *= beta;
-                    if (scale < 1e-8) {
-                        return;
-                    }
-                    updateEnergy(scale);
-                    f1 = m_objective->totalEnergy(solverdt)+kineticEnergy();
-                    f2 = f0 - alpha * scale * inner_product;
-                } while (!(f1 < f2+SIMD_EPSILON)); // if anything here is nan then the search continues
-                revertDv();
-                updateDv(scale);
-            }
-            else
-            {
-                computeStep(m_ddv, m_residual);
-                updateDv();
-            }
-            for (int j = 0; j < m_numNodes; ++j)
-            {
-                m_ddv[j].setZero();
-                m_residual[j].setZero();
-            }
-        }
-        updateVelocity();
-    }
+	BT_PROFILE("solveDeformableConstraints");
+	if (!m_implicit)  // explicit path: one linear solve for m_dv
+	{
+		m_objective->computeResidual(solverdt, m_residual);
+		m_objective->applyDynamicFriction(m_residual);
+		if (m_useProjection)
+		{
+			computeStep(m_dv, m_residual);
+		}
+		else
+		{
+			TVStack rhs, x;  // system extended with one entry per Lagrange multiplier
+			m_objective->addLagrangeMultiplierRHS(m_residual, m_dv, rhs);
+			m_objective->addLagrangeMultiplier(m_dv, x);
+			m_objective->m_preconditioner->reinitialize(true);
+			computeStep(x, rhs);
+			for (int i = 0; i < m_dv.size(); ++i)  // copy back the node part only; the multiplier entries of x are discarded
+			{
+				m_dv[i] = x[i];
+			}
+		}
+		updateVelocity();
+	}
+	else
+	{
+		for (int i = 0; i < m_maxNewtonIterations; ++i)  // implicit path: Newton iterations on m_dv
+		{
+			updateState();
+			// add the inertia term in the residual
+			int counter = 0;
+			for (int k = 0; k < m_softBodies.size(); ++k)
+			{
+				btSoftBody* psb = m_softBodies[k];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					if (psb->m_nodes[j].m_im > 0)
+					{
+						m_residual[counter] = (-1. / psb->m_nodes[j].m_im) * m_dv[counter];
+					}
+					++counter;
+				}
+			}
+
+			m_objective->computeResidual(solverdt, m_residual);
+			if (m_objective->computeNorm(m_residual) < m_newtonTolerance && i > 0)  // the first iteration is never skipped
+			{
+				break;
+			}
+			// todo xuchenhan@: this really only needs to be calculated once
+			m_objective->applyDynamicFriction(m_residual);
+			if (m_lineSearch)
+			{
+				btScalar inner_product = computeDescentStep(m_ddv, m_residual);
+				btScalar alpha = 0.01, beta = 0.5;  // Boyd & Vandenberghe suggested alpha between 0.01 and 0.3, beta between 0.1 to 0.8
+				btScalar scale = 2;  // halved before first use, so the first trial step has scale 1
+				btScalar f0 = m_objective->totalEnergy(solverdt) + kineticEnergy(), f1, f2;
+				backupDv();
+				do
+				{
+					scale *= beta;
+					if (scale < 1e-8)
+					{
+						return;  // NOTE(review): leaves without revertDv()/updateVelocity(); confirm this early exit is intended
+					}
+					updateEnergy(scale);
+					f1 = m_objective->totalEnergy(solverdt) + kineticEnergy();
+					f2 = f0 - alpha * scale * inner_product;
+				} while (!(f1 < f2 + SIMD_EPSILON));  // if anything here is nan then the search continues
+				revertDv();
+				updateDv(scale);
+			}
+			else
+			{
+				computeStep(m_ddv, m_residual);
+				updateDv();
+			}
+			for (int j = 0; j < m_numNodes; ++j)  // clear the per-iteration buffers for the next Newton step
+			{
+				m_ddv[j].setZero();
+				m_residual[j].setZero();
+			}
+		}
+		updateVelocity();
+	}
 }
 
 btScalar btDeformableBodySolver::kineticEnergy()
 {
-    btScalar ke = 0;
-    for (int i = 0; i < m_softBodies.size();++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        for (int j = 0; j < psb->m_nodes.size();++j)
-        {
-            btSoftBody::Node& node = psb->m_nodes[j];
-            if (node.m_im > 0)
-            {
-                ke += m_dv[node.index].length2() * 0.5 / node.m_im;
-            }
-        }
-    }
-    return ke;
+	btScalar ke = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			btSoftBody::Node& node = psb->m_nodes[j];
+			if (node.m_im > 0)  // nodes with zero inverse mass contribute nothing
+			{
+				ke += m_dv[node.index].length2() * 0.5 / node.m_im;  // 0.5 * mass * |dv|^2, computed from m_dv rather than the node velocity
+			}
+		}
+	}
+	return ke;
 }
 
 void btDeformableBodySolver::backupDv()
 {
-    m_backup_dv.resize(m_dv.size());
-    for (int i = 0; i<m_backup_dv.size(); ++i)
-    {
-        m_backup_dv[i] = m_dv[i];
-    }
+	m_backup_dv.resize(m_dv.size());
+	for (int i = 0; i < m_backup_dv.size(); ++i)
+	{
+		m_backup_dv[i] = m_dv[i];  // element-wise snapshot of m_dv, restored by revertDv()
+	}
 }
 
 void btDeformableBodySolver::revertDv()
 {
-    for (int i = 0; i<m_backup_dv.size(); ++i)
-    {
-        m_dv[i] = m_backup_dv[i];
-    }
+	for (int i = 0; i < m_backup_dv.size(); ++i)  // iterates over the backup's size; m_dv itself is not resized here
+	{
+		m_dv[i] = m_backup_dv[i];
+	}
 }
 
 void btDeformableBodySolver::updateEnergy(btScalar scale)
 {
-    for (int i = 0; i<m_dv.size(); ++i)
-    {
-        m_dv[i] = m_backup_dv[i] + scale * m_ddv[i];
-    }
-    updateState();
+	for (int i = 0; i < m_dv.size(); ++i)
+	{
+		m_dv[i] = m_backup_dv[i] + scale * m_ddv[i];  // trial step taken from the backed-up dv, not from the current one
+	}
+	updateState();  // refresh velocities and temporary positions for the trial dv
 }
 
-
 btScalar btDeformableBodySolver::computeDescentStep(TVStack& ddv, const TVStack& residual, bool verbose)
 {
-    m_cg.solve(*m_objective, ddv, residual, false);
-    btScalar inner_product = m_cg.dot(residual, m_ddv);
-    btScalar res_norm = m_objective->computeNorm(residual);
-    btScalar tol = 1e-5 * res_norm * m_objective->computeNorm(m_ddv);
-    if (inner_product < -tol)
-    {
-        if (verbose)
-        {
-            std::cout << "Looking backwards!" << std::endl;
-        }
-        for (int i = 0; i < m_ddv.size();++i)
-        {
-            m_ddv[i] = -m_ddv[i];
-        }
-        inner_product = -inner_product;
-    }
-    else if (std::abs(inner_product) < tol)
-    {
-        if (verbose)
-        {
-            std::cout << "Gradient Descent!" << std::endl;
-        }
-        btScalar scale = m_objective->computeNorm(m_ddv) / res_norm;
-        for (int i = 0; i < m_ddv.size();++i)
-        {
-            m_ddv[i] = scale * residual[i];
-        }
-        inner_product = scale * res_norm * res_norm;
-    }
-    return inner_product;
+	m_cg.solve(*m_objective, ddv, residual, false);  // solve for the step into the caller's ddv; everything below works on ddv as well
+	btScalar inner_product = m_cg.dot(residual, ddv);
+	btScalar res_norm = m_objective->computeNorm(residual);
+	btScalar tol = 1e-5 * res_norm * m_objective->computeNorm(ddv);
+	if (inner_product < -tol)  // step points against the residual: flip it
+	{
+		if (verbose)
+		{
+			std::cout << "Looking backwards!" << std::endl;
+		}
+		for (int i = 0; i < ddv.size(); ++i)
+		{
+			ddv[i] = -ddv[i];
+		}
+		inner_product = -inner_product;
+	}
+	else if (std::abs(inner_product) < tol)  // step nearly orthogonal to the residual: use the rescaled residual instead
+	{
+		if (verbose)
+		{
+			std::cout << "Gradient Descent!" << std::endl;
+		}
+		btScalar scale = m_objective->computeNorm(ddv) / res_norm;  // res_norm > 0 here, since tol would be 0 otherwise and this branch unreachable
+		for (int i = 0; i < ddv.size(); ++i)
+		{
+			ddv[i] = scale * residual[i];
+		}
+		inner_product = scale * res_norm * res_norm;
+	}
+	return inner_product;  // dot product of the residual with the returned step
 }
 
 void btDeformableBodySolver::updateState()
 {
-    updateVelocity();
-    updateTempPosition();
+	updateVelocity();  // velocities are refreshed before the temporary positions
+	updateTempPosition();
 }
 
 void btDeformableBodySolver::updateDv(btScalar scale)
 {
-    for (int i = 0; i < m_numNodes; ++i)
-    {
-        m_dv[i] += scale * m_ddv[i];
-    }
+	for (int i = 0; i < m_numNodes; ++i)
+	{
+		m_dv[i] += scale * m_ddv[i];
+	}
 }
 
 void btDeformableBodySolver::computeStep(TVStack& ddv, const TVStack& residual)
 {
-    if (m_useProjection)
-        m_cg.solve(*m_objective, ddv, residual, false);
-    else
-        m_cr.solve(*m_objective, ddv, residual, false);
+	if (m_useProjection)
+		m_cg.solve(*m_objective, ddv, residual, false);
+	else
+		m_cr.solve(*m_objective, ddv, residual, false);
 }
 
-void btDeformableBodySolver::reinitialize(const btAlignedObjectArray<btSoftBody *>& softBodies, btScalar dt)
+void btDeformableBodySolver::reinitialize(const btAlignedObjectArray<btSoftBody*>& softBodies, btScalar dt)
 {
-    m_softBodies.copyFromArray(softBodies);
-    bool nodeUpdated = updateNodes();
-    
-    if (nodeUpdated)
-    {
-        m_dv.resize(m_numNodes, btVector3(0,0,0));
-        m_ddv.resize(m_numNodes, btVector3(0,0,0));
-        m_residual.resize(m_numNodes, btVector3(0,0,0));
-        m_backupVelocity.resize(m_numNodes, btVector3(0,0,0));
-    }
-    
-    // need to setZero here as resize only set value for newly allocated items
-    for (int i = 0; i < m_numNodes; ++i)
-    {
-        m_dv[i].setZero();
-        m_ddv[i].setZero();
-        m_residual[i].setZero();
-    }
-    
-    m_dt = dt;
-    m_objective->reinitialize(nodeUpdated, dt);
-    updateSoftBodies();
-}
+	m_softBodies.copyFromArray(softBodies);
+	bool nodeUpdated = updateNodes();
 
-void btDeformableBodySolver::setConstraints(const btContactSolverInfo& infoGlobal)
-{
-    BT_PROFILE("setConstraint");
-    m_objective->setConstraints(infoGlobal);
+	if (nodeUpdated)
+	{
+		m_dv.resize(m_numNodes, btVector3(0, 0, 0));
+		m_ddv.resize(m_numNodes, btVector3(0, 0, 0));
+		m_residual.resize(m_numNodes, btVector3(0, 0, 0));
+		m_backupVelocity.resize(m_numNodes, btVector3(0, 0, 0));
+	}
+
+	// need to setZero here because resize only sets the value of newly allocated items
+	for (int i = 0; i < m_numNodes; ++i)
+	{
+		m_dv[i].setZero();
+		m_ddv[i].setZero();
+		m_residual[i].setZero();
+	}
+
+	if (dt > 0)
+	{
+		m_dt = dt;
+	}
+	m_objective->reinitialize(nodeUpdated, dt);
+	updateSoftBodies();
 }
 
-btScalar btDeformableBodySolver::solveContactConstraints(btCollisionObject** deformableBodies,int numDeformableBodies, const btContactSolverInfo& infoGlobal)
+void btDeformableBodySolver::setConstraints(const btContactSolverInfo& infoGlobal)
 {
-    BT_PROFILE("solveContactConstraints");
-    btScalar maxSquaredResidual = m_objective->m_projection.update(deformableBodies,numDeformableBodies, infoGlobal);
-    return maxSquaredResidual;
+	BT_PROFILE("setConstraint");
+	m_objective->setConstraints(infoGlobal);
 }
 
-void btDeformableBodySolver::splitImpulseSetup(const btContactSolverInfo& infoGlobal)
+btScalar btDeformableBodySolver::solveContactConstraints(btCollisionObject** deformableBodies, int numDeformableBodies, const btContactSolverInfo& infoGlobal)
 {
-     m_objective->m_projection.splitImpulseSetup(infoGlobal);
+	BT_PROFILE("solveContactConstraints");
+	btScalar maxSquaredResidual = m_objective->m_projection.update(deformableBodies, numDeformableBodies, infoGlobal);
+	return maxSquaredResidual;
 }
 
 void btDeformableBodySolver::updateVelocity()
 {
-    int counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        psb->m_maxSpeedSquared = 0;
-        if (!psb->isActive())
-        {
-            counter += psb->m_nodes.size();
-            continue;
-        }
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            // set NaN to zero;
-            if (m_dv[counter] != m_dv[counter])
-            {
-                m_dv[counter].setZero();
-            }
-            psb->m_nodes[j].m_v = m_backupVelocity[counter]+m_dv[counter];
-            psb->m_maxSpeedSquared = btMax(psb->m_maxSpeedSquared, psb->m_nodes[j].m_v.length2());
-            ++counter;
-        }
-    }
+	int counter = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		psb->m_maxSpeedSquared = 0;
+		if (!psb->isActive())
+		{
+			counter += psb->m_nodes.size();
+			continue;
+		}
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			// a NaN entry compares unequal to itself; reset it to zero
+			if (m_dv[counter] != m_dv[counter])
+			{
+				m_dv[counter].setZero();
+			}
+			if (m_implicit)
+			{
+				psb->m_nodes[j].m_v = m_backupVelocity[counter] + m_dv[counter];
+			}
+			else
+			{
+				psb->m_nodes[j].m_v = m_backupVelocity[counter] + m_dv[counter] - psb->m_nodes[j].m_splitv;
+			}
+			psb->m_maxSpeedSquared = btMax(psb->m_maxSpeedSquared, psb->m_nodes[j].m_v.length2());
+			++counter;
+		}
+	}
 }
 
 void btDeformableBodySolver::updateTempPosition()
 {
-    int counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (!psb->isActive())
-        {
-            counter += psb->m_nodes.size();
-            continue;
-        }
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            psb->m_nodes[j].m_q = psb->m_nodes[j].m_x + m_dt * psb->m_nodes[j].m_v;
-            ++counter;
-        }
-        psb->updateDeformation();
-    }
+	int counter = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (!psb->isActive())
+		{
+			counter += psb->m_nodes.size();
+			continue;
+		}
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			psb->m_nodes[j].m_q = psb->m_nodes[j].m_x + m_dt * (psb->m_nodes[j].m_v + psb->m_nodes[j].m_splitv);
+			++counter;
+		}
+		psb->updateDeformation();
+	}
 }
 
 void btDeformableBodySolver::backupVelocity()
 {
-    int counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            m_backupVelocity[counter++] = psb->m_nodes[j].m_v;
-        }
-    }
+	int counter = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			m_backupVelocity[counter++] = psb->m_nodes[j].m_v;
+		}
+	}
 }
 
 void btDeformableBodySolver::setupDeformableSolve(bool implicit)
 {
-    int counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (!psb->isActive())
-        {
-            counter += psb->m_nodes.size();
-            continue;
-        }
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            if (implicit)
-            {
-                if ((psb->m_nodes[j].m_v - m_backupVelocity[counter]).norm() < SIMD_EPSILON)
-                    m_dv[counter] = psb->m_nodes[j].m_v - m_backupVelocity[counter];
-                else
-                    m_dv[counter] = psb->m_nodes[j].m_v - psb->m_nodes[j].m_vn;
-                m_backupVelocity[counter] = psb->m_nodes[j].m_vn;
-            }
-            else
-            {
-                m_dv[counter] =  psb->m_nodes[j].m_v - m_backupVelocity[counter];
-            }
-            psb->m_nodes[j].m_v = m_backupVelocity[counter];
-            ++counter;
-        }
-    }
+	int counter = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (!psb->isActive())
+		{
+			counter += psb->m_nodes.size();
+			continue;
+		}
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			if (implicit)
+			{
+				// set the initial guess for Newton's method; dofs that are in constraint need m_dv = v_{n+1} - v_n.
+				if (psb->m_nodes[j].m_v == m_backupVelocity[counter])
+					m_dv[counter].setZero();
+				else
+					m_dv[counter] = psb->m_nodes[j].m_v - psb->m_nodes[j].m_vn;
+				m_backupVelocity[counter] = psb->m_nodes[j].m_vn;
+			}
+			else
+			{
+				m_dv[counter] = psb->m_nodes[j].m_v + psb->m_nodes[j].m_splitv - m_backupVelocity[counter];
+			}
+			psb->m_nodes[j].m_v = m_backupVelocity[counter];
+			++counter;
+		}
+	}
 }
 
 void btDeformableBodySolver::revertVelocity()
 {
-    int counter = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            psb->m_nodes[j].m_v = m_backupVelocity[counter++];
-        }
-    }
+	int counter = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			psb->m_nodes[j].m_v = m_backupVelocity[counter++];
+		}
+	}
 }
 
 bool btDeformableBodySolver::updateNodes()
 {
-    int numNodes = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-        numNodes += m_softBodies[i]->m_nodes.size();
-    if (numNodes != m_numNodes)
-    {
-        m_numNodes = numNodes;
-        return true;
-    }
-    return false;
+	int numNodes = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+		numNodes += m_softBodies[i]->m_nodes.size();
+	if (numNodes != m_numNodes)
+	{
+		m_numNodes = numNodes;
+		return true;
+	}
+	return false;
 }
 
-
 void btDeformableBodySolver::predictMotion(btScalar solverdt)
 {
-    // apply explicit forces to velocity
-    m_objective->applyExplicitForce(m_residual);
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody *psb = m_softBodies[i];
-        
-        if (psb->isActive())
-        {
-            // predict motion for collision detection
-            predictDeformableMotion(psb, solverdt);
-        }
-    }
+	// for the implicit scheme, first predict m_q = m_x + m_v * solverdt; then apply explicit forces to velocity
+	if (m_implicit)
+	{
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (psb->isActive())
+			{
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					psb->m_nodes[j].m_q = psb->m_nodes[j].m_x + psb->m_nodes[j].m_v * solverdt;
+				}
+			}
+		}
+	}
+	m_objective->applyExplicitForce(m_residual);
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+
+		if (psb->isActive())
+		{
+			// predict motion for collision detection
+			predictDeformableMotion(psb, solverdt);
+		}
+	}
 }
 
 void btDeformableBodySolver::predictDeformableMotion(btSoftBody* psb, btScalar dt)
 {
-    BT_PROFILE("btDeformableBodySolver::predictDeformableMotion");
-    int i, ni;
-    
-    /* Update                */
-    if (psb->m_bUpdateRtCst)
-    {
-        psb->m_bUpdateRtCst = false;
-        psb->updateConstants();
-        psb->m_fdbvt.clear();
-        if (psb->m_cfg.collisions & btSoftBody::fCollision::SDF_RD)
-        {
-            psb->initializeFaceTree();
-        }
-    }
-    
-    /* Prepare                */
-    psb->m_sst.sdt = dt * psb->m_cfg.timescale;
-    psb->m_sst.isdt = 1 / psb->m_sst.sdt;
-    psb->m_sst.velmrg = psb->m_sst.sdt * 3;
-    psb->m_sst.radmrg = psb->getCollisionShape()->getMargin();
-    psb->m_sst.updmrg = psb->m_sst.radmrg * (btScalar)0.25;
-    /* Bounds                */
-    psb->updateBounds();
-    
-    /* Integrate            */
-    // do not allow particles to move more than the bounding box size
-    btScalar max_v = (psb->m_bounds[1]-psb->m_bounds[0]).norm() / dt;
-    for (i = 0, ni = psb->m_nodes.size(); i < ni; ++i)
-    {
-        btSoftBody::Node& n = psb->m_nodes[i];
-        // apply drag
-        n.m_v *= (1 - psb->m_cfg.drag);
-        // scale velocity back
-        if (n.m_v.norm() > max_v)
-        {
-            n.m_v.safeNormalize();
-            n.m_v *= max_v;
-        }
-        n.m_q = n.m_x + n.m_v * dt;
-        n.m_penetration = 0;
-    }
-
-    /* Nodes                */
-    psb->updateNodeTree(true, true);
-    if (!psb->m_fdbvt.empty())
-    {
-        psb->updateFaceTree(true, true);
-    }
-    /* Clear contacts */
-    psb->m_nodeRigidContacts.resize(0);
-    psb->m_faceRigidContacts.resize(0);
-    psb->m_faceNodeContacts.resize(0);
-    /* Optimize dbvt's        */
-//    psb->m_ndbvt.optimizeIncremental(1);
-//    psb->m_fdbvt.optimizeIncremental(1);
-}
+	BT_PROFILE("btDeformableBodySolver::predictDeformableMotion");
+	int i, ni;
+
+	/* Update                */
+	if (psb->m_bUpdateRtCst)
+	{
+		psb->m_bUpdateRtCst = false;
+		psb->updateConstants();
+		psb->m_fdbvt.clear();
+		if (psb->m_cfg.collisions & btSoftBody::fCollision::SDF_RD)
+		{
+			psb->initializeFaceTree();
+		}
+	}
 
+	/* Prepare                */
+	psb->m_sst.sdt = dt * psb->m_cfg.timescale;
+	psb->m_sst.isdt = 1 / psb->m_sst.sdt;
+	psb->m_sst.velmrg = psb->m_sst.sdt * 3;
+	psb->m_sst.radmrg = psb->getCollisionShape()->getMargin();
+	psb->m_sst.updmrg = psb->m_sst.radmrg * (btScalar)0.25;
+	/* Bounds                */
+	psb->updateBounds();
+
+	/* Integrate            */
+	// do not allow particles to move more than the bounding box size
+	btScalar max_v = (psb->m_bounds[1] - psb->m_bounds[0]).norm() / dt;
+	for (i = 0, ni = psb->m_nodes.size(); i < ni; ++i)
+	{
+		btSoftBody::Node& n = psb->m_nodes[i];
+		// apply drag
+		n.m_v *= (1 - psb->m_cfg.drag);
+		// implicit: keep m_q at m_x; explicit: scale velocity back to at most max_v before predicting m_q
+		if (m_implicit)
+		{
+			n.m_q = n.m_x;
+		}
+		else
+		{
+			if (n.m_v.norm() > max_v)
+			{
+				n.m_v.safeNormalize();
+				n.m_v *= max_v;
+			}
+			n.m_q = n.m_x + n.m_v * dt;
+		}
+		n.m_splitv.setZero();
+		n.m_constrained = false;
+	}
+
+	/* Nodes                */
+	psb->updateNodeTree(true, true);
+	if (!psb->m_fdbvt.empty())
+	{
+		psb->updateFaceTree(true, true);
+	}
+	/* Clear contacts */
+	psb->m_nodeRigidContacts.resize(0);
+	psb->m_faceRigidContacts.resize(0);
+	psb->m_faceNodeContacts.resize(0);
+	/* Optimize dbvt's        */
+	//    psb->m_ndbvt.optimizeIncremental(1);
+	//    psb->m_fdbvt.optimizeIncremental(1);
+}
 
 void btDeformableBodySolver::updateSoftBodies()
 {
-    BT_PROFILE("updateSoftBodies");
-    for (int i = 0; i < m_softBodies.size(); i++)
-    {
-        btSoftBody *psb = (btSoftBody *)m_softBodies[i];
-        if (psb->isActive())
-        {
-            psb->updateNormals();
-        }
-    }
+	BT_PROFILE("updateSoftBodies");
+	for (int i = 0; i < m_softBodies.size(); i++)
+	{
+		btSoftBody* psb = (btSoftBody*)m_softBodies[i];
+		if (psb->isActive())
+		{
+			psb->updateNormals();
+		}
+	}
 }
 
 void btDeformableBodySolver::setImplicit(bool implicit)
 {
-    m_implicit = implicit;
-    m_objective->setImplicit(implicit);
+	m_implicit = implicit;
+	m_objective->setImplicit(implicit);
 }
 
 void btDeformableBodySolver::setLineSearch(bool lineSearch)
 {
-    m_lineSearch = lineSearch;
+	m_lineSearch = lineSearch;
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.h b/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.h
index d4e5f4c603..ae674d6e89 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableBodySolver.h
@@ -16,7 +16,6 @@
 #ifndef BT_DEFORMABLE_BODY_SOLVERS_H
 #define BT_DEFORMABLE_BODY_SOLVERS_H
 
-
 #include "btSoftBodySolvers.h"
 #include "btDeformableBackwardEulerObjective.h"
 #include "btDeformableMultiBodyDynamicsWorld.h"
@@ -30,133 +29,132 @@ class btDeformableMultiBodyDynamicsWorld;
 
 class btDeformableBodySolver : public btSoftBodySolver
 {
-    typedef btAlignedObjectArray<btVector3> TVStack;
+	typedef btAlignedObjectArray<btVector3> TVStack;
+
 protected:
-    int m_numNodes;                 // total number of deformable body nodes
-    TVStack m_dv;                   // v_{n+1} - v_n
-    TVStack m_backup_dv;            // backed up dv
-    TVStack m_ddv;                  // incremental dv
-    TVStack m_residual;             // rhs of the linear solve
-    btAlignedObjectArray<btSoftBody *> m_softBodies;  // all deformable bodies
-    TVStack m_backupVelocity;       // backed up v, equals v_n for implicit, equals v_{n+1}^* for explicit
-    btScalar m_dt;                  // dt
-    btConjugateGradient<btDeformableBackwardEulerObjective> m_cg;  // CG solver
-    btConjugateResidual<btDeformableBackwardEulerObjective> m_cr;  // CR solver
-    bool m_implicit;                // use implicit scheme if true, explicit scheme if false
-    int m_maxNewtonIterations;      // max number of newton iterations
-    btScalar m_newtonTolerance;     // stop newton iterations if f(x) < m_newtonTolerance
-    bool m_lineSearch;              // If true, use newton's method with line search under implicit scheme
+	int m_numNodes;                                                // total number of deformable body nodes
+	TVStack m_dv;                                                  // v_{n+1} - v_n
+	TVStack m_backup_dv;                                           // backed up dv
+	TVStack m_ddv;                                                 // incremental dv
+	TVStack m_residual;                                            // rhs of the linear solve
+	btAlignedObjectArray<btSoftBody*> m_softBodies;                // all deformable bodies
+	TVStack m_backupVelocity;                                      // backed up v, equals v_n for implicit, equals v_{n+1}^* for explicit
+	btScalar m_dt;                                                 // dt
+	btConjugateGradient<btDeformableBackwardEulerObjective> m_cg;  // CG solver
+	btConjugateResidual<btDeformableBackwardEulerObjective> m_cr;  // CR solver
+	bool m_implicit;                                               // use implicit scheme if true, explicit scheme if false
+	int m_maxNewtonIterations;                                     // max number of newton iterations
+	btScalar m_newtonTolerance;                                    // stop newton iterations if f(x) < m_newtonTolerance
+	bool m_lineSearch;                                             // If true, use newton's method with line search under implicit scheme
 public:
-    // handles data related to objective function
-    btDeformableBackwardEulerObjective* m_objective;
-    bool m_useProjection;
-    
-    btDeformableBodySolver();
-    
-    virtual ~btDeformableBodySolver();
-    
-    virtual SolverTypes getSolverType() const
-    {
-        return DEFORMABLE_SOLVER;
-    }
-
-    // update soft body normals
-    virtual void updateSoftBodies();
-    
-    virtual btScalar solveContactConstraints(btCollisionObject** deformableBodies,int numDeformableBodies, const btContactSolverInfo& infoGlobal);
-    
-    // solve the momentum equation
-    virtual void solveDeformableConstraints(btScalar solverdt);
-    
-    // set up the position error in split impulse
-    void splitImpulseSetup(const btContactSolverInfo& infoGlobal);
-
-    // resize/clear data structures
-    void reinitialize(const btAlignedObjectArray<btSoftBody *>& softBodies, btScalar dt);
-    
-    // set up contact constraints
-    void setConstraints(const btContactSolverInfo& infoGlobal);
-    
-    // add in elastic forces and gravity to obtain v_{n+1}^* and calls predictDeformableMotion
-    virtual void predictMotion(btScalar solverdt);
-    
-    // move to temporary position x_{n+1}^* = x_n + dt * v_{n+1}^*
-    // x_{n+1}^* is stored in m_q
-    void predictDeformableMotion(btSoftBody* psb, btScalar dt);
-    
-    // save the current velocity to m_backupVelocity
-    void backupVelocity();
-    
-    // set m_dv and m_backupVelocity to desired value to prepare for momentum solve
-    void setupDeformableSolve(bool implicit);
-    
-    // set the current velocity to that backed up in m_backupVelocity
-    void revertVelocity();
-    
-    // set velocity to m_dv + m_backupVelocity
-    void updateVelocity();
-    
-    // update the node count
-    bool updateNodes();
-    
-    // calculate the change in dv resulting from the momentum solve
-    void computeStep(TVStack& ddv, const TVStack& residual);
-    
-    // calculate the change in dv resulting from the momentum solve when line search is turned on
-    btScalar computeDescentStep(TVStack& ddv, const TVStack& residual, bool verbose=false);
-
-    virtual void copySoftBodyToVertexBuffer(const btSoftBody *const softBody, btVertexBufferDescriptor *vertexBuffer) {}
-
-    // process collision between deformable and rigid
-    virtual void processCollision(btSoftBody * softBody, const btCollisionObjectWrapper * collisionObjectWrap)
-    {
-        softBody->defaultCollisionHandler(collisionObjectWrap);
-    }
-    
-    // process collision between deformable and deformable
-    virtual void processCollision(btSoftBody * softBody, btSoftBody * otherSoftBody) {
-        softBody->defaultCollisionHandler(otherSoftBody);
-    }
-
-    // If true, implicit time stepping scheme is used.
-    // Otherwise, explicit time stepping scheme is used
-    void setImplicit(bool implicit);
-    
-    // If true, newton's method with line search is used when implicit time stepping scheme is turned on
-    void setLineSearch(bool lineSearch);
-    
-    // set temporary position x^* = x_n + dt * v
-    // update the deformation gradient at position x^*
-    void updateState();
-    
-    // set dv = dv + scale * ddv
-    void updateDv(btScalar scale = 1);
-    
-    // set temporary position x^* = x_n + dt * v^*
-    void updateTempPosition();
-    
-    // save the current dv to m_backup_dv;
-    void backupDv();
-    
-    // set dv to the backed-up value
-    void revertDv();
-    
-    // set dv = dv + scale * ddv
-    // set v^* = v_n + dv
-    // set temporary position x^* = x_n + dt * v^*
-    // update the deformation gradient at position x^*
-    void updateEnergy(btScalar scale);
-    
-    // calculates the appropriately scaled kinetic energy in the system, which is
-    // 1/2 * dv^T * M * dv
-    // used in line search
-    btScalar kineticEnergy();
-    
-    // unused functions
-    virtual void optimize(btAlignedObjectArray<btSoftBody *> &softBodies, bool forceUpdate = false){}
-    virtual void solveConstraints(btScalar dt){}
-    virtual bool checkInitialized(){return true;}
-    virtual void copyBackToSoftBodies(bool bMove = true) {}
+	// handles data related to objective function
+	btDeformableBackwardEulerObjective* m_objective;
+	bool m_useProjection;
+
+	btDeformableBodySolver();
+
+	virtual ~btDeformableBodySolver();
+
+	virtual SolverTypes getSolverType() const
+	{
+		return DEFORMABLE_SOLVER;
+	}
+
+	// update soft body normals
+	virtual void updateSoftBodies();
+
+	virtual btScalar solveContactConstraints(btCollisionObject** deformableBodies, int numDeformableBodies, const btContactSolverInfo& infoGlobal);
+
+	// solve the momentum equation
+	virtual void solveDeformableConstraints(btScalar solverdt);
+
+	// resize/clear data structures
+	void reinitialize(const btAlignedObjectArray<btSoftBody*>& softBodies, btScalar dt);
+
+	// set up contact constraints
+	void setConstraints(const btContactSolverInfo& infoGlobal);
+
+	// add in elastic forces and gravity to obtain v_{n+1}^* and calls predictDeformableMotion
+	virtual void predictMotion(btScalar solverdt);
+
+	// move to temporary position x_{n+1}^* = x_n + dt * v_{n+1}^* (explicit scheme; the implicit scheme keeps x_n)
+	// x_{n+1}^* is stored in m_q
+	void predictDeformableMotion(btSoftBody* psb, btScalar dt);
+
+	// save the current velocity to m_backupVelocity
+	void backupVelocity();
+
+	// set m_dv and m_backupVelocity to desired value to prepare for momentum solve
+	void setupDeformableSolve(bool implicit);
+
+	// set the current velocity to that backed up in m_backupVelocity
+	void revertVelocity();
+
+	// set velocity to m_dv + m_backupVelocity (minus m_splitv when the scheme is explicit)
+	void updateVelocity();
+
+	// update the node count
+	bool updateNodes();
+
+	// calculate the change in dv resulting from the momentum solve
+	void computeStep(TVStack& ddv, const TVStack& residual);
+
+	// calculate the change in dv resulting from the momentum solve when line search is turned on
+	btScalar computeDescentStep(TVStack& ddv, const TVStack& residual, bool verbose = false);
+
+	virtual void copySoftBodyToVertexBuffer(const btSoftBody* const softBody, btVertexBufferDescriptor* vertexBuffer) {}
+
+	// process collision between deformable and rigid
+	virtual void processCollision(btSoftBody* softBody, const btCollisionObjectWrapper* collisionObjectWrap)
+	{
+		softBody->defaultCollisionHandler(collisionObjectWrap);
+	}
+
+	// process collision between deformable and deformable
+	virtual void processCollision(btSoftBody* softBody, btSoftBody* otherSoftBody)
+	{
+		softBody->defaultCollisionHandler(otherSoftBody);
+	}
+
+	// If true, implicit time stepping scheme is used.
+	// Otherwise, explicit time stepping scheme is used
+	void setImplicit(bool implicit);
+
+	// If true, newton's method with line search is used when implicit time stepping scheme is turned on
+	void setLineSearch(bool lineSearch);
+
+	// set temporary position x^* = x_n + dt * v
+	// update the deformation gradient at position x^*
+	void updateState();
+
+	// set dv = dv + scale * ddv
+	void updateDv(btScalar scale = 1);
+
+	// set temporary position x^* = x_n + dt * (v^* + splitv), stored in m_q
+	void updateTempPosition();
+
+	// save the current dv to m_backup_dv;
+	void backupDv();
+
+	// set dv to the backed-up value
+	void revertDv();
+
+	// set dv = dv + scale * ddv
+	// set v^* = v_n + dv
+	// set temporary position x^* = x_n + dt * v^*
+	// update the deformation gradient at position x^*
+	void updateEnergy(btScalar scale);
+
+	// calculates the appropriately scaled kinetic energy in the system, which is
+	// 1/2 * dv^T * M * dv
+	// used in line search
+	btScalar kineticEnergy();
+
+	// unused functions
+	virtual void optimize(btAlignedObjectArray<btSoftBody*>& softBodies, bool forceUpdate = false) {}
+	virtual void solveConstraints(btScalar dt) {}
+	virtual bool checkInitialized() { return true; }
+	virtual void copyBackToSoftBodies(bool bMove = true) {}
 };
 
 #endif /* btDeformableBodySolver_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.cpp b/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.cpp
index 2864446de6..09398d79a5 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.cpp
@@ -16,387 +16,503 @@
 #include "btDeformableContactConstraint.h"
 /* ================   Deformable Node Anchor   =================== */
 btDeformableNodeAnchorConstraint::btDeformableNodeAnchorConstraint(const btSoftBody::DeformableNodeRigidAnchor& a, const btContactSolverInfo& infoGlobal)
-: m_anchor(&a)
-, btDeformableContactConstraint(a.m_cti.m_normal, infoGlobal)
+	: m_anchor(&a), btDeformableContactConstraint(a.m_cti.m_normal, infoGlobal)  // keeps a pointer to `a` (no copy is made); the base receives the anchor's contact normal
 {
 }
 
 btDeformableNodeAnchorConstraint::btDeformableNodeAnchorConstraint(const btDeformableNodeAnchorConstraint& other)
-: m_anchor(other.m_anchor)
-, btDeformableContactConstraint(other)
+	: m_anchor(other.m_anchor), btDeformableContactConstraint(other)  // copy shares the same anchor pointer as `other`
 {
 }
 
 btVector3 btDeformableNodeAnchorConstraint::getVa() const
 {
-    const btSoftBody::sCti& cti = m_anchor->m_cti;
-    btVector3 va(0, 0, 0);
-    if (cti.m_colObj->hasContactResponse())
-    {
-        btRigidBody* rigidCol = 0;
-        btMultiBodyLinkCollider* multibodyLinkCol = 0;
-        
-        // grab the velocity of the rigid body
-        if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
-        {
-            rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
-            va = rigidCol ? (rigidCol->getVelocityInLocalPoint(m_anchor->m_c1)) : btVector3(0, 0, 0);
-        }
-        else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
-        {
-            multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
-            if (multibodyLinkCol)
-            {
-                const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
-                const btScalar* J_n = &m_anchor->jacobianData_normal.m_jacobians[0];
-                const btScalar* J_t1 = &m_anchor->jacobianData_t1.m_jacobians[0];
-                const btScalar* J_t2 = &m_anchor->jacobianData_t2.m_jacobians[0];
-                const btScalar* local_v = multibodyLinkCol->m_multiBody->getVelocityVector();
-                const btScalar* local_dv = multibodyLinkCol->m_multiBody->getDeltaVelocityVector();
-                // add in the normal component of the va
-                btScalar vel = 0.0;
-                for (int k = 0; k < ndof; ++k)
-                {
-                    vel += (local_v[k]+local_dv[k]) * J_n[k];
-                }
-                va = cti.m_normal * vel;
-                // add in the tangential components of the va
-                vel = 0.0;
-                for (int k = 0; k < ndof; ++k)
-                {
-                    vel += (local_v[k]+local_dv[k]) * J_t1[k];
-                }
-                va += m_anchor->t1 * vel;
-                vel = 0.0;
-                for (int k = 0; k < ndof; ++k)
-                {
-                    vel += (local_v[k]+local_dv[k]) * J_t2[k];
-                }
-                va += m_anchor->t2 * vel;
-            }
-        }
-    }
-    return va;
+	const btSoftBody::sCti& cti = m_anchor->m_cti;
+	btVector3 va(0, 0, 0);  // result stays zero unless one of the branches below assigns it
+	if (cti.m_colObj->hasContactResponse())
+	{
+		btRigidBody* rigidCol = 0;
+		btMultiBodyLinkCollider* multibodyLinkCol = 0;
+
+		// rigid body: velocity from getVelocityInLocalPoint(m_c1); zero if the upcast fails
+		if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+		{
+			rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
+			va = rigidCol ? (rigidCol->getVelocityInLocalPoint(m_anchor->m_c1)) : btVector3(0, 0, 0);
+		}
+		else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+		{
+			multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+			if (multibodyLinkCol)
+			{
+				const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;  // NOTE(review): +6 presumably accounts for the base's 6 DoF - confirm
+				const btScalar* J_n = &m_anchor->jacobianData_normal.m_jacobians[0];
+				const btScalar* J_t1 = &m_anchor->jacobianData_t1.m_jacobians[0];
+				const btScalar* J_t2 = &m_anchor->jacobianData_t2.m_jacobians[0];
+				const btScalar* local_v = multibodyLinkCol->m_multiBody->getVelocityVector();
+				const btScalar* local_dv = multibodyLinkCol->m_multiBody->getDeltaVelocityVector();
+				// normal component of va: (v + dv) dotted with the normal Jacobian, applied along cti.m_normal
+				btScalar vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += (local_v[k] + local_dv[k]) * J_n[k];
+				}
+				va = cti.m_normal * vel;
+				// tangential components of va: the same projection with the t1 and t2 Jacobians
+				vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += (local_v[k] + local_dv[k]) * J_t1[k];
+				}
+				va += m_anchor->t1 * vel;
+				vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += (local_v[k] + local_dv[k]) * J_t2[k];
+				}
+				va += m_anchor->t2 * vel;
+			}
+		}
+	}
+	return va;
 }
 
 btScalar btDeformableNodeAnchorConstraint::solveConstraint(const btContactSolverInfo& infoGlobal)
 {
-    const btSoftBody::sCti& cti = m_anchor->m_cti;
-    btVector3 va = getVa();
-    btVector3 vb = getVb();
-    btVector3 vr = (vb - va);
-    // + (m_anchor->m_node->m_x - cti.m_colObj->getWorldTransform() * m_anchor->m_local) * 10.0
-    const btScalar dn = btDot(vr, vr);
-    // dn is the normal component of velocity diffrerence. Approximates the residual. // todo xuchenhan@: this prob needs to be scaled by dt
-    btScalar residualSquare = dn*dn;
-    btVector3 impulse = m_anchor->m_c0 * vr;
-    // apply impulse to deformable nodes involved and change their velocities
-    applyImpulse(impulse);
-    
-    // apply impulse to the rigid/multibodies involved and change their velocities
-    if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
-    {
-        btRigidBody* rigidCol = 0;
-        rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
-        if (rigidCol)
-        {
-            rigidCol->applyImpulse(impulse, m_anchor->m_c1);
-        }
-    }
-    else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
-    {
-        btMultiBodyLinkCollider* multibodyLinkCol = 0;
-        multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
-        if (multibodyLinkCol)
-        {
-            const btScalar* deltaV_normal = &m_anchor->jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
-            // apply normal component of the impulse
-            multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_normal, impulse.dot(cti.m_normal));
-            // apply tangential component of the impulse
-            const btScalar* deltaV_t1 = &m_anchor->jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
-            multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t1, impulse.dot(m_anchor->t1));
-            const btScalar* deltaV_t2 = &m_anchor->jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
-            multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t2, impulse.dot(m_anchor->t2));
-        }
-    }
-    return residualSquare;
+	const btSoftBody::sCti& cti = m_anchor->m_cti;
+	btVector3 va = getVa();
+	btVector3 vb = getVb();
+	btVector3 vr = (vb - va);
+	// + (m_anchor->m_node->m_x - cti.m_colObj->getWorldTransform() * m_anchor->m_local) * 10.0
+	const btScalar dn = btDot(vr, vr);
+	// dn is vr . vr, i.e. the squared magnitude of the velocity difference (not only its normal component). Approximates the residual. // todo xuchenhan@: this prob needs to be scaled by dt
+	btScalar residualSquare = dn * dn;
+	btVector3 impulse = m_anchor->m_c0 * vr;
+	// apply impulse to deformable nodes involved and change their velocities
+	applyImpulse(impulse);
+
+	// apply the same impulse to the rigid body / multibody link involved and change its velocity
+	if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+	{
+		btRigidBody* rigidCol = 0;
+		rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
+		if (rigidCol)
+		{
+			rigidCol->applyImpulse(impulse, m_anchor->m_c1);
+		}
+	}
+	else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+	{
+		btMultiBodyLinkCollider* multibodyLinkCol = 0;
+		multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+		if (multibodyLinkCol)
+		{
+			const btScalar* deltaV_normal = &m_anchor->jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
+			// apply normal component of the impulse
+			multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_normal, impulse.dot(cti.m_normal));
+			// apply the two tangential components of the impulse (along t1 and t2)
+			const btScalar* deltaV_t1 = &m_anchor->jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
+			multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t1, impulse.dot(m_anchor->t1));
+			const btScalar* deltaV_t2 = &m_anchor->jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
+			multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t2, impulse.dot(m_anchor->t2));
+		}
+	}
+	return residualSquare;
 }
 
 btVector3 btDeformableNodeAnchorConstraint::getVb() const
 {
-    return m_anchor->m_node->m_v;
+	return m_anchor->m_node->m_v;  // m_v of the anchored soft-body node
 }
 
 void btDeformableNodeAnchorConstraint::applyImpulse(const btVector3& impulse)
 {
-    btVector3 dv = impulse * m_anchor->m_c2;
-    m_anchor->m_node->m_v -= dv;
+	btVector3 dv = impulse * m_anchor->m_c2;  // impulse scaled by the anchor's m_c2 gives the node's velocity change
+	m_anchor->m_node->m_v -= dv;  // the change is subtracted from the node's m_v
 }
 
 /* ================   Deformable vs. Rigid   =================== */
 btDeformableRigidContactConstraint::btDeformableRigidContactConstraint(const btSoftBody::DeformableRigidContact& c, const btContactSolverInfo& infoGlobal)
-: m_contact(&c)
-, btDeformableContactConstraint(c.m_cti.m_normal, infoGlobal)
+	: m_contact(&c), btDeformableContactConstraint(c.m_cti.m_normal, infoGlobal)  // keeps a pointer to `c` (no copy is made)
 {
-    m_total_normal_dv.setZero();
-    m_total_tangent_dv.setZero();
-    // The magnitude of penetration is the depth of penetration.
-    m_penetration = c.m_cti.m_offset;
-//	m_penetration = btMin(btScalar(0),c.m_cti.m_offset);
+	m_total_normal_dv.setZero();
+	m_total_tangent_dv.setZero();
+	// Penetration is taken directly from the contact's m_offset.
+	m_penetration = c.m_cti.m_offset;
+	m_total_split_impulse = 0;  // nothing accumulated yet by solveSplitImpulse
+	m_binding = false;  // set to true by solveConstraint once it proceeds past the dn > 0 early-out
 }
 
 btDeformableRigidContactConstraint::btDeformableRigidContactConstraint(const btDeformableRigidContactConstraint& other)
-: m_contact(other.m_contact)
-, btDeformableContactConstraint(other)
-, m_penetration(other.m_penetration)
+	: m_contact(other.m_contact), btDeformableContactConstraint(other), m_penetration(other.m_penetration), m_total_split_impulse(other.m_total_split_impulse), m_binding(other.m_binding)  // copy shares the same contact pointer as `other`
 {
-    m_total_normal_dv = other.m_total_normal_dv;
-    m_total_tangent_dv = other.m_total_tangent_dv;
+	m_total_normal_dv = other.m_total_normal_dv;  // accumulated dv vectors are copied here rather than in the initializer list
+	m_total_tangent_dv = other.m_total_tangent_dv;
 }
 
-
 btVector3 btDeformableRigidContactConstraint::getVa() const
 {
-    const btSoftBody::sCti& cti = m_contact->m_cti;
-    btVector3 va(0, 0, 0);
-    if (cti.m_colObj->hasContactResponse())
-    {
-        btRigidBody* rigidCol = 0;
-        btMultiBodyLinkCollider* multibodyLinkCol = 0;
-        
-        // grab the velocity of the rigid body
-        if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
-        {
-            rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
-            va = rigidCol ? (rigidCol->getVelocityInLocalPoint(m_contact->m_c1)) : btVector3(0, 0, 0);
-        }
-        else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
-        {
-            multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
-            if (multibodyLinkCol)
-            {
-                const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
-                const btScalar* J_n = &m_contact->jacobianData_normal.m_jacobians[0];
-                const btScalar* J_t1 = &m_contact->jacobianData_t1.m_jacobians[0];
-                const btScalar* J_t2 = &m_contact->jacobianData_t2.m_jacobians[0];
-                const btScalar* local_v = multibodyLinkCol->m_multiBody->getVelocityVector();
-                const btScalar* local_dv = multibodyLinkCol->m_multiBody->getDeltaVelocityVector();
-                // add in the normal component of the va
-                btScalar vel = 0.0;
-                for (int k = 0; k < ndof; ++k)
-                {
-                    vel += (local_v[k]+local_dv[k]) * J_n[k];
-                }
-                va = cti.m_normal * vel;
-                // add in the tangential components of the va
-                vel = 0.0;
-                for (int k = 0; k < ndof; ++k)
-                {
-                    vel += (local_v[k]+local_dv[k]) * J_t1[k];
-                }
-                va += m_contact->t1 * vel;
-                vel = 0.0;
-                for (int k = 0; k < ndof; ++k)
-                {
-                    vel += (local_v[k]+local_dv[k]) * J_t2[k];
-                }
-                va += m_contact->t2 * vel;
-            }
-        }
-    }
-    return va;
+	const btSoftBody::sCti& cti = m_contact->m_cti;
+	btVector3 va(0, 0, 0);  // result stays zero unless one of the branches below assigns it
+	if (cti.m_colObj->hasContactResponse())
+	{
+		btRigidBody* rigidCol = 0;
+		btMultiBodyLinkCollider* multibodyLinkCol = 0;
+
+		// rigid body: velocity from getVelocityInLocalPoint(m_c1); zero if the upcast fails
+		if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+		{
+			rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
+			va = rigidCol ? (rigidCol->getVelocityInLocalPoint(m_contact->m_c1)) : btVector3(0, 0, 0);
+		}
+		else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+		{
+			multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+			if (multibodyLinkCol)
+			{
+				const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;  // NOTE(review): +6 presumably accounts for the base's 6 DoF - confirm
+				const btScalar* J_n = &m_contact->jacobianData_normal.m_jacobians[0];
+				const btScalar* J_t1 = &m_contact->jacobianData_t1.m_jacobians[0];
+				const btScalar* J_t2 = &m_contact->jacobianData_t2.m_jacobians[0];
+				const btScalar* local_v = multibodyLinkCol->m_multiBody->getVelocityVector();
+				const btScalar* local_dv = multibodyLinkCol->m_multiBody->getDeltaVelocityVector();
+				// normal component of va: (v + dv) dotted with the normal Jacobian, applied along cti.m_normal
+				btScalar vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += (local_v[k] + local_dv[k]) * J_n[k];
+				}
+				va = cti.m_normal * vel;
+				// tangential components of va: the same projection with the t1 and t2 Jacobians
+				vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += (local_v[k] + local_dv[k]) * J_t1[k];
+				}
+				va += m_contact->t1 * vel;
+				vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += (local_v[k] + local_dv[k]) * J_t2[k];
+				}
+				va += m_contact->t2 * vel;
+			}
+		}
+	}
+	return va;
+}
+
+btVector3 btDeformableRigidContactConstraint::getSplitVa() const
+{
+	const btSoftBody::sCti& cti = m_contact->m_cti;
+	btVector3 va(0, 0, 0);  // result stays zero unless one of the branches below assigns it
+	if (cti.m_colObj->hasContactResponse())
+	{
+		btRigidBody* rigidCol = 0;
+		btMultiBodyLinkCollider* multibodyLinkCol = 0;
+
+		// rigid body: push velocity from getPushVelocityInLocalPoint(m_c1); zero if the upcast fails
+		if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+		{
+			rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
+			va = rigidCol ? (rigidCol->getPushVelocityInLocalPoint(m_contact->m_c1)) : btVector3(0, 0, 0);
+		}
+		else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+		{
+			multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+			if (multibodyLinkCol)
+			{
+				const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;  // NOTE(review): +6 presumably accounts for the base's 6 DoF - confirm
+				const btScalar* J_n = &m_contact->jacobianData_normal.m_jacobians[0];
+				const btScalar* J_t1 = &m_contact->jacobianData_t1.m_jacobians[0];
+				const btScalar* J_t2 = &m_contact->jacobianData_t2.m_jacobians[0];
+				const btScalar* local_split_v = multibodyLinkCol->m_multiBody->getSplitVelocityVector();
+				// normal component of va: split velocity dotted with the normal Jacobian, applied along cti.m_normal
+				btScalar vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += local_split_v[k] * J_n[k];
+				}
+				va = cti.m_normal * vel;
+				// tangential components of va: the same projection with the t1 and t2 Jacobians
+				vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += local_split_v[k] * J_t1[k];
+				}
+				va += m_contact->t1 * vel;
+				vel = 0.0;
+				for (int k = 0; k < ndof; ++k)
+				{
+					vel += local_split_v[k] * J_t2[k];
+				}
+				va += m_contact->t2 * vel;
+			}
+		}
+	}
+	return va;
 }
 
 btScalar btDeformableRigidContactConstraint::solveConstraint(const btContactSolverInfo& infoGlobal)
 {
-    const btSoftBody::sCti& cti = m_contact->m_cti;
-    btVector3 va = getVa();
-    btVector3 vb = getVb();
-    btVector3 vr = vb - va;
-    btScalar dn = btDot(vr, cti.m_normal) + m_penetration * infoGlobal.m_deformable_erp / infoGlobal.m_timeStep;
-    // dn is the normal component of velocity diffrerence. Approximates the residual. // todo xuchenhan@: this prob needs to be scaled by dt
-    btScalar residualSquare = dn*dn;
-    btVector3 impulse = m_contact->m_c0 * (vr + m_penetration * infoGlobal.m_deformable_erp / infoGlobal.m_timeStep * cti.m_normal) ;
-    const btVector3 impulse_normal = m_contact->m_c0 * (cti.m_normal * dn);
-    btVector3 impulse_tangent = impulse - impulse_normal;
-    btVector3 old_total_tangent_dv = m_total_tangent_dv;
-    // m_c2 is the inverse mass of the deformable node/face
-    m_total_normal_dv -= impulse_normal * m_contact->m_c2;
-    m_total_tangent_dv -= impulse_tangent * m_contact->m_c2;
-
-    if (m_total_normal_dv.dot(cti.m_normal) < 0)
-    {
-        // separating in the normal direction
-        m_static = false;
-        m_total_tangent_dv = btVector3(0,0,0);
-        impulse_tangent.setZero();
-    }
-    else
-    {
-        if (m_total_normal_dv.norm() * m_contact->m_c3 < m_total_tangent_dv.norm())
-        {
-            // dynamic friction
-            // with dynamic friction, the impulse are still applied to the two objects colliding, however, it does not pose a constraint in the cg solve, hence the change to dv merely serves to update velocity in the contact iterations.
-            m_static = false;
-            if (m_total_tangent_dv.safeNorm() < SIMD_EPSILON)
-            {
-                m_total_tangent_dv = btVector3(0,0,0);
-            }
-            else
-            {
-                m_total_tangent_dv = m_total_tangent_dv.normalized() * m_total_normal_dv.safeNorm() * m_contact->m_c3;
-            }
-            impulse_tangent = -btScalar(1)/m_contact->m_c2 * (m_total_tangent_dv - old_total_tangent_dv);
-        }
-        else
-        {
-            // static friction
-            m_static = true;
-        }
-    }
-    impulse = impulse_normal + impulse_tangent;
-    // apply impulse to deformable nodes involved and change their velocities
-    applyImpulse(impulse);
-	if (residualSquare < 1e-7)
-		return residualSquare;
-    // apply impulse to the rigid/multibodies involved and change their velocities
-    if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
-    {
-        btRigidBody* rigidCol = 0;
-        rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
-        if (rigidCol)
-        {
-            rigidCol->applyImpulse(impulse, m_contact->m_c1);
-        }
-    }
-    else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
-    {
-        btMultiBodyLinkCollider* multibodyLinkCol = 0;
-        multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
-        if (multibodyLinkCol)
-        {
-            const btScalar* deltaV_normal = &m_contact->jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
-            // apply normal component of the impulse
-            multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_normal, impulse.dot(cti.m_normal));
-            if (impulse_tangent.norm() > SIMD_EPSILON)
-            {
-                // apply tangential component of the impulse
-                const btScalar* deltaV_t1 = &m_contact->jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
-                multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t1, impulse.dot(m_contact->t1));
-                const btScalar* deltaV_t2 = &m_contact->jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
-                multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t2, impulse.dot(m_contact->t2));
-            }
-        }
-    }
-//    va = getVa();
-//    vb = getVb();
-//    vr = vb - va;
-//    btScalar dn1 = btDot(vr, cti.m_normal) / 150;
-//    m_penetration += dn1;
-    return residualSquare;
+	const btSoftBody::sCti& cti = m_contact->m_cti;
+	btVector3 va = getVa();
+	btVector3 vb = getVb();
+	btVector3 vr = vb - va;
+	btScalar dn = btDot(vr, cti.m_normal) + m_total_normal_dv.dot(cti.m_normal) * infoGlobal.m_deformable_cfm;
+	if (m_penetration > 0)
+	{
+		dn += m_penetration / infoGlobal.m_timeStep;
+	}
+	if (!infoGlobal.m_splitImpulse)  // without split impulse, the ERP-scaled penetration term is folded into dn here
+	{
+		dn += m_penetration * infoGlobal.m_deformable_erp / infoGlobal.m_timeStep;
+	}
+	// dn is the normal component of velocity difference. Approximates the residual. // todo xuchenhan@: this prob needs to be scaled by dt
+	btVector3 impulse = m_contact->m_c0 * (vr + m_total_normal_dv * infoGlobal.m_deformable_cfm + ((m_penetration > 0) ? m_penetration / infoGlobal.m_timeStep * cti.m_normal : btVector3(0, 0, 0)));
+	if (!infoGlobal.m_splitImpulse)
+	{
+		impulse += m_contact->m_c0 * (m_penetration * infoGlobal.m_deformable_erp / infoGlobal.m_timeStep * cti.m_normal);
+	}
+	btVector3 impulse_normal = m_contact->m_c0 * (cti.m_normal * dn);
+	btVector3 impulse_tangent = impulse - impulse_normal;
+	if (dn > 0)
+	{
+		return 0;  // early-out: no impulse is applied and a zero residual is reported
+	}
+	m_binding = true;
+	btScalar residualSquare = dn * dn;
+	btVector3 old_total_tangent_dv = m_total_tangent_dv;
+	// m_c5 maps an impulse to a velocity change of the deformable node/face (inverse mass, used here in matrix form)
+	m_total_normal_dv -= m_contact->m_c5 * impulse_normal;
+	m_total_tangent_dv -= m_contact->m_c5 * impulse_tangent;
+
+	if (m_total_normal_dv.dot(cti.m_normal) < 0)
+	{
+		// separating in the normal direction
+		m_binding = false;
+		m_static = false;
+		impulse_tangent.setZero();
+	}
+	else
+	{
+		if (m_total_normal_dv.norm() * m_contact->m_c3 < m_total_tangent_dv.norm())
+		{
+			// dynamic friction
+			// with dynamic friction, the impulses are still applied to the two colliding objects; however, this does not pose a constraint in the cg solve, hence the change to dv merely serves to update velocity in the contact iterations.
+			m_static = false;
+			if (m_total_tangent_dv.safeNorm() < SIMD_EPSILON)
+			{
+				m_total_tangent_dv = btVector3(0, 0, 0);
+			}
+			else
+			{
+				m_total_tangent_dv = m_total_tangent_dv.normalized() * m_total_normal_dv.safeNorm() * m_contact->m_c3;
+			}
+			//            impulse_tangent = -btScalar(1)/m_contact->m_c2 * (m_total_tangent_dv - old_total_tangent_dv);
+			impulse_tangent = m_contact->m_c5.inverse() * (old_total_tangent_dv - m_total_tangent_dv);
+		}
+		else
+		{
+			// static friction
+			m_static = true;
+		}
+	}
+	impulse = impulse_normal + impulse_tangent;
+	// apply impulse to deformable nodes involved and change their velocities
+	applyImpulse(impulse);
+	// apply the same impulse to the rigid body / multibody link involved and change its velocity
+	if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+	{
+		btRigidBody* rigidCol = 0;
+		rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
+		if (rigidCol)
+		{
+			rigidCol->applyImpulse(impulse, m_contact->m_c1);
+		}
+	}
+	else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+	{
+		btMultiBodyLinkCollider* multibodyLinkCol = 0;
+		multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+		if (multibodyLinkCol)
+		{
+			const btScalar* deltaV_normal = &m_contact->jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
+			// apply normal component of the impulse
+			multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_normal, impulse.dot(cti.m_normal));
+			if (impulse_tangent.norm() > SIMD_EPSILON)
+			{
+				// apply the two tangential components of the impulse (along t1 and t2)
+				const btScalar* deltaV_t1 = &m_contact->jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
+				multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t1, impulse.dot(m_contact->t1));
+				const btScalar* deltaV_t2 = &m_contact->jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
+				multibodyLinkCol->m_multiBody->applyDeltaVeeMultiDof2(deltaV_t2, impulse.dot(m_contact->t2));
+			}
+		}
+	}
+	return residualSquare;
+}
+
+btScalar btDeformableRigidContactConstraint::solveSplitImpulse(const btContactSolverInfo& infoGlobal)
+{
+	btScalar MAX_PENETRATION_CORRECTION = infoGlobal.m_deformable_maxErrorReduction;  // bound on the accumulated m_total_split_impulse, enforced below
+	const btSoftBody::sCti& cti = m_contact->m_cti;
+	btVector3 vb = getSplitVb();
+	btVector3 va = getSplitVa();
+	btScalar p = m_penetration;
+	if (p > 0)
+	{
+		return 0;  // positive m_penetration: no split impulse is applied
+	}
+	btVector3 vr = vb - va;
+	btScalar dn = btDot(vr, cti.m_normal) + p * infoGlobal.m_deformable_erp / infoGlobal.m_timeStep;
+	if (dn > 0)
+	{
+		return 0;  // early-out: nothing is applied and a zero residual is reported
+	}
+	if (m_total_split_impulse + dn > MAX_PENETRATION_CORRECTION)  // clamp dn so the running total stays <= +MAX
+	{
+		dn = MAX_PENETRATION_CORRECTION - m_total_split_impulse;
+	}
+	if (m_total_split_impulse + dn < -MAX_PENETRATION_CORRECTION)  // clamp dn so the running total stays >= -MAX
+	{
+		dn = -MAX_PENETRATION_CORRECTION - m_total_split_impulse;
+	}
+	m_total_split_impulse += dn;
+
+	btScalar residualSquare = dn * dn;
+	const btVector3 impulse = m_contact->m_c0 * (cti.m_normal * dn);  // split impulse acts along the contact normal only
+	applySplitImpulse(impulse);
+
+	// apply split impulse to the rigid body / multibody link involved and change its push/split velocity
+	if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+	{
+		btRigidBody* rigidCol = 0;
+		rigidCol = (btRigidBody*)btRigidBody::upcast(cti.m_colObj);
+		if (rigidCol)
+		{
+			rigidCol->applyPushImpulse(impulse, m_contact->m_c1);
+		}
+	}
+	else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+	{
+		btMultiBodyLinkCollider* multibodyLinkCol = 0;
+		multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+		if (multibodyLinkCol)
+		{
+			const btScalar* deltaV_normal = &m_contact->jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
+			// apply normal component of the impulse
+			multibodyLinkCol->m_multiBody->applyDeltaSplitVeeMultiDof(deltaV_normal, impulse.dot(cti.m_normal));
+		}
+	}
+	return residualSquare;
 }
 /* ================   Node vs. Rigid   =================== */
 btDeformableNodeRigidContactConstraint::btDeformableNodeRigidContactConstraint(const btSoftBody::DeformableNodeRigidContact& contact, const btContactSolverInfo& infoGlobal)
-    : m_node(contact.m_node)
-    , btDeformableRigidContactConstraint(contact, infoGlobal)
-    {
-    }
+	: m_node(contact.m_node), btDeformableRigidContactConstraint(contact, infoGlobal)  // caches the contact's node pointer; the base handles the rest of the contact
+{
+}
 
 btDeformableNodeRigidContactConstraint::btDeformableNodeRigidContactConstraint(const btDeformableNodeRigidContactConstraint& other)
-: m_node(other.m_node)
-, btDeformableRigidContactConstraint(other)
+	: m_node(other.m_node), btDeformableRigidContactConstraint(other)  // copy shares the same node pointer as `other`
 {
 }
 
 btVector3 btDeformableNodeRigidContactConstraint::getVb() const
 {
-    return m_node->m_v;
+	return m_node->m_v;  // m_v of the contacting soft-body node
 }
 
+btVector3 btDeformableNodeRigidContactConstraint::getSplitVb() const
+{
+	return m_node->m_splitv;  // m_splitv of the contacting node (the field applySplitImpulse modifies)
+}
 
 btVector3 btDeformableNodeRigidContactConstraint::getDv(const btSoftBody::Node* node) const
 {
-    return m_total_normal_dv + m_total_tangent_dv;
+	return m_total_normal_dv + m_total_tangent_dv;  // `node` is not consulted; the sum of both accumulated dv vectors is returned as-is
 }
 
 void btDeformableNodeRigidContactConstraint::applyImpulse(const btVector3& impulse)
 {
-    const btSoftBody::DeformableNodeRigidContact* contact = getContact();
-    btVector3 dv = impulse * contact->m_c2;
-    contact->m_node->m_v -= dv;
+	const btSoftBody::DeformableNodeRigidContact* contact = getContact();
+	btVector3 dv = contact->m_c5 * impulse;  // m_c5 converts the impulse into a velocity change (replaces the former m_c2 scaling)
+	contact->m_node->m_v -= dv;  // the change is subtracted from the node's m_v
+}
+
+void btDeformableNodeRigidContactConstraint::applySplitImpulse(const btVector3& impulse)
+{
+	const btSoftBody::DeformableNodeRigidContact* contact = getContact();
+	btVector3 dv = contact->m_c5 * impulse;  // same impulse-to-velocity conversion as applyImpulse
+	contact->m_node->m_splitv -= dv;  // only m_splitv is modified; m_v is left untouched
 }
 
 /* ================   Face vs. Rigid   =================== */
 btDeformableFaceRigidContactConstraint::btDeformableFaceRigidContactConstraint(const btSoftBody::DeformableFaceRigidContact& contact, const btContactSolverInfo& infoGlobal, bool useStrainLimiting)
-: m_face(contact.m_face)
-, m_useStrainLimiting(useStrainLimiting)
-, btDeformableRigidContactConstraint(contact, infoGlobal)
+	: m_face(contact.m_face), m_useStrainLimiting(useStrainLimiting), btDeformableRigidContactConstraint(contact, infoGlobal)  // caches the contact's face pointer and the strain-limiting flag
 {
 }
 
 btDeformableFaceRigidContactConstraint::btDeformableFaceRigidContactConstraint(const btDeformableFaceRigidContactConstraint& other)
-: m_face(other.m_face)
-, m_useStrainLimiting(other.m_useStrainLimiting)
-, btDeformableRigidContactConstraint(other)
+	: m_face(other.m_face), m_useStrainLimiting(other.m_useStrainLimiting), btDeformableRigidContactConstraint(other)  // copy shares the same face pointer as `other`
 {
 }
 
 btVector3 btDeformableFaceRigidContactConstraint::getVb() const
 {
-    const btSoftBody::DeformableFaceRigidContact* contact = getContact();
-    btVector3 vb = m_face->m_n[0]->m_v * contact->m_bary[0] + m_face->m_n[1]->m_v * contact->m_bary[1] + m_face->m_n[2]->m_v * contact->m_bary[2];
-    return vb;
+	const btSoftBody::DeformableFaceRigidContact* contact = getContact();
+	btVector3 vb = m_face->m_n[0]->m_v * contact->m_bary[0] + m_face->m_n[1]->m_v * contact->m_bary[1] + m_face->m_n[2]->m_v * contact->m_bary[2];  // m_v of the three face nodes, each weighted by the matching contact->m_bary entry
+	return vb;
 }
 
-
 btVector3 btDeformableFaceRigidContactConstraint::getDv(const btSoftBody::Node* node) const
 {
-    btVector3 face_dv = m_total_normal_dv + m_total_tangent_dv;
-    const btSoftBody::DeformableFaceRigidContact* contact = getContact();
-    if (m_face->m_n[0] == node)
-    {
-        return face_dv * contact->m_weights[0];
-    }
-    if (m_face->m_n[1] == node)
-    {
-        return face_dv * contact->m_weights[1];
-    }
-    btAssert(node == m_face->m_n[2]);
-    return face_dv * contact->m_weights[2];
+	btVector3 face_dv = m_total_normal_dv + m_total_tangent_dv;  // total accumulated dv for the whole face
+	const btSoftBody::DeformableFaceRigidContact* contact = getContact();
+	if (m_face->m_n[0] == node)  // each node receives face_dv scaled by its own contact->m_weights entry
+	{
+		return face_dv * contact->m_weights[0];
+	}
+	if (m_face->m_n[1] == node)
+	{
+		return face_dv * contact->m_weights[1];
+	}
+	btAssert(node == m_face->m_n[2]);  // any node that is not m_n[0] or m_n[1] is expected to be m_n[2]
+	return face_dv * contact->m_weights[2];
 }
 
 void btDeformableFaceRigidContactConstraint::applyImpulse(const btVector3& impulse)
 {
-    const btSoftBody::DeformableFaceRigidContact* contact = getContact();
-    btVector3 dv = impulse * contact->m_c2;
-    btSoftBody::Face* face = contact->m_face;
-    
-    btVector3& v0 = face->m_n[0]->m_v;
-    btVector3& v1 = face->m_n[1]->m_v;
-    btVector3& v2 = face->m_n[2]->m_v;
-    const btScalar& im0 = face->m_n[0]->m_im;
-    const btScalar& im1 = face->m_n[1]->m_im;
-    const btScalar& im2 = face->m_n[2]->m_im;
-    if (im0 > 0)
-        v0 -= dv * contact->m_weights[0];
-    if (im1 > 0)
-        v1 -= dv * contact->m_weights[1];
-    if (im2 > 0)
-        v2 -= dv * contact->m_weights[2];
+	const btSoftBody::DeformableFaceRigidContact* contact = getContact();
+	btVector3 dv = impulse * contact->m_c2;
+	btSoftBody::Face* face = contact->m_face;
+
+	btVector3& v0 = face->m_n[0]->m_v;
+	btVector3& v1 = face->m_n[1]->m_v;
+	btVector3& v2 = face->m_n[2]->m_v;
+	const btScalar& im0 = face->m_n[0]->m_im;
+	const btScalar& im1 = face->m_n[1]->m_im;
+	const btScalar& im2 = face->m_n[2]->m_im;
+	if (im0 > 0)
+		v0 -= dv * contact->m_weights[0];
+	if (im1 > 0)
+		v1 -= dv * contact->m_weights[1];
+	if (im2 > 0)
+		v2 -= dv * contact->m_weights[2];
 	if (m_useStrainLimiting)
 	{
-		btScalar relaxation = 1./btScalar(m_infoGlobal->m_numIterations);
-		btScalar m01 = (relaxation/(im0 + im1));
-		btScalar m02 = (relaxation/(im0 + im2));
-		btScalar m12 = (relaxation/(im1 + im2));
-		#ifdef USE_STRAIN_RATE_LIMITING
+		btScalar relaxation = 1. / btScalar(m_infoGlobal->m_numIterations);
+		btScalar m01 = (relaxation / (im0 + im1));
+		btScalar m02 = (relaxation / (im0 + im2));
+		btScalar m12 = (relaxation / (im1 + im2));
+#ifdef USE_STRAIN_RATE_LIMITING
 		// apply strain limiting to prevent the new velocity to change the current length of the edge by more than 1%.
 		btScalar p = 0.01;
 		btVector3& x0 = face->m_n[0]->m_x;
 		btVector3& x1 = face->m_n[1]->m_x;
 		btVector3& x2 = face->m_n[2]->m_x;
-		const btVector3 x_diff[3] = {x1-x0, x2-x0, x2-x1};
-		const btVector3 v_diff[3] = {v1-v0, v2-v0, v2-v1};
+		const btVector3 x_diff[3] = {x1 - x0, x2 - x0, x2 - x1};
+		const btVector3 v_diff[3] = {v1 - v0, v2 - v0, v2 - v1};
 		btVector3 u[3];
 		btScalar x_diff_dot_u, dn[3];
 		btScalar dt = m_infoGlobal->m_timeStep;
@@ -404,172 +520,201 @@ void btDeformableFaceRigidContactConstraint::applyImpulse(const btVector3& impul
 		{
 			btScalar x_diff_norm = x_diff[i].safeNorm();
 			btScalar x_diff_norm_new = (x_diff[i] + v_diff[i] * dt).safeNorm();
-			btScalar strainRate = x_diff_norm_new/x_diff_norm;
+			btScalar strainRate = x_diff_norm_new / x_diff_norm;
 			u[i] = v_diff[i];
 			u[i].safeNormalize();
-			if (x_diff_norm == 0 || (1-p <= strainRate && strainRate <= 1+p))
+			if (x_diff_norm == 0 || (1 - p <= strainRate && strainRate <= 1 + p))
 			{
 				dn[i] = 0;
 				continue;
 			}
 			x_diff_dot_u = btDot(x_diff[i], u[i]);
 			btScalar s;
-			if (1-p > strainRate)
+			if (1 - p > strainRate)
 			{
-				s = 1/dt * (-x_diff_dot_u - btSqrt(x_diff_dot_u*x_diff_dot_u + (p*p-2*p) * x_diff_norm * x_diff_norm));
+				s = 1 / dt * (-x_diff_dot_u - btSqrt(x_diff_dot_u * x_diff_dot_u + (p * p - 2 * p) * x_diff_norm * x_diff_norm));
 			}
 			else
 			{
-				s = 1/dt * (-x_diff_dot_u + btSqrt(x_diff_dot_u*x_diff_dot_u + (p*p+2*p) * x_diff_norm * x_diff_norm));
+				s = 1 / dt * (-x_diff_dot_u + btSqrt(x_diff_dot_u * x_diff_dot_u + (p * p + 2 * p) * x_diff_norm * x_diff_norm));
 			}
 			//		x_diff_norm_new = (x_diff[i] + s * u[i] * dt).safeNorm();
 			//		strainRate = x_diff_norm_new/x_diff_norm;
 			dn[i] = s - v_diff[i].safeNorm();
 		}
-		btVector3 dv0 = im0 * (m01 * u[0]*(-dn[0]) + m02 * u[1]*-(dn[1]));
-		btVector3 dv1 = im1 * (m01 * u[0]*(dn[0]) + m12 * u[2]*(-dn[2]));
-		btVector3 dv2 = im2 * (m12 * u[2]*(dn[2]) + m02 * u[1]*(dn[1]));
-	#else
+		btVector3 dv0 = im0 * (m01 * u[0] * (-dn[0]) + m02 * u[1] * -(dn[1]));
+		btVector3 dv1 = im1 * (m01 * u[0] * (dn[0]) + m12 * u[2] * (-dn[2]));
+		btVector3 dv2 = im2 * (m12 * u[2] * (dn[2]) + m02 * u[1] * (dn[1]));
+#else
 		// apply strain limiting to prevent undamped modes
-		btVector3 dv0 = im0 * (m01 * (v1-v0) + m02 * (v2-v0));
-		btVector3 dv1 = im1 * (m01 * (v0-v1) + m12 * (v2-v1));
-		btVector3 dv2 = im2 * (m12 * (v1-v2) + m02 * (v0-v2));
-	#endif
+		btVector3 dv0 = im0 * (m01 * (v1 - v0) + m02 * (v2 - v0));
+		btVector3 dv1 = im1 * (m01 * (v0 - v1) + m12 * (v2 - v1));
+		btVector3 dv2 = im2 * (m12 * (v1 - v2) + m02 * (v0 - v2));
+#endif
 		v0 += dv0;
 		v1 += dv1;
 		v2 += dv2;
 	}
 }
 
+btVector3 btDeformableFaceRigidContactConstraint::getSplitVb() const
+{
+	const btSoftBody::DeformableFaceRigidContact* contact = getContact();
+	btVector3 vb = (m_face->m_n[0]->m_splitv) * contact->m_bary[0] + (m_face->m_n[1]->m_splitv) * contact->m_bary[1] + (m_face->m_n[2]->m_splitv) * contact->m_bary[2];
+	return vb;
+}
+
+void btDeformableFaceRigidContactConstraint::applySplitImpulse(const btVector3& impulse)
+{
+	const btSoftBody::DeformableFaceRigidContact* contact = getContact();
+	btVector3 dv = impulse * contact->m_c2;
+	btSoftBody::Face* face = contact->m_face;
+	btVector3& v0 = face->m_n[0]->m_splitv;
+	btVector3& v1 = face->m_n[1]->m_splitv;
+	btVector3& v2 = face->m_n[2]->m_splitv;
+	const btScalar& im0 = face->m_n[0]->m_im;
+	const btScalar& im1 = face->m_n[1]->m_im;
+	const btScalar& im2 = face->m_n[2]->m_im;
+	if (im0 > 0)
+	{
+		v0 -= dv * contact->m_weights[0];
+	}
+	if (im1 > 0)
+	{
+		v1 -= dv * contact->m_weights[1];
+	}
+	if (im2 > 0)
+	{
+		v2 -= dv * contact->m_weights[2];
+	}
+}
+
 /* ================   Face vs. Node   =================== */
 btDeformableFaceNodeContactConstraint::btDeformableFaceNodeContactConstraint(const btSoftBody::DeformableFaceNodeContact& contact, const btContactSolverInfo& infoGlobal)
-: m_node(contact.m_node)
-, m_face(contact.m_face)
-, m_contact(&contact)
-, btDeformableContactConstraint(contact.m_normal, infoGlobal)
+	: m_node(contact.m_node), m_face(contact.m_face), m_contact(&contact), btDeformableContactConstraint(contact.m_normal, infoGlobal)
 {
-    m_total_normal_dv.setZero();
-    m_total_tangent_dv.setZero();
+	m_total_normal_dv.setZero();
+	m_total_tangent_dv.setZero();
 }
 
 btVector3 btDeformableFaceNodeContactConstraint::getVa() const
 {
-    return m_node->m_v;
+	return m_node->m_v;
 }
 
 btVector3 btDeformableFaceNodeContactConstraint::getVb() const
 {
-    const btSoftBody::DeformableFaceNodeContact* contact = getContact();
-    btVector3 vb = m_face->m_n[0]->m_v * contact->m_bary[0] + m_face->m_n[1]->m_v * contact->m_bary[1] + m_face->m_n[2]->m_v * contact->m_bary[2];
-    return vb;
+	const btSoftBody::DeformableFaceNodeContact* contact = getContact();
+	btVector3 vb = m_face->m_n[0]->m_v * contact->m_bary[0] + m_face->m_n[1]->m_v * contact->m_bary[1] + m_face->m_n[2]->m_v * contact->m_bary[2];
+	return vb;
 }
 
 btVector3 btDeformableFaceNodeContactConstraint::getDv(const btSoftBody::Node* n) const
 {
-    btVector3 dv = m_total_normal_dv + m_total_tangent_dv;
-    if (n == m_node)
-        return dv;
-    const btSoftBody::DeformableFaceNodeContact* contact = getContact();
-    if (m_face->m_n[0] == n)
-    {
-        return dv * contact->m_weights[0];
-    }
-    if (m_face->m_n[1] == n)
-    {
-        return dv * contact->m_weights[1];
-    }
-    btAssert(n == m_face->m_n[2]);
-    return dv * contact->m_weights[2];
+	btVector3 dv = m_total_normal_dv + m_total_tangent_dv;
+	if (n == m_node)
+		return dv;
+	const btSoftBody::DeformableFaceNodeContact* contact = getContact();
+	if (m_face->m_n[0] == n)
+	{
+		return dv * contact->m_weights[0];
+	}
+	if (m_face->m_n[1] == n)
+	{
+		return dv * contact->m_weights[1];
+	}
+	btAssert(n == m_face->m_n[2]);
+	return dv * contact->m_weights[2];
 }
 
 btScalar btDeformableFaceNodeContactConstraint::solveConstraint(const btContactSolverInfo& infoGlobal)
 {
-    btVector3 va = getVa();
-    btVector3 vb = getVb();
-    btVector3 vr = vb - va;
-    const btScalar dn = btDot(vr, m_contact->m_normal);
-    // dn is the normal component of velocity diffrerence. Approximates the residual. // todo xuchenhan@: this prob needs to be scaled by dt
-    btScalar residualSquare = dn*dn;
-    btVector3 impulse = m_contact->m_c0 * vr;
-    const btVector3 impulse_normal = m_contact->m_c0 * (m_contact->m_normal * dn);
-    btVector3 impulse_tangent = impulse - impulse_normal;
-    
-    btVector3 old_total_tangent_dv = m_total_tangent_dv;
-    // m_c2 is the inverse mass of the deformable node/face
-    if (m_node->m_im > 0)
-    {
-        m_total_normal_dv -= impulse_normal * m_node->m_im;
-        m_total_tangent_dv -= impulse_tangent * m_node->m_im;
-    }
-    else
-    {
-        m_total_normal_dv -= impulse_normal * m_contact->m_imf;
-        m_total_tangent_dv -= impulse_tangent *  m_contact->m_imf;
-    }
-    
-    if (m_total_normal_dv.dot(m_contact->m_normal) > 0)
-    {
-        // separating in the normal direction
-        m_static = false;
-        m_total_tangent_dv = btVector3(0,0,0);
-        impulse_tangent.setZero();
-    }
-    else
-    {
-        if (m_total_normal_dv.norm() * m_contact->m_friction < m_total_tangent_dv.norm())
-        {
-            // dynamic friction
-            // with dynamic friction, the impulse are still applied to the two objects colliding, however, it does not pose a constraint in the cg solve, hence the change to dv merely serves to update velocity in the contact iterations.
-            m_static = false;
-            if (m_total_tangent_dv.safeNorm() < SIMD_EPSILON)
-            {
-                m_total_tangent_dv = btVector3(0,0,0);
-            }
-            else
-            {
-                m_total_tangent_dv = m_total_tangent_dv.normalized() * m_total_normal_dv.safeNorm() * m_contact->m_friction;
-            }
-            impulse_tangent = -btScalar(1)/m_node->m_im * (m_total_tangent_dv - old_total_tangent_dv);
-        }
-        else
-        {
-            // static friction
-            m_static = true;
-        }
-    }
-    impulse = impulse_normal + impulse_tangent;
-    // apply impulse to deformable nodes involved and change their velocities
-    applyImpulse(impulse);
-    return residualSquare;
+	btVector3 va = getVa();
+	btVector3 vb = getVb();
+	btVector3 vr = vb - va;
+	const btScalar dn = btDot(vr, m_contact->m_normal);
+	// dn is the normal component of velocity difference. Approximates the residual. // todo xuchenhan@: this prob needs to be scaled by dt
+	btScalar residualSquare = dn * dn;
+	btVector3 impulse = m_contact->m_c0 * vr;
+	const btVector3 impulse_normal = m_contact->m_c0 * (m_contact->m_normal * dn);
+	btVector3 impulse_tangent = impulse - impulse_normal;
+
+	btVector3 old_total_tangent_dv = m_total_tangent_dv;
+	// m_c2 is the inverse mass of the deformable node/face
+	if (m_node->m_im > 0)
+	{
+		m_total_normal_dv -= impulse_normal * m_node->m_im;
+		m_total_tangent_dv -= impulse_tangent * m_node->m_im;
+	}
+	else
+	{
+		m_total_normal_dv -= impulse_normal * m_contact->m_imf;
+		m_total_tangent_dv -= impulse_tangent * m_contact->m_imf;
+	}
+
+	if (m_total_normal_dv.dot(m_contact->m_normal) > 0)
+	{
+		// separating in the normal direction
+		m_static = false;
+		m_total_tangent_dv = btVector3(0, 0, 0);
+		impulse_tangent.setZero();
+	}
+	else
+	{
+		if (m_total_normal_dv.norm() * m_contact->m_friction < m_total_tangent_dv.norm())
+		{
+			// dynamic friction
+			// with dynamic friction, the impulses are still applied to the two colliding objects; however, this does not pose a constraint in the cg solve, hence the change to dv merely serves to update velocity in the contact iterations.
+			m_static = false;
+			if (m_total_tangent_dv.safeNorm() < SIMD_EPSILON)
+			{
+				m_total_tangent_dv = btVector3(0, 0, 0);
+			}
+			else
+			{
+				m_total_tangent_dv = m_total_tangent_dv.normalized() * m_total_normal_dv.safeNorm() * m_contact->m_friction;
+			}
+			impulse_tangent = -btScalar(1) / m_node->m_im * (m_total_tangent_dv - old_total_tangent_dv);
+		}
+		else
+		{
+			// static friction
+			m_static = true;
+		}
+	}
+	impulse = impulse_normal + impulse_tangent;
+	// apply impulse to deformable nodes involved and change their velocities
+	applyImpulse(impulse);
+	return residualSquare;
 }
 
 void btDeformableFaceNodeContactConstraint::applyImpulse(const btVector3& impulse)
 {
-    const btSoftBody::DeformableFaceNodeContact* contact = getContact();
-    btVector3 dva = impulse * contact->m_node->m_im;
-    btVector3 dvb = impulse * contact->m_imf;
-    if (contact->m_node->m_im > 0)
-    {
-        contact->m_node->m_v += dva;
-    }
-    
-    btSoftBody::Face* face = contact->m_face;
-    btVector3& v0 = face->m_n[0]->m_v;
-    btVector3& v1 = face->m_n[1]->m_v;
-    btVector3& v2 = face->m_n[2]->m_v;
-    const btScalar& im0 = face->m_n[0]->m_im;
-    const btScalar& im1 = face->m_n[1]->m_im;
-    const btScalar& im2 = face->m_n[2]->m_im;
-    if (im0 > 0)
-    {
-        v0 -= dvb * contact->m_weights[0];
-    }
-    if (im1 > 0)
-    {
-        v1 -= dvb * contact->m_weights[1];
-    }
-    if (im2 > 0)
-    {
-        v2 -= dvb * contact->m_weights[2];
-    }
+	const btSoftBody::DeformableFaceNodeContact* contact = getContact();
+	btVector3 dva = impulse * contact->m_node->m_im;
+	btVector3 dvb = impulse * contact->m_imf;
+	if (contact->m_node->m_im > 0)
+	{
+		contact->m_node->m_v += dva;
+	}
+
+	btSoftBody::Face* face = contact->m_face;
+	btVector3& v0 = face->m_n[0]->m_v;
+	btVector3& v1 = face->m_n[1]->m_v;
+	btVector3& v2 = face->m_n[2]->m_v;
+	const btScalar& im0 = face->m_n[0]->m_im;
+	const btScalar& im1 = face->m_n[1]->m_im;
+	const btScalar& im2 = face->m_n[2]->m_im;
+	if (im0 > 0)
+	{
+		v0 -= dvb * contact->m_weights[0];
+	}
+	if (im1 > 0)
+	{
+		v1 -= dvb * contact->m_weights[1];
+	}
+	if (im2 > 0)
+	{
+		v2 -= dvb * contact->m_weights[2];
+	}
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.h b/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.h
index 9f9d5bf0a3..1e2c9f5bce 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableContactConstraint.h
@@ -21,51 +21,49 @@
 class btDeformableContactConstraint
 {
 public:
-    // True if the friction is static
-    // False if the friction is dynamic
-    bool m_static;
+	// True if the friction is static
+	// False if the friction is dynamic
+	bool m_static;
 	const btContactSolverInfo* m_infoGlobal;
 
 	// normal of the contact
 	btVector3 m_normal;
 
-	btDeformableContactConstraint(const btVector3& normal, const btContactSolverInfo& infoGlobal): m_static(false), m_normal(normal), m_infoGlobal(&infoGlobal)
+	btDeformableContactConstraint(const btVector3& normal, const btContactSolverInfo& infoGlobal) : m_static(false), m_normal(normal), m_infoGlobal(&infoGlobal)
 	{
 	}
 
-	btDeformableContactConstraint(bool isStatic, const btVector3& normal, const btContactSolverInfo& infoGlobal): m_static(isStatic), m_normal(normal), m_infoGlobal(&infoGlobal)
+	btDeformableContactConstraint(bool isStatic, const btVector3& normal, const btContactSolverInfo& infoGlobal) : m_static(isStatic), m_normal(normal), m_infoGlobal(&infoGlobal)
 	{
 	}
-	
-	btDeformableContactConstraint(){}
+
+	btDeformableContactConstraint() {}
 
 	btDeformableContactConstraint(const btDeformableContactConstraint& other)
-	: m_static(other.m_static)
-	, m_normal(other.m_normal)
-	, m_infoGlobal(other.m_infoGlobal)
+		: m_static(other.m_static), m_normal(other.m_normal), m_infoGlobal(other.m_infoGlobal)
 	{
 	}
 
-    virtual ~btDeformableContactConstraint(){}
-    
-    // solve the constraint with inelastic impulse and return the error, which is the square of normal component of velocity diffrerence
-    // the constraint is solved by calculating the impulse between object A and B in the contact and apply the impulse to both objects involved in the contact
-    virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal) = 0;
-    
-    // get the velocity of the object A in the contact
-    virtual btVector3 getVa() const = 0;
-    
-    // get the velocity of the object B in the contact
-    virtual btVector3 getVb() const = 0;
-    
-    // get the velocity change of the soft body node in the constraint
-    virtual btVector3 getDv(const btSoftBody::Node*) const = 0;
-    
-    // apply impulse to the soft body node and/or face involved
-    virtual void applyImpulse(const btVector3& impulse) = 0;
-    
-    // scale the penetration depth by erp
-    virtual void setPenetrationScale(btScalar scale) = 0;
+	virtual ~btDeformableContactConstraint() {}
+
+	// solve the constraint with inelastic impulse and return the error, which is the square of normal component of velocity difference
+	// the constraint is solved by calculating the impulse between object A and B in the contact and applying the impulse to both objects involved in the contact
+	virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal) = 0;
+
+	// get the velocity of the object A in the contact
+	virtual btVector3 getVa() const = 0;
+
+	// get the velocity of the object B in the contact
+	virtual btVector3 getVb() const = 0;
+
+	// get the velocity change of the soft body node in the constraint
+	virtual btVector3 getDv(const btSoftBody::Node*) const = 0;
+
+	// apply impulse to the soft body node and/or face involved
+	virtual void applyImpulse(const btVector3& impulse) = 0;
+
+	// scale the penetration depth by erp
+	virtual void setPenetrationScale(btScalar scale) = 0;
 };
 
 //
@@ -73,42 +71,41 @@ public:
 class btDeformableStaticConstraint : public btDeformableContactConstraint
 {
 public:
-    btSoftBody::Node* m_node;
-    
-    btDeformableStaticConstraint(btSoftBody::Node* node, const btContactSolverInfo& infoGlobal): m_node(node), btDeformableContactConstraint(false, btVector3(0,0,0), infoGlobal)
-    {
-    }
-	btDeformableStaticConstraint(){}
-    btDeformableStaticConstraint(const btDeformableStaticConstraint& other)
-    : m_node(other.m_node)
-    , btDeformableContactConstraint(other)
-    {
-    }
-    
-    virtual ~btDeformableStaticConstraint(){}
-    
-    virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal)
-    {
-        return 0;
-    }
-
-    virtual btVector3 getVa() const
-    {
-        return btVector3(0,0,0);
-    }
-    
-    virtual btVector3 getVb() const
-    {
-        return btVector3(0,0,0);
-    }
-    
-    virtual btVector3 getDv(const btSoftBody::Node* n) const
-    {
-        return btVector3(0,0,0);
-    }
-    
-    virtual void applyImpulse(const btVector3& impulse){}
-    virtual void setPenetrationScale(btScalar scale){}
+	btSoftBody::Node* m_node;
+
+	btDeformableStaticConstraint(btSoftBody::Node* node, const btContactSolverInfo& infoGlobal) : m_node(node), btDeformableContactConstraint(false, btVector3(0, 0, 0), infoGlobal)
+	{
+	}
+	btDeformableStaticConstraint() {}
+	btDeformableStaticConstraint(const btDeformableStaticConstraint& other)
+		: m_node(other.m_node), btDeformableContactConstraint(other)
+	{
+	}
+
+	virtual ~btDeformableStaticConstraint() {}
+
+	virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal)
+	{
+		return 0;
+	}
+
+	virtual btVector3 getVa() const
+	{
+		return btVector3(0, 0, 0);
+	}
+
+	virtual btVector3 getVb() const
+	{
+		return btVector3(0, 0, 0);
+	}
+
+	virtual btVector3 getDv(const btSoftBody::Node* n) const
+	{
+		return btVector3(0, 0, 0);
+	}
+
+	virtual void applyImpulse(const btVector3& impulse) {}
+	virtual void setPenetrationScale(btScalar scale) {}
 };
 
 //
@@ -116,56 +113,67 @@ public:
 class btDeformableNodeAnchorConstraint : public btDeformableContactConstraint
 {
 public:
-    const btSoftBody::DeformableNodeRigidAnchor* m_anchor;
-	
-    btDeformableNodeAnchorConstraint(const btSoftBody::DeformableNodeRigidAnchor& c, const btContactSolverInfo& infoGlobal);
-    btDeformableNodeAnchorConstraint(const btDeformableNodeAnchorConstraint& other);
-	btDeformableNodeAnchorConstraint(){}
-    virtual ~btDeformableNodeAnchorConstraint()
-    {
-    }
-    virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal);
-
-    // object A is the rigid/multi body, and object B is the deformable node/face
-    virtual btVector3 getVa() const;
-    // get the velocity of the deformable node in contact
-    virtual btVector3 getVb() const;
-    virtual btVector3 getDv(const btSoftBody::Node* n) const
-    {
-        return btVector3(0,0,0);
-    }
-    virtual void applyImpulse(const btVector3& impulse);
-
-    virtual void setPenetrationScale(btScalar scale){}
-};
+	const btSoftBody::DeformableNodeRigidAnchor* m_anchor;
 
+	btDeformableNodeAnchorConstraint(const btSoftBody::DeformableNodeRigidAnchor& c, const btContactSolverInfo& infoGlobal);
+	btDeformableNodeAnchorConstraint(const btDeformableNodeAnchorConstraint& other);
+	btDeformableNodeAnchorConstraint() {}
+	virtual ~btDeformableNodeAnchorConstraint()
+	{
+	}
+	virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal);
+
+	// object A is the rigid/multi body, and object B is the deformable node/face
+	virtual btVector3 getVa() const;
+	// get the velocity of the deformable node in contact
+	virtual btVector3 getVb() const;
+	virtual btVector3 getDv(const btSoftBody::Node* n) const
+	{
+		return btVector3(0, 0, 0);
+	}
+	virtual void applyImpulse(const btVector3& impulse);
+
+	virtual void setPenetrationScale(btScalar scale) {}
+};
 
 //
 // Constraint between rigid/multi body and deformable objects
 class btDeformableRigidContactConstraint : public btDeformableContactConstraint
 {
 public:
-    btVector3 m_total_normal_dv;
-    btVector3 m_total_tangent_dv;
-    btScalar m_penetration;
-    const btSoftBody::DeformableRigidContact* m_contact;
-	
-    btDeformableRigidContactConstraint(const btSoftBody::DeformableRigidContact& c, const btContactSolverInfo& infoGlobal);
-    btDeformableRigidContactConstraint(const btDeformableRigidContactConstraint& other);
-	btDeformableRigidContactConstraint(){}
-    virtual ~btDeformableRigidContactConstraint()
-    {
-    }
-    
-    // object A is the rigid/multi body, and object B is the deformable node/face
-    virtual btVector3 getVa() const;
-    
-    virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal);
-    
-    virtual void setPenetrationScale(btScalar scale)
-    {
-        m_penetration *= scale;
-    }
+	btVector3 m_total_normal_dv;
+	btVector3 m_total_tangent_dv;
+	btScalar m_penetration;
+	btScalar m_total_split_impulse;
+	bool m_binding;
+	const btSoftBody::DeformableRigidContact* m_contact;
+
+	btDeformableRigidContactConstraint(const btSoftBody::DeformableRigidContact& c, const btContactSolverInfo& infoGlobal);
+	btDeformableRigidContactConstraint(const btDeformableRigidContactConstraint& other);
+	btDeformableRigidContactConstraint() {}
+	virtual ~btDeformableRigidContactConstraint()
+	{
+	}
+
+	// object A is the rigid/multi body, and object B is the deformable node/face
+	virtual btVector3 getVa() const;
+
+	// get the split impulse velocity of the deformable node/face at the contact point
+	virtual btVector3 getSplitVb() const = 0;
+
+	// get the split impulse velocity of the rigid/multibody at the contact
+	virtual btVector3 getSplitVa() const;
+
+	virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal);
+
+	virtual void setPenetrationScale(btScalar scale)
+	{
+		m_penetration *= scale;
+	}
+
+	btScalar solveSplitImpulse(const btContactSolverInfo& infoGlobal);
+
+	virtual void applySplitImpulse(const btVector3& impulse) = 0;
 };
 
 //
@@ -173,29 +181,34 @@ public:
 class btDeformableNodeRigidContactConstraint : public btDeformableRigidContactConstraint
 {
 public:
-    // the deformable node in contact
-    btSoftBody::Node* m_node;
-	
-    btDeformableNodeRigidContactConstraint(const btSoftBody::DeformableNodeRigidContact& contact, const btContactSolverInfo& infoGlobal);
-    btDeformableNodeRigidContactConstraint(const btDeformableNodeRigidContactConstraint& other);
-	btDeformableNodeRigidContactConstraint(){}
-    virtual ~btDeformableNodeRigidContactConstraint()
-    {
-    }
-    
-    // get the velocity of the deformable node in contact
-    virtual btVector3 getVb() const;
-    
-    // get the velocity change of the input soft body node in the constraint
-    virtual btVector3 getDv(const btSoftBody::Node*) const;
-    
-    // cast the contact to the desired type
-    const btSoftBody::DeformableNodeRigidContact* getContact() const
-    {
-        return static_cast<const btSoftBody::DeformableNodeRigidContact*>(m_contact);
-    }
-    
-    virtual void applyImpulse(const btVector3& impulse);
+	// the deformable node in contact
+	btSoftBody::Node* m_node;
+
+	btDeformableNodeRigidContactConstraint(const btSoftBody::DeformableNodeRigidContact& contact, const btContactSolverInfo& infoGlobal);
+	btDeformableNodeRigidContactConstraint(const btDeformableNodeRigidContactConstraint& other);
+	btDeformableNodeRigidContactConstraint() {}
+	virtual ~btDeformableNodeRigidContactConstraint()
+	{
+	}
+
+	// get the velocity of the deformable node in contact
+	virtual btVector3 getVb() const;
+
+	// get the split impulse velocity of the deformable node in contact
+	virtual btVector3 getSplitVb() const;
+
+	// get the velocity change of the input soft body node in the constraint
+	virtual btVector3 getDv(const btSoftBody::Node*) const;
+
+	// cast the contact to the desired type
+	const btSoftBody::DeformableNodeRigidContact* getContact() const
+	{
+		return static_cast<const btSoftBody::DeformableNodeRigidContact*>(m_contact);
+	}
+
+	virtual void applyImpulse(const btVector3& impulse);
+
+	virtual void applySplitImpulse(const btVector3& impulse);
 };
 
 //
@@ -203,28 +216,33 @@ public:
 class btDeformableFaceRigidContactConstraint : public btDeformableRigidContactConstraint
 {
 public:
-    const btSoftBody::Face* m_face;
-    bool m_useStrainLimiting;
-    btDeformableFaceRigidContactConstraint(const btSoftBody::DeformableFaceRigidContact& contact, const btContactSolverInfo& infoGlobal, bool useStrainLimiting);
-    btDeformableFaceRigidContactConstraint(const btDeformableFaceRigidContactConstraint& other);
-    btDeformableFaceRigidContactConstraint(): m_useStrainLimiting(false) {}
-    virtual ~btDeformableFaceRigidContactConstraint()
-    {
-    }
-    
-    // get the velocity of the deformable face at the contact point
-    virtual btVector3 getVb() const;
-    
-    // get the velocity change of the input soft body node in the constraint
-    virtual btVector3 getDv(const btSoftBody::Node*) const;
-    
-    // cast the contact to the desired type
-    const btSoftBody::DeformableFaceRigidContact* getContact() const
-    {
-        return static_cast<const btSoftBody::DeformableFaceRigidContact*>(m_contact);
-    }
-    
-    virtual void applyImpulse(const btVector3& impulse);
+	btSoftBody::Face* m_face;
+	bool m_useStrainLimiting;
+	btDeformableFaceRigidContactConstraint(const btSoftBody::DeformableFaceRigidContact& contact, const btContactSolverInfo& infoGlobal, bool useStrainLimiting);
+	btDeformableFaceRigidContactConstraint(const btDeformableFaceRigidContactConstraint& other);
+	btDeformableFaceRigidContactConstraint() : m_useStrainLimiting(false) {}
+	virtual ~btDeformableFaceRigidContactConstraint()
+	{
+	}
+
+	// get the velocity of the deformable face at the contact point
+	virtual btVector3 getVb() const;
+
+	// get the split impulse velocity of the deformable face at the contact point
+	virtual btVector3 getSplitVb() const;
+
+	// get the velocity change of the input soft body node in the constraint
+	virtual btVector3 getDv(const btSoftBody::Node*) const;
+
+	// cast the contact to the desired type
+	const btSoftBody::DeformableFaceRigidContact* getContact() const
+	{
+		return static_cast<const btSoftBody::DeformableFaceRigidContact*>(m_contact);
+	}
+
+	virtual void applyImpulse(const btVector3& impulse);
+
+	virtual void applySplitImpulse(const btVector3& impulse);
 };
 
 //
@@ -232,35 +250,35 @@ public:
 class btDeformableFaceNodeContactConstraint : public btDeformableContactConstraint
 {
 public:
-    btSoftBody::Node* m_node;
-    btSoftBody::Face* m_face;
-    const btSoftBody::DeformableFaceNodeContact* m_contact;
-    btVector3 m_total_normal_dv;
-    btVector3 m_total_tangent_dv;
-    
-    btDeformableFaceNodeContactConstraint(const btSoftBody::DeformableFaceNodeContact& contact, const btContactSolverInfo& infoGlobal);
-	btDeformableFaceNodeContactConstraint(){}
-    virtual ~btDeformableFaceNodeContactConstraint(){}
-    
-    virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal);
-    
-    // get the velocity of the object A in the contact
-    virtual btVector3 getVa() const;
-    
-    // get the velocity of the object B in the contact
-    virtual btVector3 getVb() const;
-    
-    // get the velocity change of the input soft body node in the constraint
-    virtual btVector3 getDv(const btSoftBody::Node*) const;
-    
-    // cast the contact to the desired type
-    const btSoftBody::DeformableFaceNodeContact* getContact() const
-    {
-        return static_cast<const btSoftBody::DeformableFaceNodeContact*>(m_contact);
-    }
-    
-    virtual void applyImpulse(const btVector3& impulse);
-
-    virtual void setPenetrationScale(btScalar scale){}
+	btSoftBody::Node* m_node;
+	btSoftBody::Face* m_face;
+	const btSoftBody::DeformableFaceNodeContact* m_contact;
+	btVector3 m_total_normal_dv;
+	btVector3 m_total_tangent_dv;
+
+	btDeformableFaceNodeContactConstraint(const btSoftBody::DeformableFaceNodeContact& contact, const btContactSolverInfo& infoGlobal);
+	btDeformableFaceNodeContactConstraint() {}
+	virtual ~btDeformableFaceNodeContactConstraint() {}
+
+	virtual btScalar solveConstraint(const btContactSolverInfo& infoGlobal);
+
+	// get the velocity of the object A in the contact
+	virtual btVector3 getVa() const;
+
+	// get the velocity of the object B in the contact
+	virtual btVector3 getVb() const;
+
+	// get the velocity change of the input soft body node in the constraint
+	virtual btVector3 getDv(const btSoftBody::Node*) const;
+
+	// cast the contact to the desired type
+	const btSoftBody::DeformableFaceNodeContact* getContact() const
+	{
+		return static_cast<const btSoftBody::DeformableFaceNodeContact*>(m_contact);
+	}
+
+	virtual void applyImpulse(const btVector3& impulse);
+
+	virtual void setPenetrationScale(btScalar scale) {}
 };
 #endif /* BT_DEFORMABLE_CONTACT_CONSTRAINT_H */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.cpp b/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.cpp
index 22ca8bf582..7f67260ce6 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.cpp
@@ -17,7 +17,7 @@
 #include "btDeformableMultiBodyDynamicsWorld.h"
 #include <algorithm>
 #include <cmath>
-btScalar btDeformableContactProjection::update(btCollisionObject** deformableBodies,int numDeformableBodies, const btContactSolverInfo& infoGlobal)
+btScalar btDeformableContactProjection::update(btCollisionObject** deformableBodies, int numDeformableBodies, const btContactSolverInfo& infoGlobal)
 {
 	btScalar residualSquare = 0;
 	for (int i = 0; i < numDeformableBodies; ++i)
@@ -58,27 +58,37 @@ btScalar btDeformableContactProjection::update(btCollisionObject** deformableBod
 	return residualSquare;
 }
 
-void btDeformableContactProjection::splitImpulseSetup(const btContactSolverInfo& infoGlobal)
+btScalar btDeformableContactProjection::solveSplitImpulse(btCollisionObject** deformableBodies, int numDeformableBodies, const btContactSolverInfo& infoGlobal)
 {
-	for (int i = 0; i < m_softBodies.size(); ++i)
+	btScalar residualSquare = 0;
+	for (int i = 0; i < numDeformableBodies; ++i)
 	{
-		// node constraints
-		for (int j = 0; j < m_nodeRigidConstraints[i].size(); ++j)
-		{
-			btDeformableNodeRigidContactConstraint& constraint = m_nodeRigidConstraints[i][j];
-			constraint.setPenetrationScale(infoGlobal.m_deformable_erp);
-		}
-		// face constraints
-		for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
+		for (int j = 0; j < m_softBodies.size(); ++j)
 		{
-			btDeformableFaceRigidContactConstraint& constraint = m_faceRigidConstraints[i][j];
-			constraint.setPenetrationScale(infoGlobal.m_deformable_erp);
+			btCollisionObject* psb = m_softBodies[j];
+			if (psb != deformableBodies[i])
+			{
+				continue;
+			}
+			for (int k = 0; k < m_nodeRigidConstraints[j].size(); ++k)
+			{
+				btDeformableNodeRigidContactConstraint& constraint = m_nodeRigidConstraints[j][k];
+				btScalar localResidualSquare = constraint.solveSplitImpulse(infoGlobal);
+				residualSquare = btMax(residualSquare, localResidualSquare);
+			}
+			for (int k = 0; k < m_faceRigidConstraints[j].size(); ++k)
+			{
+				btDeformableFaceRigidContactConstraint& constraint = m_faceRigidConstraints[j][k];
+				btScalar localResidualSquare = constraint.solveSplitImpulse(infoGlobal);
+				residualSquare = btMax(residualSquare, localResidualSquare);
+			}
 		}
 	}
+	return residualSquare;
 }
 
 void btDeformableContactProjection::setConstraints(const btContactSolverInfo& infoGlobal)
-{  
+{
 	BT_PROFILE("setConstraints");
 	for (int i = 0; i < m_softBodies.size(); ++i)
 	{
@@ -97,7 +107,7 @@ void btDeformableContactProjection::setConstraints(const btContactSolverInfo& in
 				m_staticConstraints[i].push_back(static_constraint);
 			}
 		}
-		
+
 		// set up deformable anchors
 		for (int j = 0; j < psb->m_deformableAnchors.size(); ++j)
 		{
@@ -111,7 +121,7 @@ void btDeformableContactProjection::setConstraints(const btContactSolverInfo& in
 			btDeformableNodeAnchorConstraint constraint(anchor, infoGlobal);
 			m_nodeAnchorConstraints[i].push_back(constraint);
 		}
-		
+
 		// set Deformable Node vs. Rigid constraint
 		for (int j = 0; j < psb->m_nodeRigidContacts.size(); ++j)
 		{
@@ -122,17 +132,9 @@ void btDeformableContactProjection::setConstraints(const btContactSolverInfo& in
 				continue;
 			}
 			btDeformableNodeRigidContactConstraint constraint(contact, infoGlobal);
-			btVector3 va = constraint.getVa();
-			btVector3 vb = constraint.getVb();
-			const btVector3 vr = vb - va;
-			const btSoftBody::sCti& cti = contact.m_cti;
-			const btScalar dn = btDot(vr, cti.m_normal);
-			if (dn < SIMD_EPSILON)
-			{
-				m_nodeRigidConstraints[i].push_back(constraint);
-			}
+			m_nodeRigidConstraints[i].push_back(constraint);
 		}
-		
+
 		// set Deformable Face vs. Rigid constraint
 		for (int j = 0; j < psb->m_faceRigidContacts.size(); ++j)
 		{
@@ -143,15 +145,7 @@ void btDeformableContactProjection::setConstraints(const btContactSolverInfo& in
 				continue;
 			}
 			btDeformableFaceRigidContactConstraint constraint(contact, infoGlobal, m_useStrainLimiting);
-			btVector3 va = constraint.getVa();
-			btVector3 vb = constraint.getVb();
-			const btVector3 vr = vb - va;
-			const btSoftBody::sCti& cti = contact.m_cti;
-			const btScalar dn = btDot(vr, cti.m_normal);
-			if (dn < SIMD_EPSILON)
-			{
-				m_faceRigidConstraints[i].push_back(constraint);
-			}
+			m_faceRigidConstraints[i].push_back(constraint);
 		}
 	}
 }
@@ -159,267 +153,269 @@ void btDeformableContactProjection::setConstraints(const btContactSolverInfo& in
 void btDeformableContactProjection::project(TVStack& x)
 {
 #ifndef USE_MGS
-    const int dim = 3;
-    for (int index = 0; index < m_projectionsDict.size(); ++index)
-    {
-        btAlignedObjectArray<btVector3>& projectionDirs = *m_projectionsDict.getAtIndex(index);
-        size_t i = m_projectionsDict.getKeyAtIndex(index).getUid1();
-        if (projectionDirs.size() >= dim)
-        {
-            // static node
-            x[i].setZero();
-            continue;
-        }
-        else if (projectionDirs.size() == 2)
-        {
-            btVector3 dir0 = projectionDirs[0];
-            btVector3 dir1 = projectionDirs[1];
-            btVector3 free_dir = btCross(dir0, dir1);
-            if (free_dir.safeNorm() < SIMD_EPSILON)
-            {
-                x[i] -= x[i].dot(dir0) * dir0;
-                x[i] -= x[i].dot(dir1) * dir1;
-            }
-            else
-            {
-                free_dir.normalize();
-                x[i] = x[i].dot(free_dir) * free_dir;
-            }
-        }
-        else
-        {
-            btAssert(projectionDirs.size() == 1);
-            btVector3 dir0 = projectionDirs[0];
-            x[i] -= x[i].dot(dir0) * dir0;
-        }
-    }
+	const int dim = 3;
+	for (int index = 0; index < m_projectionsDict.size(); ++index)
+	{
+		btAlignedObjectArray<btVector3>& projectionDirs = *m_projectionsDict.getAtIndex(index);
+		size_t i = m_projectionsDict.getKeyAtIndex(index).getUid1();
+		if (projectionDirs.size() >= dim)
+		{
+			// static node
+			x[i].setZero();
+			continue;
+		}
+		else if (projectionDirs.size() == 2)
+		{
+			btVector3 dir0 = projectionDirs[0];
+			btVector3 dir1 = projectionDirs[1];
+			btVector3 free_dir = btCross(dir0, dir1);
+			if (free_dir.safeNorm() < SIMD_EPSILON)
+			{
+				x[i] -= x[i].dot(dir0) * dir0;
+			}
+			else
+			{
+				free_dir.normalize();
+				x[i] = x[i].dot(free_dir) * free_dir;
+			}
+		}
+		else
+		{
+			btAssert(projectionDirs.size() == 1);
+			btVector3 dir0 = projectionDirs[0];
+			x[i] -= x[i].dot(dir0) * dir0;
+		}
+	}
 #else
-    btReducedVector p(x.size());
-    for (int i = 0; i < m_projections.size(); ++i)
-    {
-        p += (m_projections[i].dot(x) * m_projections[i]);
-    }
-    for (int i = 0; i < p.m_indices.size(); ++i)
-    {
-        x[p.m_indices[i]] -= p.m_vecs[i];
-    }
+	btReducedVector p(x.size());
+	for (int i = 0; i < m_projections.size(); ++i)
+	{
+		p += (m_projections[i].dot(x) * m_projections[i]);
+	}
+	for (int i = 0; i < p.m_indices.size(); ++i)
+	{
+		x[p.m_indices[i]] -= p.m_vecs[i];
+	}
 #endif
 }
 
 void btDeformableContactProjection::setProjection()
 {
 #ifndef USE_MGS
-    BT_PROFILE("btDeformableContactProjection::setProjection");
-    btAlignedObjectArray<btVector3> units;
-    units.push_back(btVector3(1,0,0));
-    units.push_back(btVector3(0,1,0));
-    units.push_back(btVector3(0,0,1));
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (!psb->isActive())
-        {
-            continue;
-        }
-        for (int j = 0; j < m_staticConstraints[i].size(); ++j)
-        {
-            int index = m_staticConstraints[i][j].m_node->index;
-            m_staticConstraints[i][j].m_node->m_penetration = SIMD_INFINITY;
-            if (m_projectionsDict.find(index) == NULL)
-            {
-                m_projectionsDict.insert(index, units);
-            }
-            else
-            {
-                btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
-                for (int k = 0; k < 3; ++k)
-                {
-                    projections.push_back(units[k]);
-                }
-            }
-        }
-        for (int j = 0; j < m_nodeAnchorConstraints[i].size(); ++j)
-        {
-            int index = m_nodeAnchorConstraints[i][j].m_anchor->m_node->index;
-            m_nodeAnchorConstraints[i][j].m_anchor->m_node->m_penetration = SIMD_INFINITY;
-            if (m_projectionsDict.find(index) == NULL)
-            {
-                m_projectionsDict.insert(index, units);
-            }
-            else
-            {
-                btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
-                for (int k = 0; k < 3; ++k)
-                {
-                    projections.push_back(units[k]);
-                }
-            }
-        }
-        for (int j = 0; j < m_nodeRigidConstraints[i].size(); ++j)
-        {
-            int index = m_nodeRigidConstraints[i][j].m_node->index;
-            m_nodeRigidConstraints[i][j].m_node->m_penetration = -m_nodeRigidConstraints[i][j].getContact()->m_cti.m_offset;
-            if (m_nodeRigidConstraints[i][j].m_static)
-            {
-                if (m_projectionsDict.find(index) == NULL)
-                {
-                    m_projectionsDict.insert(index, units);
-                }
-                else
-                {
-                    btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
-                    for (int k = 0; k < 3; ++k)
-                    {
-                        projections.push_back(units[k]);
-                    }
-                }
-            }
-            else
-            {
-                if (m_projectionsDict.find(index) == NULL)
-                {
-                    btAlignedObjectArray<btVector3> projections;
-                    projections.push_back(m_nodeRigidConstraints[i][j].m_normal);
-                    m_projectionsDict.insert(index, projections);
-                }
-                else
-                {
-                    btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
-                    projections.push_back(m_nodeRigidConstraints[i][j].m_normal);
-                }
-            }
-        }
-        for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
-        {
-            const btSoftBody::Face* face = m_faceRigidConstraints[i][j].m_face;
-            btScalar penetration = -m_faceRigidConstraints[i][j].getContact()->m_cti.m_offset;
-            for (int k = 0; k < 3; ++k)
-            {
-                face->m_n[k]->m_penetration = btMax(face->m_n[k]->m_penetration, penetration);
-            }
-            for (int k = 0; k < 3; ++k)
-            {
-                btSoftBody::Node* node = face->m_n[k];
-                node->m_penetration = true;
-                int index = node->index;
-                if (m_faceRigidConstraints[i][j].m_static)
-                {
-                    if (m_projectionsDict.find(index) == NULL)
-                    {
-                        m_projectionsDict.insert(index, units);
-                    }
-                    else
-                    {
-                        btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
-                        for (int k = 0; k < 3; ++k)
-                        {
-                            projections.push_back(units[k]);
-                        }
-                    }
-                }
-                else
-                {
-                    if (m_projectionsDict.find(index) == NULL)
-                    {
-                        btAlignedObjectArray<btVector3> projections;
-                        projections.push_back(m_faceRigidConstraints[i][j].m_normal);
-                        m_projectionsDict.insert(index, projections);
-                    }
-                    else
-                    {
-                        btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
-                        projections.push_back(m_faceRigidConstraints[i][j].m_normal);
-                    }
-                }
-            }
-        }
-    }
+	BT_PROFILE("btDeformableContactProjection::setProjection");
+	btAlignedObjectArray<btVector3> units;
+	units.push_back(btVector3(1, 0, 0));
+	units.push_back(btVector3(0, 1, 0));
+	units.push_back(btVector3(0, 0, 1));
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (!psb->isActive())
+		{
+			continue;
+		}
+		for (int j = 0; j < m_staticConstraints[i].size(); ++j)
+		{
+			int index = m_staticConstraints[i][j].m_node->index;
+			m_staticConstraints[i][j].m_node->m_constrained = true;
+			if (m_projectionsDict.find(index) == NULL)
+			{
+				m_projectionsDict.insert(index, units);
+			}
+			else
+			{
+				btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
+				for (int k = 0; k < 3; ++k)
+				{
+					projections.push_back(units[k]);
+				}
+			}
+		}
+		for (int j = 0; j < m_nodeAnchorConstraints[i].size(); ++j)
+		{
+			int index = m_nodeAnchorConstraints[i][j].m_anchor->m_node->index;
+			m_nodeAnchorConstraints[i][j].m_anchor->m_node->m_constrained = true;
+			if (m_projectionsDict.find(index) == NULL)
+			{
+				m_projectionsDict.insert(index, units);
+			}
+			else
+			{
+				btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
+				for (int k = 0; k < 3; ++k)
+				{
+					projections.push_back(units[k]);
+				}
+			}
+		}
+		for (int j = 0; j < m_nodeRigidConstraints[i].size(); ++j)
+		{
+			int index = m_nodeRigidConstraints[i][j].m_node->index;
+			m_nodeRigidConstraints[i][j].m_node->m_constrained = true;
+			if (m_nodeRigidConstraints[i][j].m_binding)
+			{
+				if (m_nodeRigidConstraints[i][j].m_static)
+				{
+					if (m_projectionsDict.find(index) == NULL)
+					{
+						m_projectionsDict.insert(index, units);
+					}
+					else
+					{
+						btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
+						for (int k = 0; k < 3; ++k)
+						{
+							projections.push_back(units[k]);
+						}
+					}
+				}
+				else
+				{
+					if (m_projectionsDict.find(index) == NULL)
+					{
+						btAlignedObjectArray<btVector3> projections;
+						projections.push_back(m_nodeRigidConstraints[i][j].m_normal);
+						m_projectionsDict.insert(index, projections);
+					}
+					else
+					{
+						btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
+						projections.push_back(m_nodeRigidConstraints[i][j].m_normal);
+					}
+				}
+			}
+		}
+		for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
+		{
+			const btSoftBody::Face* face = m_faceRigidConstraints[i][j].m_face;
+			if (m_faceRigidConstraints[i][j].m_binding)
+			{
+				for (int k = 0; k < 3; ++k)
+				{
+					face->m_n[k]->m_constrained = true;
+				}
+			}
+			for (int k = 0; k < 3; ++k)
+			{
+				btSoftBody::Node* node = face->m_n[k];
+				int index = node->index;
+				if (m_faceRigidConstraints[i][j].m_static)
+				{
+					if (m_projectionsDict.find(index) == NULL)
+					{
+						m_projectionsDict.insert(index, units);
+					}
+					else
+					{
+						btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
+						for (int l = 0; l < 3; ++l)
+						{
+							projections.push_back(units[l]);
+						}
+					}
+				}
+				else
+				{
+					if (m_projectionsDict.find(index) == NULL)
+					{
+						btAlignedObjectArray<btVector3> projections;
+						projections.push_back(m_faceRigidConstraints[i][j].m_normal);
+						m_projectionsDict.insert(index, projections);
+					}
+					else
+					{
+						btAlignedObjectArray<btVector3>& projections = *m_projectionsDict[index];
+						projections.push_back(m_faceRigidConstraints[i][j].m_normal);
+					}
+				}
+			}
+		}
+	}
 #else
-    int dof = 0;
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        dof += m_softBodies[i]->m_nodes.size();
-    }
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (!psb->isActive())
-        {
-            continue;
-        }
-        for (int j = 0; j < m_staticConstraints[i].size(); ++j)
-        {
-            int index = m_staticConstraints[i][j].m_node->index;
-            m_staticConstraints[i][j].m_node->m_penetration = SIMD_INFINITY;
-            btAlignedObjectArray<int> indices;
-            btAlignedObjectArray<btVector3> vecs1,vecs2,vecs3;
-            indices.push_back(index);
-            vecs1.push_back(btVector3(1,0,0));
-            vecs2.push_back(btVector3(0,1,0));
-            vecs3.push_back(btVector3(0,0,1));
-            m_projections.push_back(btReducedVector(dof, indices, vecs1));
-            m_projections.push_back(btReducedVector(dof, indices, vecs2));
-            m_projections.push_back(btReducedVector(dof, indices, vecs3));
-        }
-        
-        for (int j = 0; j < m_nodeAnchorConstraints[i].size(); ++j)
-        {
-            int index = m_nodeAnchorConstraints[i][j].m_anchor->m_node->index;
-            m_nodeAnchorConstraints[i][j].m_anchor->m_node->m_penetration = SIMD_INFINITY;
-            btAlignedObjectArray<int> indices;
-            btAlignedObjectArray<btVector3> vecs1,vecs2,vecs3;
-            indices.push_back(index);
-            vecs1.push_back(btVector3(1,0,0));
-            vecs2.push_back(btVector3(0,1,0));
-            vecs3.push_back(btVector3(0,0,1));
-            m_projections.push_back(btReducedVector(dof, indices, vecs1));
-            m_projections.push_back(btReducedVector(dof, indices, vecs2));
-            m_projections.push_back(btReducedVector(dof, indices, vecs3));
-        }
-        for (int j = 0; j < m_nodeRigidConstraints[i].size(); ++j)
-        {
-            int index = m_nodeRigidConstraints[i][j].m_node->index;
-            m_nodeRigidConstraints[i][j].m_node->m_penetration = -m_nodeRigidConstraints[i][j].getContact()->m_cti.m_offset;
-            btAlignedObjectArray<int> indices;
-            indices.push_back(index);
-            btAlignedObjectArray<btVector3> vecs1,vecs2,vecs3;
-            if (m_nodeRigidConstraints[i][j].m_static)
-            {
-                vecs1.push_back(btVector3(1,0,0));
-                vecs2.push_back(btVector3(0,1,0));
-                vecs3.push_back(btVector3(0,0,1));
-                m_projections.push_back(btReducedVector(dof, indices, vecs1));
-                m_projections.push_back(btReducedVector(dof, indices, vecs2));
-                m_projections.push_back(btReducedVector(dof, indices, vecs3));
-            }
-            else
-            {
-                vecs1.push_back(m_nodeRigidConstraints[i][j].m_normal);
-                m_projections.push_back(btReducedVector(dof, indices, vecs1));
-            }
-        }
-        for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
-        {
-            const btSoftBody::Face* face = m_faceRigidConstraints[i][j].m_face;
+	int dof = 0;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		dof += m_softBodies[i]->m_nodes.size();
+	}
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (!psb->isActive())
+		{
+			continue;
+		}
+		for (int j = 0; j < m_staticConstraints[i].size(); ++j)
+		{
+			int index = m_staticConstraints[i][j].m_node->index;
+			m_staticConstraints[i][j].m_node->m_penetration = SIMD_INFINITY;
+			btAlignedObjectArray<int> indices;
+			btAlignedObjectArray<btVector3> vecs1, vecs2, vecs3;
+			indices.push_back(index);
+			vecs1.push_back(btVector3(1, 0, 0));
+			vecs2.push_back(btVector3(0, 1, 0));
+			vecs3.push_back(btVector3(0, 0, 1));
+			m_projections.push_back(btReducedVector(dof, indices, vecs1));
+			m_projections.push_back(btReducedVector(dof, indices, vecs2));
+			m_projections.push_back(btReducedVector(dof, indices, vecs3));
+		}
+
+		for (int j = 0; j < m_nodeAnchorConstraints[i].size(); ++j)
+		{
+			int index = m_nodeAnchorConstraints[i][j].m_anchor->m_node->index;
+			m_nodeAnchorConstraints[i][j].m_anchor->m_node->m_penetration = SIMD_INFINITY;
+			btAlignedObjectArray<int> indices;
+			btAlignedObjectArray<btVector3> vecs1, vecs2, vecs3;
+			indices.push_back(index);
+			vecs1.push_back(btVector3(1, 0, 0));
+			vecs2.push_back(btVector3(0, 1, 0));
+			vecs3.push_back(btVector3(0, 0, 1));
+			m_projections.push_back(btReducedVector(dof, indices, vecs1));
+			m_projections.push_back(btReducedVector(dof, indices, vecs2));
+			m_projections.push_back(btReducedVector(dof, indices, vecs3));
+		}
+		for (int j = 0; j < m_nodeRigidConstraints[i].size(); ++j)
+		{
+			int index = m_nodeRigidConstraints[i][j].m_node->index;
+			m_nodeRigidConstraints[i][j].m_node->m_penetration = -m_nodeRigidConstraints[i][j].getContact()->m_cti.m_offset;
+			btAlignedObjectArray<int> indices;
+			indices.push_back(index);
+			btAlignedObjectArray<btVector3> vecs1, vecs2, vecs3;
+			if (m_nodeRigidConstraints[i][j].m_static)
+			{
+				vecs1.push_back(btVector3(1, 0, 0));
+				vecs2.push_back(btVector3(0, 1, 0));
+				vecs3.push_back(btVector3(0, 0, 1));
+				m_projections.push_back(btReducedVector(dof, indices, vecs1));
+				m_projections.push_back(btReducedVector(dof, indices, vecs2));
+				m_projections.push_back(btReducedVector(dof, indices, vecs3));
+			}
+			else
+			{
+				vecs1.push_back(m_nodeRigidConstraints[i][j].m_normal);
+				m_projections.push_back(btReducedVector(dof, indices, vecs1));
+			}
+		}
+		for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
+		{
+			const btSoftBody::Face* face = m_faceRigidConstraints[i][j].m_face;
 			btVector3 bary = m_faceRigidConstraints[i][j].getContact()->m_bary;
-            btScalar penetration = -m_faceRigidConstraints[i][j].getContact()->m_cti.m_offset;
-            for (int k = 0; k < 3; ++k)
-            {
-                face->m_n[k]->m_penetration = btMax(face->m_n[k]->m_penetration, penetration);
-            }
+			btScalar penetration = -m_faceRigidConstraints[i][j].getContact()->m_cti.m_offset;
+			for (int k = 0; k < 3; ++k)
+			{
+				face->m_n[k]->m_penetration = btMax(face->m_n[k]->m_penetration, penetration);
+			}
 			if (m_faceRigidConstraints[i][j].m_static)
 			{
 				for (int l = 0; l < 3; ++l)
 				{
-					
 					btReducedVector rv(dof);
 					for (int k = 0; k < 3; ++k)
 					{
 						rv.m_indices.push_back(face->m_n[k]->index);
-						btVector3 v(0,0,0);
+						btVector3 v(0, 0, 0);
 						v[l] = bary[k];
 						rv.m_vecs.push_back(v);
-                        rv.sort();
+						rv.sort();
 					}
 					m_projections.push_back(rv);
 				}
@@ -431,121 +427,134 @@ void btDeformableContactProjection::setProjection()
 				{
 					rv.m_indices.push_back(face->m_n[k]->index);
 					rv.m_vecs.push_back(bary[k] * m_faceRigidConstraints[i][j].m_normal);
-                    rv.sort();
+					rv.sort();
 				}
 				m_projections.push_back(rv);
 			}
 		}
-    }
-    btModifiedGramSchmidt<btReducedVector> mgs(m_projections);
-    mgs.solve();
-    m_projections = mgs.m_out;
+	}
+	btModifiedGramSchmidt<btReducedVector> mgs(m_projections);
+	mgs.solve();
+	m_projections = mgs.m_out;
 #endif
 }
 
 void btDeformableContactProjection::checkConstraints(const TVStack& x)
 {
-    for (int i = 0; i < m_lagrangeMultipliers.size(); ++i)
-    {
-        btVector3 d(0,0,0);
-        const LagrangeMultiplier& lm = m_lagrangeMultipliers[i];
-        for (int j = 0; j < lm.m_num_constraints; ++j)
-        {
-            for (int k = 0; k < lm.m_num_nodes; ++k)
-            {
-                d[j] += lm.m_weights[k] * x[lm.m_indices[k]].dot(lm.m_dirs[j]);
-            }
-        }
-        printf("d = %f, %f, %f\n",d[0],d[1],d[2]);
-    }
+	for (int i = 0; i < m_lagrangeMultipliers.size(); ++i)
+	{
+		btVector3 d(0, 0, 0);
+		const LagrangeMultiplier& lm = m_lagrangeMultipliers[i];
+		for (int j = 0; j < lm.m_num_constraints; ++j)
+		{
+			for (int k = 0; k < lm.m_num_nodes; ++k)
+			{
+				d[j] += lm.m_weights[k] * x[lm.m_indices[k]].dot(lm.m_dirs[j]);
+			}
+		}
+		// Debug aid (disabled): d[j] = sum over k of m_weights[k] * dot(x[m_indices[k]], m_dirs[j]).
+		// printf("d = %f, %f, %f\n", d[0], d[1], d[2]);
+	}
 }
 
 void btDeformableContactProjection::setLagrangeMultiplier()
 {
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (!psb->isActive())
-        {
-            continue;
-        }
-        for (int j = 0; j < m_staticConstraints[i].size(); ++j)
-        {
-            int index = m_staticConstraints[i][j].m_node->index;
-            m_staticConstraints[i][j].m_node->m_penetration = SIMD_INFINITY;
-            LagrangeMultiplier lm;
-            lm.m_num_nodes = 1;
-            lm.m_indices[0] = index;
-            lm.m_weights[0] = 1.0;
-            lm.m_num_constraints = 3;
-            lm.m_dirs[0] = btVector3(1,0,0);
-            lm.m_dirs[1] = btVector3(0,1,0);
-            lm.m_dirs[2] = btVector3(0,0,1);
-            m_lagrangeMultipliers.push_back(lm);
-        }
-        for (int j = 0; j < m_nodeAnchorConstraints[i].size(); ++j)
-        {
-            int index = m_nodeAnchorConstraints[i][j].m_anchor->m_node->index;
-            m_nodeAnchorConstraints[i][j].m_anchor->m_node->m_penetration = SIMD_INFINITY;
-            LagrangeMultiplier lm;
-            lm.m_num_nodes = 1;
-            lm.m_indices[0] = index;
-            lm.m_weights[0] = 1.0;
-            lm.m_num_constraints = 3;
-            lm.m_dirs[0] = btVector3(1,0,0);
-            lm.m_dirs[1] = btVector3(0,1,0);
-            lm.m_dirs[2] = btVector3(0,0,1);
-            m_lagrangeMultipliers.push_back(lm);
-        }
-        for (int j = 0; j < m_nodeRigidConstraints[i].size(); ++j)
-        {
-            int index = m_nodeRigidConstraints[i][j].m_node->index;
-            m_nodeRigidConstraints[i][j].m_node->m_penetration = -m_nodeRigidConstraints[i][j].getContact()->m_cti.m_offset;
-            LagrangeMultiplier lm;
-            lm.m_num_nodes = 1;
-            lm.m_indices[0] = index;
-            lm.m_weights[0] = 1.0;
-            if (m_nodeRigidConstraints[i][j].m_static)
-            {
-                lm.m_num_constraints = 3;
-                lm.m_dirs[0] = btVector3(1,0,0);
-                lm.m_dirs[1] = btVector3(0,1,0);
-                lm.m_dirs[2] = btVector3(0,0,1);
-            }
-            else
-            {
-                lm.m_num_constraints = 1;
-                lm.m_dirs[0] = m_nodeRigidConstraints[i][j].m_normal;
-            }
-            m_lagrangeMultipliers.push_back(lm);
-        }
-        for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
-        {
-            const btSoftBody::Face* face = m_faceRigidConstraints[i][j].m_face;
-			
-            btVector3 bary = m_faceRigidConstraints[i][j].getContact()->m_bary;
-            btScalar penetration = -m_faceRigidConstraints[i][j].getContact()->m_cti.m_offset;
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (!psb->isActive())
+		{
+			continue;
+		}
+		for (int j = 0; j < m_staticConstraints[i].size(); ++j)
+		{
+			int index = m_staticConstraints[i][j].m_node->index;
+			m_staticConstraints[i][j].m_node->m_constrained = true;
+			LagrangeMultiplier lm;
+			lm.m_num_nodes = 1;
+			lm.m_indices[0] = index;
+			lm.m_weights[0] = 1.0;
+			lm.m_num_constraints = 3;
+			lm.m_dirs[0] = btVector3(1, 0, 0);
+			lm.m_dirs[1] = btVector3(0, 1, 0);
+			lm.m_dirs[2] = btVector3(0, 0, 1);
+			m_lagrangeMultipliers.push_back(lm);
+		}
+		for (int j = 0; j < m_nodeAnchorConstraints[i].size(); ++j)
+		{
+			int index = m_nodeAnchorConstraints[i][j].m_anchor->m_node->index;
+			m_nodeAnchorConstraints[i][j].m_anchor->m_node->m_constrained = true;
+			LagrangeMultiplier lm;
+			lm.m_num_nodes = 1;
+			lm.m_indices[0] = index;
+			lm.m_weights[0] = 1.0;
+			lm.m_num_constraints = 3;
+			lm.m_dirs[0] = btVector3(1, 0, 0);
+			lm.m_dirs[1] = btVector3(0, 1, 0);
+			lm.m_dirs[2] = btVector3(0, 0, 1);
+			m_lagrangeMultipliers.push_back(lm);
+		}
+
+		for (int j = 0; j < m_nodeRigidConstraints[i].size(); ++j)
+		{
+			if (!m_nodeRigidConstraints[i][j].m_binding)
+			{
+				continue;
+			}
+			int index = m_nodeRigidConstraints[i][j].m_node->index;
+			m_nodeRigidConstraints[i][j].m_node->m_constrained = true;
+			LagrangeMultiplier lm;
+			lm.m_num_nodes = 1;
+			lm.m_indices[0] = index;
+			lm.m_weights[0] = 1.0;
+			if (m_nodeRigidConstraints[i][j].m_static)
+			{
+				lm.m_num_constraints = 3;
+				lm.m_dirs[0] = btVector3(1, 0, 0);
+				lm.m_dirs[1] = btVector3(0, 1, 0);
+				lm.m_dirs[2] = btVector3(0, 0, 1);
+			}
+			else
+			{
+				lm.m_num_constraints = 1;
+				lm.m_dirs[0] = m_nodeRigidConstraints[i][j].m_normal;
+			}
+			m_lagrangeMultipliers.push_back(lm);
+		}
+
+		for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
+		{
+			if (!m_faceRigidConstraints[i][j].m_binding)
+			{
+				continue;
+			}
+			btSoftBody::Face* face = m_faceRigidConstraints[i][j].m_face;
+
+			btVector3 bary = m_faceRigidConstraints[i][j].getContact()->m_bary;
 			LagrangeMultiplier lm;
 			lm.m_num_nodes = 3;
-			for (int k = 0; k<3; ++k)
+
+			for (int k = 0; k < 3; ++k)
 			{
-				face->m_n[k]->m_penetration = btMax(face->m_n[k]->m_penetration, penetration);
+				face->m_n[k]->m_constrained = true;
 				lm.m_indices[k] = face->m_n[k]->index;
 				lm.m_weights[k] = bary[k];
 			}
-            if (m_faceRigidConstraints[i][j].m_static)
-            {
+			if (m_faceRigidConstraints[i][j].m_static)
+			{
+				face->m_pcontact[3] = 1;
 				lm.m_num_constraints = 3;
-				lm.m_dirs[0] = btVector3(1,0,0);
-				lm.m_dirs[1] = btVector3(0,1,0);
-				lm.m_dirs[2] = btVector3(0,0,1);
+				lm.m_dirs[0] = btVector3(1, 0, 0);
+				lm.m_dirs[1] = btVector3(0, 1, 0);
+				lm.m_dirs[2] = btVector3(0, 0, 1);
 			}
 			else
 			{
+				face->m_pcontact[3] = 0;
 				lm.m_num_constraints = 1;
 				lm.m_dirs[0] = m_faceRigidConstraints[i][j].m_normal;
 			}
-            m_lagrangeMultipliers.push_back(lm);
+			m_lagrangeMultipliers.push_back(lm);
 		}
 	}
 }
@@ -562,7 +571,7 @@ void btDeformableContactProjection::applyDynamicFriction(TVStack& f)
 			if (node->m_im != 0)
 			{
 				int index = node->index;
-				f[index] += constraint.getDv(node)* (1./node->m_im);
+				f[index] += constraint.getDv(node) * (1. / node->m_im);
 			}
 		}
 		for (int j = 0; j < m_faceRigidConstraints[i].size(); ++j)
@@ -575,7 +584,7 @@ void btDeformableContactProjection::applyDynamicFriction(TVStack& f)
 				if (node->m_im != 0)
 				{
 					int index = node->index;
-					f[index] += constraint.getDv(node)* (1./node->m_im);
+					f[index] += constraint.getDv(node) * (1. / node->m_im);
 				}
 			}
 		}
@@ -587,7 +596,7 @@ void btDeformableContactProjection::applyDynamicFriction(TVStack& f)
 			if (node->m_im != 0)
 			{
 				int index = node->index;
-				f[index] += constraint.getDv(node)* (1./node->m_im);
+				f[index] += constraint.getDv(node) * (1. / node->m_im);
 			}
 			for (int k = 0; k < 3; ++k)
 			{
@@ -595,7 +604,7 @@ void btDeformableContactProjection::applyDynamicFriction(TVStack& f)
 				if (node->m_im != 0)
 				{
 					int index = node->index;
-					f[index] += constraint.getDv(node)* (1./node->m_im);
+					f[index] += constraint.getDv(node) * (1. / node->m_im);
 				}
 			}
 		}
@@ -612,9 +621,8 @@ void btDeformableContactProjection::reinitialize(bool nodeUpdated)
 		m_nodeRigidConstraints.resize(N);
 		m_faceRigidConstraints.resize(N);
 		m_deformableConstraints.resize(N);
-		
 	}
-	for (int i = 0 ; i < N; ++i)
+	for (int i = 0; i < N; ++i)
 	{
 		m_staticConstraints[i].clear();
 		m_nodeAnchorConstraints[i].clear();
@@ -623,12 +631,9 @@ void btDeformableContactProjection::reinitialize(bool nodeUpdated)
 		m_deformableConstraints[i].clear();
 	}
 #ifndef USE_MGS
-    m_projectionsDict.clear();
+	m_projectionsDict.clear();
 #else
-    m_projections.clear();
+	m_projections.clear();
 #endif
-    m_lagrangeMultipliers.clear();
+	m_lagrangeMultipliers.clear();
 }
-
-
-
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.h b/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.h
index 8d7e94d4fb..4964eaf990 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableContactProjection.h
@@ -27,31 +27,30 @@
 
 struct LagrangeMultiplier
 {
-    int m_num_constraints;        // Number of constraints
-    int m_num_nodes;              // Number of nodes in these constraints
-    btScalar m_weights[3];        // weights of the nodes involved, same size as m_num_nodes
-    btVector3 m_dirs[3];          // Constraint directions, same size of m_num_constraints;
-    int m_indices[3];             // indices of the nodes involved, same size as m_num_nodes;
+	int m_num_constraints;  // Number of constraints
+	int m_num_nodes;        // Number of nodes in these constraints
+	btScalar m_weights[3];  // weights of the nodes involved; the first m_num_nodes entries are used
+	btVector3 m_dirs[3];    // constraint directions; the first m_num_constraints entries are used
+	int m_indices[3];       // indices of the nodes involved; the first m_num_nodes entries are used
 };
 
-
 class btDeformableContactProjection
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btAlignedObjectArray<btSoftBody *>& m_softBodies;
-	
-    // all constraints involving face
-    btAlignedObjectArray<btDeformableContactConstraint*> m_allFaceConstraints;
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btAlignedObjectArray<btSoftBody*>& m_softBodies;
+
+	// all constraints involving face
+	btAlignedObjectArray<btDeformableContactConstraint*> m_allFaceConstraints;
 #ifndef USE_MGS
-    // map from node index to projection directions
-    btHashMap<btHashInt, btAlignedObjectArray<btVector3> > m_projectionsDict;
+	// map from node index to projection directions
+	btHashMap<btHashInt, btAlignedObjectArray<btVector3> > m_projectionsDict;
 #else
-    btAlignedObjectArray<btReducedVector> m_projections;
+	btAlignedObjectArray<btReducedVector> m_projections;
 #endif
-    
-    btAlignedObjectArray<LagrangeMultiplier> m_lagrangeMultipliers;
-    
+
+	btAlignedObjectArray<LagrangeMultiplier> m_lagrangeMultipliers;
+
 	// map from node index to static constraint
 	btAlignedObjectArray<btAlignedObjectArray<btDeformableStaticConstraint> > m_staticConstraints;
 	// map from node index to node rigid constraint
@@ -62,39 +61,39 @@ public:
 	btAlignedObjectArray<btAlignedObjectArray<btDeformableFaceNodeContactConstraint> > m_deformableConstraints;
 	// map from node index to node anchor constraint
 	btAlignedObjectArray<btAlignedObjectArray<btDeformableNodeAnchorConstraint> > m_nodeAnchorConstraints;
-    
-    bool m_useStrainLimiting;
-    
-    btDeformableContactProjection(btAlignedObjectArray<btSoftBody *>& softBodies)
-    : m_softBodies(softBodies)
-    {
-    }
-    
-    virtual ~btDeformableContactProjection()
-    {
-    }
-    
-    // apply the constraints to the rhs of the linear solve
-    virtual void project(TVStack& x);
-    
-    // add friction force to the rhs of the linear solve
-    virtual void applyDynamicFriction(TVStack& f);
-    
-    // update and solve the constraints
-    virtual btScalar update(btCollisionObject** deformableBodies,int numDeformableBodies, const btContactSolverInfo& infoGlobal);
-    
-    // Add constraints to m_constraints. In addition, the constraints that each vertex own are recorded in m_constraintsDict.
-    virtual void setConstraints(const btContactSolverInfo& infoGlobal);
-    
-    // Set up projections for each vertex by adding the projection direction to
-    virtual void setProjection();
-    
-    virtual void reinitialize(bool nodeUpdated);
-    
-    virtual void splitImpulseSetup(const btContactSolverInfo& infoGlobal);
-    
-    virtual void setLagrangeMultiplier();
-    
-    void checkConstraints(const TVStack& x);
+
+	bool m_useStrainLimiting;
+
+	btDeformableContactProjection(btAlignedObjectArray<btSoftBody*>& softBodies)
+		: m_softBodies(softBodies)
+	{
+	}
+
+	virtual ~btDeformableContactProjection()
+	{
+	}
+
+	// apply the constraints to the rhs of the linear solve
+	virtual void project(TVStack& x);
+
+	// add friction force to the rhs of the linear solve
+	virtual void applyDynamicFriction(TVStack& f);
+
+	// update and solve the constraints
+	virtual btScalar update(btCollisionObject** deformableBodies, int numDeformableBodies, const btContactSolverInfo& infoGlobal);
+
+	// Add constraints to m_constraints. In addition, the constraints that each vertex owns are recorded in m_constraintsDict.
+	virtual void setConstraints(const btContactSolverInfo& infoGlobal);
+
+	// Set up the projection directions for each vertex
+	virtual void setProjection();
+
+	virtual void reinitialize(bool nodeUpdated);
+
+	btScalar solveSplitImpulse(btCollisionObject** deformableBodies, int numDeformableBodies, const btContactSolverInfo& infoGlobal);
+
+	virtual void setLagrangeMultiplier();
+
+	void checkConstraints(const TVStack& x);
 };
 #endif /* btDeformableContactProjection_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableCorotatedForce.h b/thirdparty/bullet/BulletSoftBody/btDeformableCorotatedForce.h
index 2d042df729..dfd85523bc 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableCorotatedForce.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableCorotatedForce.h
@@ -21,107 +21,104 @@
 
 static inline int PolarDecomposition(const btMatrix3x3& m, btMatrix3x3& q, btMatrix3x3& s)
 {
-    static const btPolarDecomposition polar;
-    return polar.decompose(m, q, s);
+	static const btPolarDecomposition polar;
+	return polar.decompose(m, q, s);
 }
 
 class btDeformableCorotatedForce : public btDeformableLagrangianForce
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btScalar m_mu, m_lambda;
-    btDeformableCorotatedForce(): m_mu(1), m_lambda(1)
-    {
-        
-    }
-    
-    btDeformableCorotatedForce(btScalar mu, btScalar lambda): m_mu(mu), m_lambda(lambda)
-    {
-    }
-    
-    virtual void addScaledForces(btScalar scale, TVStack& force)
-    {
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
-    {
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledDampingForce(btScalar scale, TVStack& force)
-    {
-    }
-    
-    virtual void addScaledElasticForce(btScalar scale, TVStack& force)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btMatrix3x3 P;
-                firstPiola(tetra.m_F,P);
-                btVector3 force_on_node0 = P * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 force_on_node123 = P * tetra.m_Dm_inverse.transpose();
-                
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                
-                // elastic force
-                // explicit elastic force
-                btScalar scale1 = scale * tetra.m_element_measure;
-                force[id0] -= scale1 * force_on_node0;
-                force[id1] -= scale1 * force_on_node123.getColumn(0);
-                force[id2] -= scale1 * force_on_node123.getColumn(1);
-                force[id3] -= scale1 * force_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    void firstPiola(const btMatrix3x3& F, btMatrix3x3& P)
-    {
-        // btMatrix3x3 JFinvT = F.adjoint();
-        btScalar J = F.determinant();
-        P =  F.adjoint().transpose() * (m_lambda * (J-1));
-        if (m_mu > SIMD_EPSILON)
-        {
-            btMatrix3x3 R,S;
-            if (J < 1024 * SIMD_EPSILON)
-                R.setIdentity();
-            else
-                PolarDecomposition(F, R, S); // this QR is not robust, consider using implicit shift svd
-            /*https://fuchuyuan.github.io/research/svd/paper.pdf*/
-            P += (F-R) * 2 * m_mu;
-        }
-    }
-    
-    virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
-    {
-    }
-    
-    virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
-    {
-    }
-    
-    virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA){}
-    
-    virtual btDeformableLagrangianForceType getForceType()
-    {
-        return BT_COROTATED_FORCE;
-    }
-    
-};
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btScalar m_mu, m_lambda;
+	btDeformableCorotatedForce() : m_mu(1), m_lambda(1)
+	{
+	}
+
+	btDeformableCorotatedForce(btScalar mu, btScalar lambda) : m_mu(mu), m_lambda(lambda)
+	{
+	}
+
+	virtual void addScaledForces(btScalar scale, TVStack& force)
+	{
+		addScaledElasticForce(scale, force);
+	}
+
+	virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
+	{
+		addScaledElasticForce(scale, force);
+	}
+
+	virtual void addScaledDampingForce(btScalar scale, TVStack& force)
+	{
+	}
+
+	virtual void addScaledElasticForce(btScalar scale, TVStack& force)
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btMatrix3x3 P;
+				firstPiola(tetra.m_F, P);
+				btVector3 force_on_node0 = P * (tetra.m_Dm_inverse.transpose() * grad_N_hat_1st_col);
+				btMatrix3x3 force_on_node123 = P * tetra.m_Dm_inverse.transpose();
+
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
 
+				// elastic force
+				// explicit elastic force
+				btScalar scale1 = scale * tetra.m_element_measure;
+				force[id0] -= scale1 * force_on_node0;
+				force[id1] -= scale1 * force_on_node123.getColumn(0);
+				force[id2] -= scale1 * force_on_node123.getColumn(1);
+				force[id3] -= scale1 * force_on_node123.getColumn(2);
+			}
+		}
+	}
+
+	void firstPiola(const btMatrix3x3& F, btMatrix3x3& P)  // writes the stress for F into P (presumably the first Piola-Kirchhoff stress, per the name)
+	{
+		// P = F.adjoint().transpose() * lambda * (J - 1), plus 2 * mu * (F - R) when m_mu > SIMD_EPSILON, where J = det(F)
+		btScalar J = F.determinant();
+		P = F.adjoint().transpose() * (m_lambda * (J - 1));
+		if (m_mu > SIMD_EPSILON)  // skip the mu term (and the decomposition) when mu is effectively zero
+		{
+			btMatrix3x3 R, S;
+			if (J < 1024 * SIMD_EPSILON)  // tiny or negative determinant: fall back to R = identity instead of decomposing F
+				R.setIdentity();
+			else
+				PolarDecomposition(F, R, S);  // this QR is not robust, consider using implicit shift svd
+			/*https://fuchuyuan.github.io/research/svd/paper.pdf*/
+			P += (F - R) * 2 * m_mu;
+		}
+	}
+
+	virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
+	{
+	}
+
+	virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
+	{
+	}
+
+	virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA) {}
+
+	virtual btDeformableLagrangianForceType getForceType()
+	{
+		return BT_COROTATED_FORCE;
+	}
+};
 
 #endif /* btCorotated_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableGravityForce.h b/thirdparty/bullet/BulletSoftBody/btDeformableGravityForce.h
index 13ee3eacb6..d91867f457 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableGravityForce.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableGravityForce.h
@@ -21,87 +21,85 @@
 class btDeformableGravityForce : public btDeformableLagrangianForce
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btVector3 m_gravity;
-    
-    btDeformableGravityForce(const btVector3& g) : m_gravity(g)
-    {
-    }
-    
-    virtual void addScaledForces(btScalar scale, TVStack& force)
-    {
-        addScaledGravityForce(scale, force);
-    }
-    
-    virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
-    {
-        addScaledGravityForce(scale, force);
-    }
-    
-    virtual void addScaledDampingForce(btScalar scale, TVStack& force)
-    {
-    }
-    
-    virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
-    {
-    }
-    
-    virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
-    {
-    }
-    
-    virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA){}
-    
-    virtual void addScaledGravityForce(btScalar scale, TVStack& force)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                btSoftBody::Node& n = psb->m_nodes[j];
-                size_t id = n.index;
-                btScalar mass = (n.m_im == 0) ? 0 : 1. / n.m_im;
-                btVector3 scaled_force = scale * m_gravity * mass;
-                force[id] += scaled_force;
-            }
-        }
-    }
-    
-    virtual btDeformableLagrangianForceType getForceType()
-    {
-        return BT_GRAVITY_FORCE;
-    }
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btVector3 m_gravity;
 
-    // the gravitational potential energy
-    virtual double totalEnergy(btScalar dt)
-    {
-        double e = 0;
-        for (int i = 0; i<m_softBodies.size();++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                const btSoftBody::Node& node = psb->m_nodes[j];
-                if (node.m_im > 0)
-                {
-                    e -= m_gravity.dot(node.m_q)/node.m_im;
-                }
-            }
-        }
-        return e;
-    }
-    
-    
+	btDeformableGravityForce(const btVector3& g) : m_gravity(g)
+	{
+	}
+
+	virtual void addScaledForces(btScalar scale, TVStack& force)
+	{
+		addScaledGravityForce(scale, force);
+	}
+
+	virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
+	{
+		addScaledGravityForce(scale, force);
+	}
+
+	virtual void addScaledDampingForce(btScalar scale, TVStack& force)
+	{
+	}
+
+	virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
+	{
+	}
+
+	virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
+	{
+	}
+
+	virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA) {}
+
+	virtual void addScaledGravityForce(btScalar scale, TVStack& force)  // adds scale * gravity * mass * gravityFactor to force[] for every node of each active soft body
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());  // force needs at least one entry per node
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())  // inactive soft bodies receive no gravity
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				btSoftBody::Node& n = psb->m_nodes[j];
+				size_t id = n.index;  // force is addressed by the node's index field, not by j
+				btScalar mass = (n.m_im == 0) ? 0 : 1. / n.m_im;  // m_im == 0 maps to zero mass, so such nodes get no gravity force
+				btVector3 scaled_force = scale * m_gravity * mass * m_softBodies[i]->m_gravityFactor;  // per-body gravity factor scales the force
+				force[id] += scaled_force;
+			}
+		}
+	}
+
+	virtual btDeformableLagrangianForceType getForceType()
+	{
+		return BT_GRAVITY_FORCE;
+	}
+
+	// the gravitational potential energy: -sum(gravityFactor * mass * gravity.dot(q)) over the nodes with positive inverse mass of every active soft body; dt is unused
+	virtual double totalEnergy(btScalar dt)
+	{
+		double e = 0;
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())  // inactive soft bodies contribute no energy, mirroring addScaledGravityForce
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				const btSoftBody::Node& node = psb->m_nodes[j];
+				if (node.m_im > 0)  // nodes with zero inverse mass feel no gravity force, so they add no energy
+				{
+					e -= psb->m_gravityFactor * m_gravity.dot(node.m_q) / node.m_im;  // scaled by m_gravityFactor so the energy matches the force in addScaledGravityForce
+				}
+			}
+		}
+		return e;
+	}
 };
 #endif /* BT_DEFORMABLE_GRAVITY_FORCE_H */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableLagrangianForce.h b/thirdparty/bullet/BulletSoftBody/btDeformableLagrangianForce.h
index 0b6447442d..d58d825d1c 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableLagrangianForce.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableLagrangianForce.h
@@ -22,352 +22,351 @@
 
 enum btDeformableLagrangianForceType
 {
-    BT_GRAVITY_FORCE = 1,
-    BT_MASSSPRING_FORCE = 2,
-    BT_COROTATED_FORCE = 3,
-    BT_NEOHOOKEAN_FORCE = 4,
-    BT_LINEAR_ELASTICITY_FORCE = 5,
-    BT_MOUSE_PICKING_FORCE = 6
+	BT_GRAVITY_FORCE = 1,
+	BT_MASSSPRING_FORCE = 2,
+	BT_COROTATED_FORCE = 3,
+	BT_NEOHOOKEAN_FORCE = 4,
+	BT_LINEAR_ELASTICITY_FORCE = 5,
+	BT_MOUSE_PICKING_FORCE = 6
 };
 
 static inline double randomDouble(double low, double high)
 {
-    return low + static_cast<double>(rand()) / RAND_MAX * (high - low);
+	return low + static_cast<double>(rand()) / RAND_MAX * (high - low);
 }
 
 class btDeformableLagrangianForce
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btAlignedObjectArray<btSoftBody *> m_softBodies;
-    const btAlignedObjectArray<btSoftBody::Node*>* m_nodes;
-    
-    btDeformableLagrangianForce()
-    {
-    }
-    
-    virtual ~btDeformableLagrangianForce(){}
-    
-    // add all forces
-    virtual void addScaledForces(btScalar scale, TVStack& force) = 0;
-    
-    // add damping df
-    virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df) = 0;
-    
-    // build diagonal of A matrix
-    virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA) = 0;
-    
-    // add elastic df
-    virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df) = 0;
-    
-    // add all forces that are explicit in explicit solve
-    virtual void addScaledExplicitForce(btScalar scale, TVStack& force) = 0;
-    
-    // add all damping forces 
-    virtual void addScaledDampingForce(btScalar scale, TVStack& force) = 0;
-    
-    virtual btDeformableLagrangianForceType getForceType() = 0;
-    
-    virtual void reinitialize(bool nodeUpdated)
-    {
-    }
-    
-    // get number of nodes that have the force
-    virtual int getNumNodes()
-    {
-        int numNodes = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            numNodes += m_softBodies[i]->m_nodes.size();
-        }
-        return numNodes;
-    }
-    
-    // add a soft body to be affected by the particular lagrangian force
-    virtual void addSoftBody(btSoftBody* psb)
-    {
-        m_softBodies.push_back(psb);
-    }
-    
-    virtual void removeSoftBody(btSoftBody* psb)
-    {
-        m_softBodies.remove(psb);
-    }
-    
-    virtual void setIndices(const btAlignedObjectArray<btSoftBody::Node*>* nodes)
-    {
-        m_nodes = nodes;
-    }
-    
-     // Calculate the incremental deformable generated from the input dx
-    virtual btMatrix3x3 Ds(int id0, int id1, int id2, int id3, const TVStack& dx)
-    {
-        btVector3 c1 = dx[id1] - dx[id0];
-        btVector3 c2 = dx[id2] - dx[id0];
-        btVector3 c3 = dx[id3] - dx[id0];
-        return btMatrix3x3(c1,c2,c3).transpose();
-    }
-    
-    // Calculate the incremental deformable generated from the current velocity
-    virtual btMatrix3x3 DsFromVelocity(const btSoftBody::Node* n0, const btSoftBody::Node* n1, const btSoftBody::Node* n2, const btSoftBody::Node* n3)
-    {
-        btVector3 c1 = n1->m_v - n0->m_v;
-        btVector3 c2 = n2->m_v - n0->m_v;
-        btVector3 c3 = n3->m_v - n0->m_v;
-        return btMatrix3x3(c1,c2,c3).transpose();
-    }
-    
-    // test for addScaledElasticForce function
-    virtual void testDerivative()
-    {
-        for (int i = 0; i<m_softBodies.size();++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                psb->m_nodes[j].m_q += btVector3(randomDouble(-.1, .1), randomDouble(-.1, .1), randomDouble(-.1, .1));
-            }
-            psb->updateDeformation();
-        }
-        
-        TVStack dx;
-        dx.resize(getNumNodes());
-        TVStack dphi_dx;
-        dphi_dx.resize(dx.size());
-        for (int i =0; i < dphi_dx.size();++i)
-        {
-            dphi_dx[i].setZero();
-        }
-        addScaledForces(-1, dphi_dx);
-        
-        // write down the current position
-        TVStack x;
-        x.resize(dx.size());
-        int counter = 0;
-        for (int i = 0; i<m_softBodies.size();++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                x[counter] = psb->m_nodes[j].m_q;
-                counter++;
-            }
-        }
-        counter = 0;
-        
-        // populate dx with random vectors
-        for (int i = 0; i < dx.size(); ++i)
-        {
-            dx[i].setX(randomDouble(-1, 1));
-            dx[i].setY(randomDouble(-1, 1));
-            dx[i].setZ(randomDouble(-1, 1));
-        }
-        
-        btAlignedObjectArray<double> errors;
-        for (int it = 0; it < 10; ++it)
-        {
-            for (int i = 0; i < dx.size(); ++i)
-            {
-                dx[i] *= 0.5;
-            }
-            
-            // get dphi/dx * dx
-            double dphi = 0;
-            for (int i = 0; i < dx.size(); ++i)
-            {
-                dphi += dphi_dx[i].dot(dx[i]);
-            }
-            
-
-            for (int i = 0; i<m_softBodies.size();++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                {
-                    psb->m_nodes[j].m_q = x[counter] + dx[counter];
-                    counter++;
-                }
-                psb->updateDeformation();
-            }
-            counter = 0;
-            double f1 = totalElasticEnergy(0);
-            
-            for (int i = 0; i<m_softBodies.size();++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                {
-                    psb->m_nodes[j].m_q = x[counter] - dx[counter];
-                    counter++;
-                }
-                psb->updateDeformation();
-            }
-            counter = 0;
-            
-            double f2 = totalElasticEnergy(0);
-            
-            //restore m_q
-            for (int i = 0; i<m_softBodies.size();++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                {
-                    psb->m_nodes[j].m_q = x[counter];
-                    counter++;
-                }
-                psb->updateDeformation();
-            }
-            counter = 0;
-            double error = f1-f2-2*dphi;
-            errors.push_back(error);
-            std::cout << "Iteration = " << it <<", f1 = " << f1 << ", f2 = " << f2 << ", error = " << error << std::endl;
-        }
-        for (int i = 1; i < errors.size(); ++i)
-        {
-            std::cout << "Iteration = " << i << ", ratio = " << errors[i-1]/errors[i] << std::endl;
-        }
-    }
-    
-    // test for addScaledElasticForce function
-    virtual void testHessian()
-    {
-        for (int i = 0; i<m_softBodies.size();++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                psb->m_nodes[j].m_q += btVector3(randomDouble(-.1, .1), randomDouble(-.1, .1), randomDouble(-.1, .1));
-            }
-            psb->updateDeformation();
-        }
-        
-        
-        TVStack dx;
-        dx.resize(getNumNodes());
-        TVStack df;
-        df.resize(dx.size());
-        TVStack f1;
-        f1.resize(dx.size());
-        TVStack f2;
-        f2.resize(dx.size());
-        
-        
-        // write down the current position
-        TVStack x;
-        x.resize(dx.size());
-        int counter = 0;
-        for (int i = 0; i<m_softBodies.size();++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                x[counter] = psb->m_nodes[j].m_q;
-                counter++;
-            }
-        }
-        counter = 0;
-        
-        // populate dx with random vectors
-        for (int i = 0; i < dx.size(); ++i)
-        {
-            dx[i].setX(randomDouble(-1, 1));
-            dx[i].setY(randomDouble(-1, 1));
-            dx[i].setZ(randomDouble(-1, 1));
-        }
-        
-        btAlignedObjectArray<double> errors;
-        for (int it = 0; it < 10; ++it)
-        {
-            for (int i = 0; i < dx.size(); ++i)
-            {
-                dx[i] *= 0.5;
-            }
-            
-            // get df
-            for (int i =0; i < df.size();++i)
-            {
-                df[i].setZero();
-                f1[i].setZero();
-                f2[i].setZero();
-            }
-
-            //set df
-            addScaledElasticForceDifferential(-1, dx, df);
-            
-            for (int i = 0; i<m_softBodies.size();++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                {
-                    psb->m_nodes[j].m_q = x[counter] + dx[counter];
-                    counter++;
-                }
-                psb->updateDeformation();
-            }
-            counter = 0;
-            
-            //set f1
-            addScaledForces(-1, f1);
-            
-            for (int i = 0; i<m_softBodies.size();++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                {
-                    psb->m_nodes[j].m_q = x[counter] - dx[counter];
-                    counter++;
-                }
-                psb->updateDeformation();
-            }
-            counter = 0;
-            
-            //set f2
-            addScaledForces(-1, f2);
-            
-            //restore m_q
-            for (int i = 0; i<m_softBodies.size();++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                {
-                    psb->m_nodes[j].m_q = x[counter];
-                    counter++;
-                }
-                psb->updateDeformation();
-            }
-            counter = 0;
-            double error = 0;
-            for (int i = 0; i < df.size();++i)
-            {
-                btVector3 error_vector = f1[i]-f2[i]-2*df[i];
-                error += error_vector.length2();
-            }
-            error = btSqrt(error);
-            errors.push_back(error);
-            std::cout << "Iteration = " << it << ", error = " << error << std::endl;
-        }
-        for (int i = 1; i < errors.size(); ++i)
-        {
-            std::cout << "Iteration = " << i << ", ratio = " << errors[i-1]/errors[i] << std::endl;
-        }
-    }
-    
-    //
-    virtual double totalElasticEnergy(btScalar dt)
-    {
-        return 0;
-    }
-    
-    //
-    virtual double totalDampingEnergy(btScalar dt)
-    {
-        return 0;
-    }
-    
-    // total Energy takes dt as input because certain energies depend on dt
-    virtual double totalEnergy(btScalar dt)
-    {
-        return totalElasticEnergy(dt) + totalDampingEnergy(dt);
-    }
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btAlignedObjectArray<btSoftBody*> m_softBodies;
+	const btAlignedObjectArray<btSoftBody::Node*>* m_nodes;
+
+	btDeformableLagrangianForce()
+	{
+	}
+
+	virtual ~btDeformableLagrangianForce() {}
+
+	// add all forces
+	virtual void addScaledForces(btScalar scale, TVStack& force) = 0;
+
+	// add damping df
+	virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df) = 0;
+
+	// build diagonal of A matrix
+	virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA) = 0;
+
+	// add elastic df
+	virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df) = 0;
+
+	// add all forces that are explicit in explicit solve
+	virtual void addScaledExplicitForce(btScalar scale, TVStack& force) = 0;
+
+	// add all damping forces
+	virtual void addScaledDampingForce(btScalar scale, TVStack& force) = 0;
+
+	virtual void addScaledHessian(btScalar scale) {}
+
+	virtual btDeformableLagrangianForceType getForceType() = 0;
+
+	virtual void reinitialize(bool nodeUpdated)
+	{
+	}
+
+	// get number of nodes that have the force
+	virtual int getNumNodes()
+	{
+		int numNodes = 0;
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			numNodes += m_softBodies[i]->m_nodes.size();
+		}
+		return numNodes;
+	}
+
+	// add a soft body to be affected by the particular lagrangian force
+	virtual void addSoftBody(btSoftBody* psb)
+	{
+		m_softBodies.push_back(psb);
+	}
+
+	virtual void removeSoftBody(btSoftBody* psb)
+	{
+		m_softBodies.remove(psb);
+	}
+
+	virtual void setIndices(const btAlignedObjectArray<btSoftBody::Node*>* nodes)
+	{
+		m_nodes = nodes;
+	}
+
+	// Calculate the incremental deformation matrix generated from the input dx (built from dx[id1], dx[id2], dx[id3] relative to dx[id0])
+	virtual btMatrix3x3 Ds(int id0, int id1, int id2, int id3, const TVStack& dx)
+	{
+		btVector3 c1 = dx[id1] - dx[id0];
+		btVector3 c2 = dx[id2] - dx[id0];
+		btVector3 c3 = dx[id3] - dx[id0];
+		return btMatrix3x3(c1, c2, c3).transpose();
+	}
+
+	// Calculate the incremental deformation matrix generated from the current velocity (built from the m_v of n1, n2, n3 relative to n0)
+	virtual btMatrix3x3 DsFromVelocity(const btSoftBody::Node* n0, const btSoftBody::Node* n1, const btSoftBody::Node* n2, const btSoftBody::Node* n3)
+	{
+		btVector3 c1 = n1->m_v - n0->m_v;
+		btVector3 c2 = n2->m_v - n0->m_v;
+		btVector3 c3 = n3->m_v - n0->m_v;
+		return btMatrix3x3(c1, c2, c3).transpose();
+	}
+
+	// test for addScaledForces function: compares it against central differences of totalElasticEnergy and prints the errors
+	virtual void testDerivative()
+	{
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				psb->m_nodes[j].m_q += btVector3(randomDouble(-.1, .1), randomDouble(-.1, .1), randomDouble(-.1, .1));
+			}
+			psb->updateDeformation();
+		}
+
+		TVStack dx;
+		dx.resize(getNumNodes());
+		TVStack dphi_dx;
+		dphi_dx.resize(dx.size());
+		for (int i = 0; i < dphi_dx.size(); ++i)
+		{
+			dphi_dx[i].setZero();
+		}
+		addScaledForces(-1, dphi_dx);
+
+		// write down the current position
+		TVStack x;
+		x.resize(dx.size());
+		int counter = 0;
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				x[counter] = psb->m_nodes[j].m_q;
+				counter++;
+			}
+		}
+		counter = 0;
+
+		// populate dx with random vectors
+		for (int i = 0; i < dx.size(); ++i)
+		{
+			dx[i].setX(randomDouble(-1, 1));
+			dx[i].setY(randomDouble(-1, 1));
+			dx[i].setZ(randomDouble(-1, 1));
+		}
+
+		btAlignedObjectArray<double> errors;
+		for (int it = 0; it < 10; ++it)
+		{
+			for (int i = 0; i < dx.size(); ++i)
+			{
+				dx[i] *= 0.5;
+			}
+
+			// get dphi/dx * dx
+			double dphi = 0;
+			for (int i = 0; i < dx.size(); ++i)
+			{
+				dphi += dphi_dx[i].dot(dx[i]);
+			}
+
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					psb->m_nodes[j].m_q = x[counter] + dx[counter];
+					counter++;
+				}
+				psb->updateDeformation();
+			}
+			counter = 0;
+			double f1 = totalElasticEnergy(0);
+
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					psb->m_nodes[j].m_q = x[counter] - dx[counter];
+					counter++;
+				}
+				psb->updateDeformation();
+			}
+			counter = 0;
+
+			double f2 = totalElasticEnergy(0);
+
+			//restore m_q
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					psb->m_nodes[j].m_q = x[counter];
+					counter++;
+				}
+				psb->updateDeformation();
+			}
+			counter = 0;
+			double error = f1 - f2 - 2 * dphi;
+			errors.push_back(error);
+			std::cout << "Iteration = " << it << ", f1 = " << f1 << ", f2 = " << f2 << ", error = " << error << std::endl;
+		}
+		for (int i = 1; i < errors.size(); ++i)
+		{
+			std::cout << "Iteration = " << i << ", ratio = " << errors[i - 1] / errors[i] << std::endl;
+		}
+	}
+
+	// test for addScaledElasticForceDifferential function: compares it against central differences of addScaledForces and prints the errors
+	virtual void testHessian()
+	{
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				psb->m_nodes[j].m_q += btVector3(randomDouble(-.1, .1), randomDouble(-.1, .1), randomDouble(-.1, .1));
+			}
+			psb->updateDeformation();
+		}
+
+		TVStack dx;
+		dx.resize(getNumNodes());
+		TVStack df;
+		df.resize(dx.size());
+		TVStack f1;
+		f1.resize(dx.size());
+		TVStack f2;
+		f2.resize(dx.size());
+
+		// write down the current position
+		TVStack x;
+		x.resize(dx.size());
+		int counter = 0;
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				x[counter] = psb->m_nodes[j].m_q;
+				counter++;
+			}
+		}
+		counter = 0;
+
+		// populate dx with random vectors
+		for (int i = 0; i < dx.size(); ++i)
+		{
+			dx[i].setX(randomDouble(-1, 1));
+			dx[i].setY(randomDouble(-1, 1));
+			dx[i].setZ(randomDouble(-1, 1));
+		}
+
+		btAlignedObjectArray<double> errors;
+		for (int it = 0; it < 10; ++it)
+		{
+			for (int i = 0; i < dx.size(); ++i)
+			{
+				dx[i] *= 0.5;
+			}
+
+			// get df
+			for (int i = 0; i < df.size(); ++i)
+			{
+				df[i].setZero();
+				f1[i].setZero();
+				f2[i].setZero();
+			}
+
+			//set df
+			addScaledElasticForceDifferential(-1, dx, df);
+
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					psb->m_nodes[j].m_q = x[counter] + dx[counter];
+					counter++;
+				}
+				psb->updateDeformation();
+			}
+			counter = 0;
+
+			//set f1
+			addScaledForces(-1, f1);
+
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					psb->m_nodes[j].m_q = x[counter] - dx[counter];
+					counter++;
+				}
+				psb->updateDeformation();
+			}
+			counter = 0;
+
+			//set f2
+			addScaledForces(-1, f2);
+
+			//restore m_q
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+				{
+					psb->m_nodes[j].m_q = x[counter];
+					counter++;
+				}
+				psb->updateDeformation();
+			}
+			counter = 0;
+			double error = 0;
+			for (int i = 0; i < df.size(); ++i)
+			{
+				btVector3 error_vector = f1[i] - f2[i] - 2 * df[i];
+				error += error_vector.length2();
+			}
+			error = btSqrt(error);
+			errors.push_back(error);
+			std::cout << "Iteration = " << it << ", error = " << error << std::endl;
+		}
+		for (int i = 1; i < errors.size(); ++i)
+		{
+			std::cout << "Iteration = " << i << ", ratio = " << errors[i - 1] / errors[i] << std::endl;
+		}
+	}
+
+	// total elastic energy of this force; the base implementation returns 0
+	virtual double totalElasticEnergy(btScalar dt)
+	{
+		return 0;
+	}
+
+	// total damping energy of this force; the base implementation returns 0
+	virtual double totalDampingEnergy(btScalar dt)
+	{
+		return 0;
+	}
+
+	// total Energy takes dt as input because certain energies depend on dt
+	virtual double totalEnergy(btScalar dt)
+	{
+		return totalElasticEnergy(dt) + totalDampingEnergy(dt);
+	}
 };
 #endif /* BT_DEFORMABLE_LAGRANGIAN_FORCE */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableLinearElasticityForce.h b/thirdparty/bullet/BulletSoftBody/btDeformableLinearElasticityForce.h
index 106dc10ad6..971192050b 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableLinearElasticityForce.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableLinearElasticityForce.h
@@ -18,323 +18,445 @@
 
 #include "btDeformableLagrangianForce.h"
 #include "LinearMath/btQuickprof.h"
+#include "btSoftBodyInternals.h"
+#define TETRA_FLAT_THRESHOLD 0.01
 class btDeformableLinearElasticityForce : public btDeformableLagrangianForce
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btScalar m_mu, m_lambda;
-    btScalar m_mu_damp, m_lambda_damp;
-    btDeformableLinearElasticityForce(): m_mu(1), m_lambda(1)
-    {
-        btScalar damping = 0.05;
-        m_mu_damp = damping * m_mu;
-        m_lambda_damp = damping * m_lambda;
-    }
-    
-    btDeformableLinearElasticityForce(btScalar mu, btScalar lambda, btScalar damping = 0.05): m_mu(mu), m_lambda(lambda)
-    {
-        m_mu_damp = damping * m_mu;
-        m_lambda_damp = damping * m_lambda;
-    }
-    
-    virtual void addScaledForces(btScalar scale, TVStack& force)
-    {
-        addScaledDampingForce(scale, force);
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
-    {
-        addScaledElasticForce(scale, force);
-    }
-    
-    // The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
-    virtual void addScaledDampingForce(btScalar scale, TVStack& force)
-    {
-        if (m_mu_damp == 0 && m_lambda_damp == 0)
-            return;
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                btMatrix3x3 dF = DsFromVelocity(node0, node1, node2, node3) * tetra.m_Dm_inverse;
-                btMatrix3x3 I;
-                I.setIdentity();
-                btMatrix3x3 dP = (dF + dF.transpose()) * m_mu_damp + I * (dF[0][0]+dF[1][1]+dF[2][2]) * m_lambda_damp;
-                //                firstPiolaDampingDifferential(psb->m_tetraScratchesTn[j], dF, dP);
-                btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
-                
-                // damping force differential
-                btScalar scale1 = scale * tetra.m_element_measure;
-                force[id0] -= scale1 * df_on_node0;
-                force[id1] -= scale1 * df_on_node123.getColumn(0);
-                force[id2] -= scale1 * df_on_node123.getColumn(1);
-                force[id3] -= scale1 * df_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    virtual double totalElasticEnergy(btScalar dt)
-    {
-        double energy = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetraScratches.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::TetraScratch& s = psb->m_tetraScratches[j];
-                energy += tetra.m_element_measure * elasticEnergyDensity(s);
-            }
-        }
-        return energy;
-    }
-    
-    // The damping energy is formulated as in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
-    virtual double totalDampingEnergy(btScalar dt)
-    {
-        double energy = 0;
-        int sz = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                sz = btMax(sz, psb->m_nodes[j].index);
-            }
-        }
-        TVStack dampingForce;
-        dampingForce.resize(sz+1);
-        for (int i = 0; i < dampingForce.size(); ++i)
-            dampingForce[i].setZero();
-        addScaledDampingForce(0.5, dampingForce);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                const btSoftBody::Node& node = psb->m_nodes[j];
-                energy -= dampingForce[node.index].dot(node.m_v) / dt;
-            }
-        }
-        return energy;
-    }
-    
-    double elasticEnergyDensity(const btSoftBody::TetraScratch& s)
-    {
-        double density = 0;
-        btMatrix3x3 epsilon = (s.m_F + s.m_F.transpose()) * 0.5 - btMatrix3x3::getIdentity();
-        btScalar trace = epsilon[0][0] + epsilon[1][1] + epsilon[2][2];
-        density += m_mu * (epsilon[0].length2() + epsilon[1].length2() + epsilon[2].length2());
-        density += m_lambda * trace * trace * 0.5;
-        return density;
-    }
-    
-    virtual void addScaledElasticForce(btScalar scale, TVStack& force)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            btScalar max_p = psb->m_cfg.m_maxStress;
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btMatrix3x3 P;
-                firstPiola(psb->m_tetraScratches[j],P);
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btScalar m_mu, m_lambda;  // Lame parameters
+	btScalar m_E, m_nu;  // Young's modulus and Poisson ratio
+	btScalar m_damping_alpha, m_damping_beta;  // alpha scales the per-node v / m_im damping term; beta scales m_mu and m_lambda for the per-tetra damping term
+	btDeformableLinearElasticityForce() : m_mu(1), m_lambda(1), m_damping_alpha(0.01), m_damping_beta(0.01)
+	{
+		updateYoungsModulusAndPoissonRatio();  // derive m_E and m_nu from the Lame parameters initialised above
+	}
+
+	btDeformableLinearElasticityForce(btScalar mu, btScalar lambda, btScalar damping_alpha = 0.01, btScalar damping_beta = 0.01) : m_mu(mu), m_lambda(lambda), m_damping_alpha(damping_alpha), m_damping_beta(damping_beta)
+	{
+		updateYoungsModulusAndPoissonRatio();  // derive m_E and m_nu from the supplied mu and lambda
+	}
+
+	void updateYoungsModulusAndPoissonRatio()
+	{
+		// conversion from Lame Parameters to Young's modulus and Poisson ratio
+		// https://en.wikipedia.org/wiki/Lam%C3%A9_parameters
+		m_E = m_mu * (3 * m_lambda + 2 * m_mu) / (m_lambda + m_mu);  // E = mu (3 lambda + 2 mu) / (lambda + mu); no guard against lambda + mu == 0
+		m_nu = m_lambda * 0.5 / (m_mu + m_lambda);  // nu = lambda / (2 (lambda + mu)); same unguarded denominator
+	}
+
+	void updateLameParameters()
+	{
+		// conversion from Young's modulus and Poisson ratio to Lame Parameters
+		// https://en.wikipedia.org/wiki/Lam%C3%A9_parameters
+		m_mu = m_E * 0.5 / (1 + m_nu);  // mu = E / (2 (1 + nu)); no guard against nu == -1
+		m_lambda = m_E * m_nu / ((1 + m_nu) * (1 - 2 * m_nu));  // lambda = E nu / ((1 + nu)(1 - 2 nu)); denominator is zero at nu == 0.5 or nu == -1
+	}
+
+	void setYoungsModulus(btScalar E)
+	{
+		m_E = E;
+		updateLameParameters();  // recompute m_mu and m_lambda from the new m_E and the current m_nu
+	}
+
+	void setPoissonRatio(btScalar nu)
+	{
+		m_nu = nu;
+		updateLameParameters();  // recompute m_mu and m_lambda from the current m_E and the new m_nu
+	}
+
+	void setDamping(btScalar damping_alpha, btScalar damping_beta)
+	{
+		m_damping_alpha = damping_alpha;  // factor on the per-node v / m_im damping term
+		m_damping_beta = damping_beta;  // factor applied to m_mu and m_lambda for the per-tetra damping term
+	}
+
+	void setLameParameters(btScalar mu, btScalar lambda)
+	{
+		m_mu = mu;
+		m_lambda = lambda;
+		updateYoungsModulusAndPoissonRatio();  // keep m_E and m_nu in sync with the new Lame parameters
+	}
+
+	virtual void addScaledForces(btScalar scale, TVStack& force)
+	{
+		addScaledDampingForce(scale, force);  // damping contribution first ...
+		addScaledElasticForce(scale, force);  // ... then the elastic contribution, both accumulated into force
+	}
+
+	virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
+	{
+		addScaledElasticForce(scale, force);  // only the elastic part is added here; damping is left out
+	}
+
+	// The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
+	virtual void addScaledDampingForce(btScalar scale, TVStack& force)
+	{
+		if (m_damping_alpha == 0 && m_damping_beta == 0)
+			return;  // no damping configured, nothing to accumulate
+		btScalar mu_damp = m_damping_beta * m_mu;
+		btScalar lambda_damp = m_damping_beta * m_lambda;
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				bool close_to_flat = (psb->m_tetraScratches[j].m_J < TETRA_FLAT_THRESHOLD);  // m_J below the threshold: the corotation steps below are skipped
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+				btMatrix3x3 dF = DsFromVelocity(node0, node1, node2, node3) * tetra.m_Dm_inverse;
+				if (!close_to_flat)
+				{
+					dF = psb->m_tetraScratches[j].m_corotation.transpose() * dF;  // left-multiply by the transposed corotation
+				}
+				btMatrix3x3 I;
+				I.setIdentity();
+				btMatrix3x3 dP = (dF + dF.transpose()) * mu_damp + I * ((dF[0][0] + dF[1][1] + dF[2][2]) * lambda_damp);
+				btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
+				if (!close_to_flat)
+				{
+					df_on_node123 = psb->m_tetraScratches[j].m_corotation * df_on_node123;  // left-multiply by the corotation again
+				}
+				btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;  // equals the negated sum of the three columns
+				// damping force, weighted by the element measure
+				btScalar scale1 = scale * tetra.m_element_measure;
+				force[id0] -= scale1 * df_on_node0;
+				force[id1] -= scale1 * df_on_node123.getColumn(0);
+				force[id2] -= scale1 * df_on_node123.getColumn(1);
+				force[id3] -= scale1 * df_on_node123.getColumn(2);
+			}
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				const btSoftBody::Node& node = psb->m_nodes[j];
+				size_t id = node.index;
+				if (node.m_im > 0)  // nodes with non-positive inverse mass get no alpha term
+				{
+					force[id] -= scale * node.m_v / node.m_im * m_damping_alpha;  // alpha-scaled term proportional to v / m_im
+				}
+			}
+		}
+	}
+
+	virtual double totalElasticEnergy(btScalar dt)  // dt is not used by this implementation
+	{
+		double energy = 0;
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetraScratches.size(); ++j)  // assumes m_tetras and m_tetraScratches are index-aligned — TODO confirm
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btSoftBody::TetraScratch& s = psb->m_tetraScratches[j];
+				energy += tetra.m_element_measure * elasticEnergyDensity(s);  // density weighted by the element measure
+			}
+		}
+		return energy;
+	}
+
+	// The damping energy is formulated as in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
+	virtual double totalDampingEnergy(btScalar dt)
+	{
+		double energy = 0;
+		int sz = 0;  // largest node index found among the active soft bodies
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				sz = btMax(sz, psb->m_nodes[j].index);
+			}
+		}
+		TVStack dampingForce;
+		dampingForce.resize(sz + 1);
+		for (int i = 0; i < dampingForce.size(); ++i)
+			dampingForce[i].setZero();
+		addScaledDampingForce(0.5, dampingForce);  // fills entries of active bodies only
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			// dampingForce is sized from active bodies only; an inactive body's node indices may lie past its end.
+			if (!psb->isActive())
+				continue;
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+				energy -= dampingForce[psb->m_nodes[j].index].dot(psb->m_nodes[j].m_v) / dt;
+		}
+		return energy;
+	}
+
+	double elasticEnergyDensity(const btSoftBody::TetraScratch& s)
+	{
+		double density = 0;
+		btMatrix3x3 epsilon = (s.m_F + s.m_F.transpose()) * 0.5 - btMatrix3x3::getIdentity();  // NOTE(review): uses s.m_F directly whereas firstPiola() corotates F first — confirm this is intended
+		btScalar trace = epsilon[0][0] + epsilon[1][1] + epsilon[2][2];
+		density += m_mu * (epsilon[0].length2() + epsilon[1].length2() + epsilon[2].length2());  // mu times the sum of squared entries of epsilon
+		density += m_lambda * trace * trace * 0.5;  // plus lambda / 2 times the squared trace
+		return density;
+	}
+
+	virtual void addScaledElasticForce(btScalar scale, TVStack& force)
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			btScalar max_p = psb->m_cfg.m_maxStress;
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btMatrix3x3 P;
+				firstPiola(psb->m_tetraScratches[j], P);
 #if USE_SVD
-                if (max_p > 0)
-                {
-                    // since we want to clamp the principal stress to max_p, we only need to
-                    // calculate SVD when sigma_0^2 + sigma_1^2 + sigma_2^2 > max_p * max_p
-                    btScalar trPTP = (P[0].length2() + P[1].length2() + P[2].length2());
-                    if (trPTP > max_p * max_p)
-                    {
-                        btMatrix3x3 U, V;
-                        btVector3 sigma;
-                        singularValueDecomposition(P, U, sigma, V);
-                        sigma[0] = btMin(sigma[0], max_p);
-                        sigma[1] = btMin(sigma[1], max_p);
-                        sigma[2] = btMin(sigma[2], max_p);
-                        sigma[0] = btMax(sigma[0], -max_p);
-                        sigma[1] = btMax(sigma[1], -max_p);
-                        sigma[2] = btMax(sigma[2], -max_p);
-                        btMatrix3x3 Sigma;
-                        Sigma.setIdentity();
-                        Sigma[0][0] = sigma[0];
-                        Sigma[1][1] = sigma[1];
-                        Sigma[2][2] = sigma[2];
-                        P = U * Sigma * V.transpose();
-                    }
-                }
+				if (max_p > 0)
+				{
+					// since we want to clamp the principal stress to max_p, we only need to
+					// calculate SVD when sigma_0^2 + sigma_1^2 + sigma_2^2 > max_p * max_p
+					btScalar trPTP = (P[0].length2() + P[1].length2() + P[2].length2());
+					if (trPTP > max_p * max_p)
+					{
+						btMatrix3x3 U, V;
+						btVector3 sigma;
+						singularValueDecomposition(P, U, sigma, V);
+						sigma[0] = btMin(sigma[0], max_p);
+						sigma[1] = btMin(sigma[1], max_p);
+						sigma[2] = btMin(sigma[2], max_p);
+						sigma[0] = btMax(sigma[0], -max_p);
+						sigma[1] = btMax(sigma[1], -max_p);
+						sigma[2] = btMax(sigma[2], -max_p);
+						btMatrix3x3 Sigma;
+						Sigma.setIdentity();
+						Sigma[0][0] = sigma[0];
+						Sigma[1][1] = sigma[1];
+						Sigma[2][2] = sigma[2];
+						P = U * Sigma * V.transpose();
+					}
+				}
 #endif
-                //                btVector3 force_on_node0 = P * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 force_on_node123 = P * tetra.m_Dm_inverse.transpose();
-                btVector3 force_on_node0 = force_on_node123 * grad_N_hat_1st_col;
-                
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                
-                // elastic force
-                btScalar scale1 = scale * tetra.m_element_measure;
-                force[id0] -= scale1 * force_on_node0;
-                force[id1] -= scale1 * force_on_node123.getColumn(0);
-                force[id2] -= scale1 * force_on_node123.getColumn(1);
-                force[id3] -= scale1 * force_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    // The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
-    virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
-    {
-        if (m_mu_damp == 0 && m_lambda_damp == 0)
-            return;
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= df.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                btMatrix3x3 dF = Ds(id0, id1, id2, id3, dv) * tetra.m_Dm_inverse;
-                btMatrix3x3 I;
-                I.setIdentity();
-                btMatrix3x3 dP = (dF + dF.transpose()) * m_mu_damp + I * (dF[0][0]+dF[1][1]+dF[2][2]) * m_lambda_damp;
-                //                firstPiolaDampingDifferential(psb->m_tetraScratchesTn[j], dF, dP);
-                //                btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
-                btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;
-                
-                // damping force differential
-                btScalar scale1 = scale * tetra.m_element_measure;
-                df[id0] -= scale1 * df_on_node0;
-                df[id1] -= scale1 * df_on_node123.getColumn(0);
-                df[id2] -= scale1 * df_on_node123.getColumn(1);
-                df[id3] -= scale1 * df_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= df.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                btMatrix3x3 dF = Ds(id0, id1, id2, id3, dx) * tetra.m_Dm_inverse;
-                btMatrix3x3 dP;
-                firstPiolaDifferential(psb->m_tetraScratches[j], dF, dP);
-                //                btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
-                btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;
-                
-                // elastic force differential
-                btScalar scale1 = scale * tetra.m_element_measure;
-                df[id0] -= scale1 * df_on_node0;
-                df[id1] -= scale1 * df_on_node123.getColumn(0);
-                df[id2] -= scale1 * df_on_node123.getColumn(1);
-                df[id3] -= scale1 * df_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    void firstPiola(const btSoftBody::TetraScratch& s, btMatrix3x3& P)
-    {
-        btMatrix3x3 epsilon = (s.m_F + s.m_F.transpose()) * 0.5 - btMatrix3x3::getIdentity();
-        btScalar trace = epsilon[0][0] + epsilon[1][1] + epsilon[2][2];
-        P = epsilon * btScalar(2) * m_mu + btMatrix3x3::getIdentity() * m_lambda * trace;
-    }
-    
-    // Let P be the first piola stress.
-    // This function calculates the dP = dP/dF * dF
-    void firstPiolaDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF,  btMatrix3x3& dP)
-    {
-        btScalar trace = (dF[0][0] + dF[1][1] + dF[2][2]);
-        dP = (dF + dF.transpose()) * m_mu +  btMatrix3x3::getIdentity()  * m_lambda * trace;
-    }
-    
-    // Let Q be the damping stress.
-    // This function calculates the dP = dQ/dF * dF
-    void firstPiolaDampingDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF,  btMatrix3x3& dP)
-    {
-        btScalar trace = (dF[0][0] + dF[1][1] + dF[2][2]);
-        dP = (dF + dF.transpose()) * m_mu_damp +  btMatrix3x3::getIdentity()  * m_lambda_damp * trace;
-    }
-    
-    virtual btDeformableLagrangianForceType getForceType()
-    {
-        return BT_LINEAR_ELASTICITY_FORCE;
-    }
-    
+				//                btVector3 force_on_node0 = P * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
+				btMatrix3x3 force_on_node123 = psb->m_tetraScratches[j].m_corotation * P * tetra.m_Dm_inverse.transpose();
+				btVector3 force_on_node0 = force_on_node123 * grad_N_hat_1st_col;
+
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+
+				// elastic force
+				btScalar scale1 = scale * tetra.m_element_measure;
+				force[id0] -= scale1 * force_on_node0;
+				force[id1] -= scale1 * force_on_node123.getColumn(0);
+				force[id2] -= scale1 * force_on_node123.getColumn(1);
+				force[id3] -= scale1 * force_on_node123.getColumn(2);
+			}
+		}
+	}
+
+	virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA) {}  // no-op: leaves diagA untouched
+
+	// The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
+	virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
+	{
+		if (m_damping_alpha == 0 && m_damping_beta == 0)
+			return;  // no damping configured, df is left unchanged
+		btScalar mu_damp = m_damping_beta * m_mu;
+		btScalar lambda_damp = m_damping_beta * m_lambda;
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= df.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				bool close_to_flat = (psb->m_tetraScratches[j].m_J < TETRA_FLAT_THRESHOLD);  // m_J below the threshold: the corotation steps below are skipped
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+				btMatrix3x3 dF = Ds(id0, id1, id2, id3, dv) * tetra.m_Dm_inverse;  // built from the dv entries of the four nodes
+				if (!close_to_flat)
+				{
+					dF = psb->m_tetraScratches[j].m_corotation.transpose() * dF;  // left-multiply by the transposed corotation
+				}
+				btMatrix3x3 I;
+				I.setIdentity();
+				btMatrix3x3 dP = (dF + dF.transpose()) * mu_damp + I * ((dF[0][0] + dF[1][1] + dF[2][2]) * lambda_damp);
+				btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
+				if (!close_to_flat)
+				{
+					df_on_node123 = psb->m_tetraScratches[j].m_corotation * df_on_node123;  // left-multiply by the corotation again
+				}
+				btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;  // equals the negated sum of the three columns
+
+				// damping force differential
+				btScalar scale1 = scale * tetra.m_element_measure;
+				df[id0] -= scale1 * df_on_node0;
+				df[id1] -= scale1 * df_on_node123.getColumn(0);
+				df[id2] -= scale1 * df_on_node123.getColumn(1);
+				df[id3] -= scale1 * df_on_node123.getColumn(2);
+			}
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				const btSoftBody::Node& node = psb->m_nodes[j];
+				size_t id = node.index;
+				if (node.m_im > 0)  // nodes with non-positive inverse mass get no alpha term
+				{
+					df[id] -= scale * dv[id] / node.m_im * m_damping_alpha;  // alpha-scaled term proportional to dv / m_im
+				}
+			}
+		}
+	}
+
+	virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= df.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+				btMatrix3x3 dF = psb->m_tetraScratches[j].m_corotation.transpose() * Ds(id0, id1, id2, id3, dx) * tetra.m_Dm_inverse;  // corotation is applied unconditionally here (no TETRA_FLAT_THRESHOLD check)
+				btMatrix3x3 dP;
+				firstPiolaDifferential(psb->m_tetraScratches[j], dF, dP);
+				//                btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
+				btMatrix3x3 df_on_node123 = psb->m_tetraScratches[j].m_corotation * dP * tetra.m_Dm_inverse.transpose();
+				btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;  // equals the negated sum of the three columns
+
+				// elastic force differential
+				btScalar scale1 = scale * tetra.m_element_measure;
+				df[id0] -= scale1 * df_on_node0;
+				df[id1] -= scale1 * df_on_node123.getColumn(0);
+				df[id2] -= scale1 * df_on_node123.getColumn(1);
+				df[id3] -= scale1 * df_on_node123.getColumn(2);
+			}
+		}
+	}
+
+	void firstPiola(const btSoftBody::TetraScratch& s, btMatrix3x3& P)  // writes the stress for scratch s into P
+	{
+		btMatrix3x3 corotated_F = s.m_corotation.transpose() * s.m_F;
+
+		btMatrix3x3 epsilon = (corotated_F + corotated_F.transpose()) * 0.5 - btMatrix3x3::getIdentity();  // symmetric part of corotated_F minus identity
+		btScalar trace = epsilon[0][0] + epsilon[1][1] + epsilon[2][2];
+		P = epsilon * btScalar(2) * m_mu + btMatrix3x3::getIdentity() * m_lambda * trace;  // P = 2 mu epsilon + lambda tr(epsilon) I
+	}
+
+	// Let P be the first piola stress.
+	// This function calculates the dP = dP/dF * dF
+	void firstPiolaDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF, btMatrix3x3& dP)  // s is not read by this implementation
+	{
+		btScalar trace = (dF[0][0] + dF[1][1] + dF[2][2]);
+		dP = (dF + dF.transpose()) * m_mu + btMatrix3x3::getIdentity() * m_lambda * trace;  // dP = mu (dF + dF^T) + lambda tr(dF) I
+	}
+
+	// Let Q be the damping stress.
+	// This function calculates the dP = dQ/dF * dF
+	void firstPiolaDampingDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF, btMatrix3x3& dP)  // s is not read by this implementation
+	{
+		btScalar mu_damp = m_damping_beta * m_mu;
+		btScalar lambda_damp = m_damping_beta * m_lambda;
+		btScalar trace = (dF[0][0] + dF[1][1] + dF[2][2]);
+		dP = (dF + dF.transpose()) * mu_damp + btMatrix3x3::getIdentity() * lambda_damp * trace;  // same form as firstPiolaDifferential with beta-scaled coefficients
+	}
+
+	virtual void addScaledHessian(btScalar scale)  // accumulates per-node 3x3 terms into Node::m_effectiveMass
+	{
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btMatrix3x3 P;
+				firstPiola(psb->m_tetraScratches[j], P);  // make sure scratch is evaluated at x_n + dt * vn
+				btMatrix3x3 force_on_node123 = psb->m_tetraScratches[j].m_corotation * P * tetra.m_Dm_inverse.transpose();
+				btVector3 force_on_node0 = force_on_node123 * grad_N_hat_1st_col;  // equals the negated sum of the three columns
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				btScalar scale1 = scale * (scale + m_damping_beta) * tetra.m_element_measure;  // stiff and stiffness-damping terms;
+				node0->m_effectiveMass += OuterProduct(force_on_node0, force_on_node0) * scale1;  // each node receives the outer product of its own force vector
+				node1->m_effectiveMass += OuterProduct(force_on_node123.getColumn(0), force_on_node123.getColumn(0)) * scale1;
+				node2->m_effectiveMass += OuterProduct(force_on_node123.getColumn(1), force_on_node123.getColumn(1)) * scale1;
+				node3->m_effectiveMass += OuterProduct(force_on_node123.getColumn(2), force_on_node123.getColumn(2)) * scale1;
+			}
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				btSoftBody::Node& node = psb->m_nodes[j];
+				if (node.m_im > 0)  // nodes with non-positive inverse mass get no alpha term
+				{
+					btMatrix3x3 I;
+					I.setIdentity();
+					node.m_effectiveMass += I * (scale * (1.0 / node.m_im) * m_damping_alpha);  // alpha-scaled diagonal term proportional to 1 / m_im
+				}
+			}
+		}
+	}
+
+	virtual btDeformableLagrangianForceType getForceType()
+	{
+		return BT_LINEAR_ELASTICITY_FORCE;  // type tag identifying this force implementation
+	}
 };
 #endif /* BT_LINEAR_ELASTICITY_H */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableMassSpringForce.h b/thirdparty/bullet/BulletSoftBody/btDeformableMassSpringForce.h
index b128df92cc..8c97bd1ba8 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableMassSpringForce.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableMassSpringForce.h
@@ -20,282 +20,282 @@
 
 class btDeformableMassSpringForce : public btDeformableLagrangianForce
 {
-    // If true, the damping force will be in the direction of the spring
-    // If false, the damping force will be in the direction of the velocity
-    bool m_momentum_conserving;
-    btScalar m_elasticStiffness, m_dampingStiffness, m_bendingStiffness;
+	// If true, the damping force will be in the direction of the spring
+	// If false, the damping force will be in the direction of the velocity
+	bool m_momentum_conserving;
+	btScalar m_elasticStiffness, m_dampingStiffness, m_bendingStiffness;
+
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btDeformableMassSpringForce() : m_momentum_conserving(false), m_elasticStiffness(1), m_dampingStiffness(0.05)
-    {
-    }
-    btDeformableMassSpringForce(btScalar k, btScalar d, bool conserve_angular = true, double bending_k = -1) : m_momentum_conserving(conserve_angular), m_elasticStiffness(k), m_dampingStiffness(d), m_bendingStiffness(bending_k)
-    {
-        if (m_bendingStiffness < btScalar(0))
-        {
-            m_bendingStiffness = m_elasticStiffness;
-        }
-    }
-    
-    virtual void addScaledForces(btScalar scale, TVStack& force)
-    {
-        addScaledDampingForce(scale, force);
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
-    {
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledDampingForce(btScalar scale, TVStack& force)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            const btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_links.size(); ++j)
-            {
-                const btSoftBody::Link& link = psb->m_links[j];
-                btSoftBody::Node* node1 = link.m_n[0];
-                btSoftBody::Node* node2 = link.m_n[1];
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                
-                // damping force
-                btVector3 v_diff = (node2->m_v - node1->m_v);
-                btVector3 scaled_force = scale * m_dampingStiffness * v_diff;
-                if (m_momentum_conserving)
-                {
-                    if ((node2->m_x - node1->m_x).norm() > SIMD_EPSILON)
-                    {
-                        btVector3 dir = (node2->m_x - node1->m_x).normalized();
-                        scaled_force = scale * m_dampingStiffness * v_diff.dot(dir) * dir;
-                    }
-                }
-                force[id1] += scaled_force;
-                force[id2] -= scaled_force;
-            }
-        }
-    }
-    
-    virtual void addScaledElasticForce(btScalar scale, TVStack& force)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            const btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_links.size(); ++j)
-            {
-                const btSoftBody::Link& link = psb->m_links[j];
-                btSoftBody::Node* node1 = link.m_n[0];
-                btSoftBody::Node* node2 = link.m_n[1];
-                btScalar r = link.m_rl;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                
-                // elastic force
-                btVector3 dir = (node2->m_q - node1->m_q);
-                btVector3 dir_normalized = (dir.norm() > SIMD_EPSILON) ? dir.normalized() : btVector3(0,0,0);
-                btScalar scaled_stiffness = scale * (link.m_bbending ? m_bendingStiffness : m_elasticStiffness);
-                btVector3 scaled_force = scaled_stiffness * (dir - dir_normalized * r);
-                force[id1] += scaled_force;
-                force[id2] -= scaled_force;
-            }
-        }
-    }
-    
-    virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
-    {
-        // implicit damping force differential
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            btScalar scaled_k_damp = m_dampingStiffness * scale;
-            for (int j = 0; j < psb->m_links.size(); ++j)
-            {
-                const btSoftBody::Link& link = psb->m_links[j];
-                btSoftBody::Node* node1 = link.m_n[0];
-                btSoftBody::Node* node2 = link.m_n[1];
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btDeformableMassSpringForce() : m_momentum_conserving(false), m_elasticStiffness(1), m_dampingStiffness(0.05), m_bendingStiffness(1)
+	{  // defaults: velocity-direction damping, elastic stiffness 1, damping 0.05; bending stiffness equals the elastic stiffness so bending links never read an uninitialized member
+	}
+	btDeformableMassSpringForce(btScalar k, btScalar d, bool conserve_angular = true, double bending_k = -1) : m_momentum_conserving(conserve_angular), m_elasticStiffness(k), m_dampingStiffness(d), m_bendingStiffness(bending_k)
+	{  // k: elastic stiffness, d: damping stiffness, conserve_angular: damp along the spring direction, bending_k: stiffness used for bending links
+		if (m_bendingStiffness < btScalar(0))  // a negative bending_k (the default) means "reuse the elastic stiffness"
+		{
+			m_bendingStiffness = m_elasticStiffness;
+		}
+	}
+
+	virtual void addScaledForces(btScalar scale, TVStack& force)  // accumulates the scaled damping and elastic forces into force
+	{
+		addScaledDampingForce(scale, force);
+		addScaledElasticForce(scale, force);
+	}
+
+	virtual void addScaledExplicitForce(btScalar scale, TVStack& force)  // explicit part: only the elastic force is accumulated
+	{
+		addScaledElasticForce(scale, force);
+	}
+
+	virtual void addScaledDampingForce(btScalar scale, TVStack& force)  // adds the scaled per-link damping force to force, indexed by node->index
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			const btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())  // inactive bodies contribute nothing
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_links.size(); ++j)
+			{
+				const btSoftBody::Link& link = psb->m_links[j];
+				btSoftBody::Node* node1 = link.m_n[0];
+				btSoftBody::Node* node2 = link.m_n[1];
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+
+				// damping force: by default proportional to the relative velocity of the two end nodes
+				btVector3 v_diff = (node2->m_v - node1->m_v);
+				btVector3 scaled_force = scale * m_dampingStiffness * v_diff;
+				if (m_momentum_conserving)  // keep only the component along the spring direction
+				{
+					if ((node2->m_x - node1->m_x).norm() > SIMD_EPSILON)  // near zero-length links keep the unprojected force
+					{
+						btVector3 dir = (node2->m_x - node1->m_x).normalized();
+						scaled_force = scale * m_dampingStiffness * v_diff.dot(dir) * dir;
+					}
+				}
+				force[id1] += scaled_force;  // equal and opposite on the two end nodes
+				force[id2] -= scaled_force;
+			}
+		}
+	}
+
+	virtual void addScaledElasticForce(btScalar scale, TVStack& force)  // adds the scaled spring force of every link of every active body to force
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			const btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_links.size(); ++j)
+			{
+				const btSoftBody::Link& link = psb->m_links[j];
+				btSoftBody::Node* node1 = link.m_n[0];
+				btSoftBody::Node* node2 = link.m_n[1];
+				btScalar r = link.m_rl;  // presumably the link's rest length -- confirm against btSoftBody::Link
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+
+				// elastic force: stiffness times (current link vector minus a vector of length r in the same direction)
+				btVector3 dir = (node2->m_q - node1->m_q);
+				btVector3 dir_normalized = (dir.norm() > SIMD_EPSILON) ? dir.normalized() : btVector3(0, 0, 0);  // zero direction for near zero-length links
+				btScalar scaled_stiffness = scale * (link.m_bbending ? m_bendingStiffness : m_elasticStiffness);  // bending links use their own stiffness
+				btVector3 scaled_force = scaled_stiffness * (dir - dir_normalized * r);
+				force[id1] += scaled_force;  // equal and opposite on the two end nodes
+				force[id2] -= scaled_force;
+			}
+		}
+	}
+
+	virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)  // adds to df the scaled damping force produced by the velocity change dv
+	{
+		// implicit damping force differential
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			btScalar scaled_k_damp = m_dampingStiffness * scale;
+			for (int j = 0; j < psb->m_links.size(); ++j)
+			{
+				const btSoftBody::Link& link = psb->m_links[j];
+				btSoftBody::Node* node1 = link.m_n[0];
+				btSoftBody::Node* node2 = link.m_n[1];
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+
+				btVector3 local_scaled_df = scaled_k_damp * (dv[id2] - dv[id1]);  // same form as addScaledDampingForce, with dv in place of the node velocities
+				if (m_momentum_conserving)
+				{
+					if ((node2->m_x - node1->m_x).norm() > SIMD_EPSILON)  // near zero-length links keep the unprojected differential
+					{
+						btVector3 dir = (node2->m_x - node1->m_x).normalized();
+						local_scaled_df = scaled_k_damp * (dv[id2] - dv[id1]).dot(dir) * dir;
+					}
+				}
+				df[id1] += local_scaled_df;
+				df[id2] -= local_scaled_df;
+			}
+		}
+	}
+
+	virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA)  // subtracts each link's scaled damping contribution from the per-node diagonal entries in diagA
+	{
+		// implicit damping force differential
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			btScalar scaled_k_damp = m_dampingStiffness * scale;
+			for (int j = 0; j < psb->m_links.size(); ++j)
+			{
+				const btSoftBody::Link& link = psb->m_links[j];
+				btSoftBody::Node* node1 = link.m_n[0];
+				btSoftBody::Node* node2 = link.m_n[1];
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				if (m_momentum_conserving)
+				{
+					if ((node2->m_x - node1->m_x).norm() > SIMD_EPSILON)  // near zero-length links add nothing in this mode
+					{
+						btVector3 dir = (node2->m_x - node1->m_x).normalized();
+						for (int d = 0; d < 3; ++d)
+						{
+							if (node1->m_im > 0)  // only nodes with m_im > 0 are updated (presumably the non-fixed nodes -- confirm)
+								diagA[id1][d] -= scaled_k_damp * dir[d] * dir[d];
+							if (node2->m_im > 0)
+								diagA[id2][d] -= scaled_k_damp * dir[d] * dir[d];
+						}
+					}
+				}
+				else
+				{
+					for (int d = 0; d < 3; ++d)  // velocity-direction damping: the same amount on every axis
+					{
+						if (node1->m_im > 0)
+							diagA[id1][d] -= scaled_k_damp;
+						if (node2->m_im > 0)
+							diagA[id2][d] -= scaled_k_damp;
+					}
+				}
+			}
+		}
+	}
+
+	virtual double totalElasticEnergy(btScalar dt)  // sum over the links of active bodies of 0.5 * k * (length - r)^2; dt is not used
+	{
+		double energy = 0;
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			const btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_links.size(); ++j)
+			{
+				const btSoftBody::Link& link = psb->m_links[j];
+				btSoftBody::Node* node1 = link.m_n[0];
+				btSoftBody::Node* node2 = link.m_n[1];
+				btScalar r = link.m_rl;
+
+				// elastic energy: use the same per-link stiffness as addScaledElasticForce so energy and force agree on bending links
+				btScalar stretch = (node2->m_q - node1->m_q).norm() - r;
+				energy += 0.5 * (link.m_bbending ? m_bendingStiffness : m_elasticStiffness) * stretch * stretch;
+			}
+		}
+		return energy;
+	}
+
+	// Damping energy: minus the dot product of the 0.5-scaled damping force with the node velocities, divided by dt.
+	// The force buffer is sized from the node indices of every body, active or not, because the accumulation loop
+	// below reads dampingForce[node.index] for all bodies; sizing it from active bodies only could leave that read
+	// out of range. addScaledDampingForce skips inactive bodies, so their entries are expected to stay zero.
+	virtual double totalDampingEnergy(btScalar dt)
+	{
+		double energy = 0;
+		int sz = 0;
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				sz = btMax(sz, psb->m_nodes[j].index);
+			}
+		}
+		TVStack dampingForce;
+		dampingForce.resize(sz + 1);
+		for (int i = 0; i < dampingForce.size(); ++i)
+			dampingForce[i].setZero();
+		addScaledDampingForce(0.5, dampingForce);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				const btSoftBody::Node& node = psb->m_nodes[j];
+				energy -= dampingForce[node.index].dot(node.m_v) / dt;
+			}
+		}
+		return energy;
+	}
+
+	virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
+	{
+		// implicit elastic force differential
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			const btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_links.size(); ++j)
+			{
+				const btSoftBody::Link& link = psb->m_links[j];
+				btSoftBody::Node* node1 = link.m_n[0];
+				btSoftBody::Node* node2 = link.m_n[1];
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				btScalar r = link.m_rl;
 
-                btVector3 local_scaled_df = scaled_k_damp * (dv[id2] - dv[id1]);
-                if (m_momentum_conserving)
-                {
-                    if ((node2->m_x - node1->m_x).norm() > SIMD_EPSILON)
-                    {
-                        btVector3 dir = (node2->m_x - node1->m_x).normalized();
-                        local_scaled_df= scaled_k_damp * (dv[id2] - dv[id1]).dot(dir) * dir;
-                    }
-                }
-                df[id1] += local_scaled_df;
-                df[id2] -= local_scaled_df;
-            }
-        }
-    }
-    
-    virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA)
-    {
-        // implicit damping force differential
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            btScalar scaled_k_damp = m_dampingStiffness * scale;
-            for (int j = 0; j < psb->m_links.size(); ++j)
-            {
-                const btSoftBody::Link& link = psb->m_links[j];
-                btSoftBody::Node* node1 = link.m_n[0];
-                btSoftBody::Node* node2 = link.m_n[1];
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                if (m_momentum_conserving)
-                {
-                    if ((node2->m_x - node1->m_x).norm() > SIMD_EPSILON)
-                    {
-                        btVector3 dir = (node2->m_x - node1->m_x).normalized();
-                        for (int d = 0; d < 3; ++d)
-                        {
-                            if (node1->m_im > 0)
-                                diagA[id1][d] -= scaled_k_damp * dir[d] * dir[d];
-                            if (node2->m_im > 0)
-                                diagA[id2][d] -= scaled_k_damp * dir[d] * dir[d];
-                        }
-                    }
-                }
-                else
-                {
-                    for (int d = 0; d < 3; ++d)
-                    {
-                        if (node1->m_im > 0)
-                            diagA[id1][d] -= scaled_k_damp;
-                        if (node2->m_im > 0)
-                            diagA[id2][d] -= scaled_k_damp;
-                    }
-                }
-            }
-        }
-    }
-    
-    virtual double totalElasticEnergy(btScalar dt)
-    {
-        double energy = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            const btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_links.size(); ++j)
-            {
-                const btSoftBody::Link& link = psb->m_links[j];
-                btSoftBody::Node* node1 = link.m_n[0];
-                btSoftBody::Node* node2 = link.m_n[1];
-                btScalar r = link.m_rl;
+				btVector3 dir = (node1->m_q - node2->m_q);
+				btScalar dir_norm = dir.norm();
+				btVector3 dir_normalized = (dir_norm > SIMD_EPSILON) ? dir.normalized() : btVector3(0, 0, 0);
+				btVector3 dx_diff = dx[id1] - dx[id2];
+				btVector3 scaled_df = btVector3(0, 0, 0);
+				btScalar scaled_k = scale * (link.m_bbending ? m_bendingStiffness : m_elasticStiffness);
+				if (dir_norm > SIMD_EPSILON)
+				{
+					scaled_df -= scaled_k * dir_normalized.dot(dx_diff) * dir_normalized;
+					scaled_df += scaled_k * dir_normalized.dot(dx_diff) * ((dir_norm - r) / dir_norm) * dir_normalized;
+					scaled_df -= scaled_k * ((dir_norm - r) / dir_norm) * dx_diff;
+				}
 
-                // elastic force
-                btVector3 dir = (node2->m_q - node1->m_q);
-                energy += 0.5 * m_elasticStiffness * (dir.norm() - r) * (dir.norm() -r);
-            }
-        }
-        return energy;
-    }
-    
-    virtual double totalDampingEnergy(btScalar dt)
-    {
-        double energy = 0;
-        int sz = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                sz = btMax(sz, psb->m_nodes[j].index);
-            }
-        }
-        TVStack dampingForce;
-        dampingForce.resize(sz+1);
-        for (int i = 0; i < dampingForce.size(); ++i)
-            dampingForce[i].setZero();
-        addScaledDampingForce(0.5, dampingForce);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                const btSoftBody::Node& node = psb->m_nodes[j];
-                energy -= dampingForce[node.index].dot(node.m_v) / dt;
-            }
-        }
-        return energy;
-    }
-    
-    virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
-    {
-        // implicit damping force differential
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            const btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_links.size(); ++j)
-            {
-                const btSoftBody::Link& link = psb->m_links[j];
-                btSoftBody::Node* node1 = link.m_n[0];
-                btSoftBody::Node* node2 = link.m_n[1];
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                btScalar r = link.m_rl;
+				df[id1] += scaled_df;
+				df[id2] -= scaled_df;
+			}
+		}
+	}
 
-                btVector3 dir = (node1->m_q - node2->m_q);
-                btScalar dir_norm = dir.norm();
-                btVector3 dir_normalized = (dir_norm > SIMD_EPSILON) ? dir.normalized() : btVector3(0,0,0);
-                btVector3 dx_diff = dx[id1] - dx[id2];
-                btVector3 scaled_df = btVector3(0,0,0);
-                btScalar scaled_k = scale * (link.m_bbending ? m_bendingStiffness : m_elasticStiffness);
-                if (dir_norm > SIMD_EPSILON)
-                {
-                    scaled_df -= scaled_k * dir_normalized.dot(dx_diff) * dir_normalized;
-                    scaled_df += scaled_k * dir_normalized.dot(dx_diff) * ((dir_norm-r)/dir_norm) * dir_normalized;
-                    scaled_df -= scaled_k * ((dir_norm-r)/dir_norm) * dx_diff;
-                }
-                
-                df[id1] += scaled_df;
-                df[id2] -= scaled_df;
-            }
-        }
-    }
-    
-    virtual btDeformableLagrangianForceType getForceType()
-    {
-        return BT_MASSSPRING_FORCE;
-    }
-    
+	virtual btDeformableLagrangianForceType getForceType()  // type tag reported for this force
+	{
+		return BT_MASSSPRING_FORCE;
+	}
 };
 
 #endif /* btMassSpring_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableMousePickingForce.h b/thirdparty/bullet/BulletSoftBody/btDeformableMousePickingForce.h
index 07c10935f4..d218d96214 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableMousePickingForce.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableMousePickingForce.h
@@ -20,126 +20,143 @@
 
 class btDeformableMousePickingForce : public btDeformableLagrangianForce
 {
-    // If true, the damping force will be in the direction of the spring
-    // If false, the damping force will be in the direction of the velocity
-    btScalar m_elasticStiffness, m_dampingStiffness;
-    const btSoftBody::Face& m_face;
-    btVector3 m_mouse_pos;
-    btScalar m_maxForce;
+	// Stiffness and damping coefficients of the picking spring that pulls the nodes of m_face
+	// toward m_mouse_pos; the elastic force applied to each node is capped at m_maxForce
+	btScalar m_elasticStiffness, m_dampingStiffness;
+	const btSoftBody::Face& m_face;
+	btVector3 m_mouse_pos;
+	btScalar m_maxForce;
+
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    btDeformableMousePickingForce(btScalar k, btScalar d, const btSoftBody::Face& face, btVector3 mouse_pos, btScalar maxForce = 0.3) : m_elasticStiffness(k), m_dampingStiffness(d), m_face(face), m_mouse_pos(mouse_pos),  m_maxForce(maxForce)
-    {
-    }
-    
-    virtual void addScaledForces(btScalar scale, TVStack& force)
-    {
-        addScaledDampingForce(scale, force);
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
-    {
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledDampingForce(btScalar scale, TVStack& force)
-    {
-        for (int i = 0; i < 3; ++i)
-        {
-            btVector3 v_diff = m_face.m_n[i]->m_v;
-            btVector3 scaled_force = scale * m_dampingStiffness * v_diff;
-            if ((m_face.m_n[i]->m_x - m_mouse_pos).norm() > SIMD_EPSILON)
-            {
-                btVector3 dir = (m_face.m_n[i]->m_x - m_mouse_pos).normalized();
-                scaled_force = scale * m_dampingStiffness * v_diff.dot(dir) * dir;
-            }
-            force[m_face.m_n[i]->index] -= scaled_force;
-        }
-    }
-    
-    virtual void addScaledElasticForce(btScalar scale, TVStack& force)
-    {
-        btScalar scaled_stiffness = scale * m_elasticStiffness;
-        for (int i = 0; i < 3; ++i)
-        {
-            btVector3 dir = (m_face.m_n[i]->m_q - m_mouse_pos);
-            btVector3 scaled_force = scaled_stiffness * dir;
-            if (scaled_force.safeNorm() > m_maxForce)
-            {
-                scaled_force.safeNormalize();
-                scaled_force *= m_maxForce;
-            }
-            force[m_face.m_n[i]->index] -= scaled_force;
-        }
-    }
-    
-    virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
-    {
-        btScalar scaled_k_damp = m_dampingStiffness * scale;
-        for (int i = 0; i < 3; ++i)
-        {
-            btVector3 local_scaled_df = scaled_k_damp * dv[m_face.m_n[i]->index];
-            if ((m_face.m_n[i]->m_x - m_mouse_pos).norm() > SIMD_EPSILON)
-            {
-                btVector3 dir = (m_face.m_n[i]->m_x - m_mouse_pos).normalized();
-                local_scaled_df= scaled_k_damp * dv[m_face.m_n[i]->index].dot(dir) * dir;
-            }
-            df[m_face.m_n[i]->index] -= local_scaled_df;
-        }
-    }
-    
-    virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA){}
-    
-    virtual double totalElasticEnergy(btScalar dt)
-    {
-        double energy = 0;
-        for (int i = 0; i < 3; ++i)
-        {
-            btVector3 dir = (m_face.m_n[i]->m_q - m_mouse_pos);
-            btVector3 scaled_force = m_elasticStiffness * dir;
-            if (scaled_force.safeNorm() > m_maxForce)
-            {
-                scaled_force.safeNormalize();
-                scaled_force *= m_maxForce;
-            }
-            energy += 0.5 * scaled_force.dot(dir);
-        }
-        return energy;
-    }
-    
-    virtual double totalDampingEnergy(btScalar dt)
-    {
-        double energy = 0;
-        for (int i = 0; i < 3; ++i)
-        {
-            btVector3 v_diff = m_face.m_n[i]->m_v;
-            btVector3 scaled_force = m_dampingStiffness * v_diff;
-            if ((m_face.m_n[i]->m_x - m_mouse_pos).norm() > SIMD_EPSILON)
-            {
-                btVector3 dir = (m_face.m_n[i]->m_x - m_mouse_pos).normalized();
-                scaled_force = m_dampingStiffness * v_diff.dot(dir) * dir;
-            }
-            energy -= scaled_force.dot(m_face.m_n[i]->m_v) / dt;
-        }
-        return energy;
-    }
-    
-    virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
-    {
-        //TODO
-    }
-    
-    void setMousePos(const btVector3& p)
-    {
-        m_mouse_pos = p;
-    }
-    
-    virtual btDeformableLagrangianForceType getForceType()
-    {
-        return BT_MOUSE_PICKING_FORCE;
-    }
-    
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btDeformableMousePickingForce(btScalar k, btScalar d, const btSoftBody::Face& face, btVector3 mouse_pos, btScalar maxForce = 0.3) : m_elasticStiffness(k), m_dampingStiffness(d), m_face(face), m_mouse_pos(mouse_pos), m_maxForce(maxForce)
+	{  // k: elastic stiffness, d: damping stiffness; face is stored by reference, so the picked face must outlive this force object
+	}
+
+	virtual void addScaledForces(btScalar scale, TVStack& force)  // accumulates the scaled damping and elastic picking forces into force
+	{
+		addScaledDampingForce(scale, force);
+		addScaledElasticForce(scale, force);
+	}
+
+	virtual void addScaledExplicitForce(btScalar scale, TVStack& force)  // explicit part: only the elastic picking force is accumulated
+	{
+		addScaledElasticForce(scale, force);
+	}
+
+	virtual void addScaledDampingForce(btScalar scale, TVStack& force)  // subtracts the scaled damping force from the three nodes of the picked face
+	{
+		for (int i = 0; i < 3; ++i)
+		{
+			btVector3 v_diff = m_face.m_n[i]->m_v;  // the node's own velocity; no velocity is subtracted for the mouse position
+			btVector3 scaled_force = scale * m_dampingStiffness * v_diff;
+			if ((m_face.m_n[i]->m_x - m_mouse_pos).norm() > SIMD_EPSILON)  // if the node is not at the mouse position, damp only along the mouse-to-node direction
+			{
+				btVector3 dir = (m_face.m_n[i]->m_x - m_mouse_pos).normalized();
+				scaled_force = scale * m_dampingStiffness * v_diff.dot(dir) * dir;
+			}
+			force[m_face.m_n[i]->index] -= scaled_force;
+		}
+	}
+
+	virtual void addScaledElasticForce(btScalar scale, TVStack& force)  // pulls the three nodes of the picked face toward m_mouse_pos
+	{
+		btScalar scaled_stiffness = scale * m_elasticStiffness;
+		for (int i = 0; i < 3; ++i)
+		{
+			btVector3 dir = (m_face.m_n[i]->m_q - m_mouse_pos);  // offset from the mouse position to the node
+			btVector3 scaled_force = scaled_stiffness * dir;
+			if (scaled_force.safeNorm() > m_maxForce)  // cap the magnitude of the (already scaled) force at m_maxForce
+			{
+				scaled_force.safeNormalize();
+				scaled_force *= m_maxForce;
+			}
+			force[m_face.m_n[i]->index] -= scaled_force;
+		}
+	}
+
+	virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)  // subtracts from df the scaled damping force produced by the velocity change dv
+	{
+		btScalar scaled_k_damp = m_dampingStiffness * scale;
+		for (int i = 0; i < 3; ++i)
+		{
+			btVector3 local_scaled_df = scaled_k_damp * dv[m_face.m_n[i]->index];  // same form as addScaledDampingForce, with dv in place of the node velocity
+			if ((m_face.m_n[i]->m_x - m_mouse_pos).norm() > SIMD_EPSILON)
+			{
+				btVector3 dir = (m_face.m_n[i]->m_x - m_mouse_pos).normalized();
+				local_scaled_df = scaled_k_damp * dv[m_face.m_n[i]->index].dot(dir) * dir;
+			}
+			df[m_face.m_n[i]->index] -= local_scaled_df;
+		}
+	}
+
+	virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA) {}  // no-op: this force adds nothing to diagA
+
+	virtual double totalElasticEnergy(btScalar dt)  // sums 0.5 * (capped spring force) . (node offset from the mouse position) over the three face nodes; dt is not used
+	{
+		double energy = 0;
+		for (int i = 0; i < 3; ++i)
+		{
+			btVector3 dir = (m_face.m_n[i]->m_q - m_mouse_pos);
+			btVector3 scaled_force = m_elasticStiffness * dir;
+			if (scaled_force.safeNorm() > m_maxForce)  // same m_maxForce cap as addScaledElasticForce, applied to the unscaled force
+			{
+				scaled_force.safeNormalize();
+				scaled_force *= m_maxForce;
+			}
+			energy += 0.5 * scaled_force.dot(dir);
+		}
+		return energy;
+	}
+
+	virtual double totalDampingEnergy(btScalar dt)  // accumulates minus (damping force . node velocity) / dt over the three face nodes
+	{
+		double energy = 0;
+		for (int i = 0; i < 3; ++i)
+		{
+			btVector3 v_diff = m_face.m_n[i]->m_v;
+			btVector3 scaled_force = m_dampingStiffness * v_diff;
+			if ((m_face.m_n[i]->m_x - m_mouse_pos).norm() > SIMD_EPSILON)  // same projection onto the mouse-to-node direction as addScaledDampingForce
+			{
+				btVector3 dir = (m_face.m_n[i]->m_x - m_mouse_pos).normalized();
+				scaled_force = m_dampingStiffness * v_diff.dot(dir) * dir;
+			}
+			energy -= scaled_force.dot(m_face.m_n[i]->m_v) / dt;
+		}
+		return energy;
+	}
+
+	virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)  // adds to df the scaled elastic force change for the displacement dx of the three face nodes
+	{
+		btScalar scaled_stiffness = scale * m_elasticStiffness;
+		for (int i = 0; i < 3; ++i)
+		{
+			btVector3 dir = (m_face.m_n[i]->m_q - m_mouse_pos);
+			btScalar dir_norm = dir.norm();
+			btVector3 dir_normalized = (dir_norm > SIMD_EPSILON) ? dir.normalized() : btVector3(0, 0, 0);
+			int id = m_face.m_n[i]->index;
+			btVector3 dx_diff = dx[id];  // only the node moves; no displacement is subtracted for the mouse position
+			btScalar r = 0;  // rest length is 0 for picking spring
+			btVector3 scaled_df = btVector3(0, 0, 0);
+			if (dir_norm > SIMD_EPSILON)  // a node sitting on the mouse position contributes nothing
+			{
+				scaled_df -= scaled_stiffness * dir_normalized.dot(dx_diff) * dir_normalized;
+				scaled_df += scaled_stiffness * dir_normalized.dot(dx_diff) * ((dir_norm - r) / dir_norm) * dir_normalized;  // with r = 0 this cancels the line above
+				scaled_df -= scaled_stiffness * ((dir_norm - r) / dir_norm) * dx_diff;  // with r = 0 this is -scaled_stiffness * dx_diff
+			}
+			df[id] += scaled_df;  // NOTE(review): the m_maxForce cap used in addScaledElasticForce is not reflected here
+		}
+	}
+
+	void setMousePos(const btVector3& p)  // replaces the position the picking spring pulls toward
+	{
+		m_mouse_pos = p;
+	}
+
+	virtual btDeformableLagrangianForceType getForceType()  // type tag reported for this force
+	{
+		return BT_MOUSE_PICKING_FORCE;
+	}
 };
 
 #endif /* btMassSpring_h */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.cpp b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.cpp
index c8cc47923e..631fd5fbed 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.cpp
@@ -13,131 +13,132 @@
  3. This notice may not be removed or altered from any source distribution.
  */
 
-
 #include "btDeformableMultiBodyConstraintSolver.h"
 #include <iostream>
 // override the iterations method to include deformable/multibody contact
-btScalar btDeformableMultiBodyConstraintSolver::solveDeformableGroupIterations(btCollisionObject** bodies,int numBodies,btCollisionObject** deformableBodies,int numDeformableBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer)
+btScalar btDeformableMultiBodyConstraintSolver::solveDeformableGroupIterations(btCollisionObject** bodies, int numBodies, btCollisionObject** deformableBodies, int numDeformableBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer)
 {
-    {
-        ///this is a special step to resolve penetrations (just for contacts)
-        solveGroupCacheFriendlySplitImpulseIterations(bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer);
+	{  // split-impulse pass first, then alternate rigid and deformable contact solves until the residual test below passes
+		///this is a special step to resolve penetrations (just for contacts)
+		solveGroupCacheFriendlySplitImpulseIterations(bodies, numBodies, deformableBodies, numDeformableBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer);
+
+		int maxIterations = m_maxOverrideNumSolverIterations > infoGlobal.m_numIterations ? m_maxOverrideNumSolverIterations : infoGlobal.m_numIterations;  // iteration cap: the larger of the two counts
+		for (int iteration = 0; iteration < maxIterations; iteration++)
+		{
+			// rigid bodies are solved using solver body velocity, but rigid/deformable contact directly uses the velocity of the actual rigid body. So we have to do the following: Solve one iteration of the rigid/rigid contact, get the updated velocity in the solver body and update the velocity of the underlying rigid body. Then solve the rigid/deformable contact. Finally, grab the (once again) updated rigid velocity and update the velocity of the wrapping solver body
 
-        int maxIterations = m_maxOverrideNumSolverIterations > infoGlobal.m_numIterations ? m_maxOverrideNumSolverIterations : infoGlobal.m_numIterations;
-        for (int iteration = 0; iteration < maxIterations; iteration++)
-        {
-            // rigid bodies are solved using solver body velocity, but rigid/deformable contact directly uses the velocity of the actual rigid body. So we have to do the following: Solve one iteration of the rigid/rigid contact, get the updated velocity in the solver body and update the velocity of the underlying rigid body. Then solve the rigid/deformable contact. Finally, grab the (once again) updated rigid velocity and update the velocity of the wrapping solver body
-            
-            // solve rigid/rigid in solver body
-            m_leastSquaresResidual = solveSingleIteration(iteration, bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer);
-            // solver body velocity -> rigid body velocity
-            solverBodyWriteBack(infoGlobal);
-            btScalar deformableResidual = m_deformableSolver->solveContactConstraints(deformableBodies,numDeformableBodies, infoGlobal);
-            // update rigid body velocity in rigid/deformable contact
-            m_leastSquaresResidual = btMax(m_leastSquaresResidual, deformableResidual);
-            // solver body velocity <- rigid body velocity
-            writeToSolverBody(bodies, numBodies, infoGlobal);
-            
-            if (m_leastSquaresResidual <= infoGlobal.m_leastSquaresResidualThreshold || (iteration >= (maxIterations - 1)))
-            {
+			// solve rigid/rigid in solver body
+			m_leastSquaresResidual = solveSingleIteration(iteration, bodies, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal, debugDrawer);
+			// solver body velocity -> rigid body velocity
+			solverBodyWriteBack(infoGlobal);
+			btScalar deformableResidual = m_deformableSolver->solveContactConstraints(deformableBodies, numDeformableBodies, infoGlobal);
+			// update rigid body velocity in rigid/deformable contact
+			m_leastSquaresResidual = btMax(m_leastSquaresResidual, deformableResidual);  // keep the larger of the rigid and deformable residuals
+			// solver body velocity <- rigid body velocity
+			writeToSolverBody(bodies, numBodies, infoGlobal);
+
+			if (m_leastSquaresResidual <= infoGlobal.m_leastSquaresResidualThreshold || (iteration >= (maxIterations - 1)))  // residual at or below threshold, or last allowed iteration
+			{
 #ifdef VERBOSE_RESIDUAL_PRINTF
-                printf("residual = %f at iteration #%d\n", m_leastSquaresResidual, iteration);
+				if (iteration >= (maxIterations - 1))  // print only when the iteration cap was reached
+					printf("residual = %f at iteration #%d\n", m_leastSquaresResidual, iteration);
 #endif
-                m_analyticsData.m_numSolverCalls++;
-                m_analyticsData.m_numIterationsUsed = iteration+1;
-                m_analyticsData.m_islandId = -2;
-                if (numBodies>0)
-                    m_analyticsData.m_islandId = bodies[0]->getCompanionId();
-                m_analyticsData.m_numBodies = numBodies;
-                m_analyticsData.m_numContactManifolds = numManifolds;
-                m_analyticsData.m_remainingLeastSquaresResidual = m_leastSquaresResidual;
-                break;
-            }
-        }
-    }
-    return 0.f;
+				m_analyticsData.m_numSolverCalls++;
+				m_analyticsData.m_numIterationsUsed = iteration + 1;
+				m_analyticsData.m_islandId = -2;  // value kept when numBodies is 0
+				if (numBodies > 0)
+					m_analyticsData.m_islandId = bodies[0]->getCompanionId();
+				m_analyticsData.m_numBodies = numBodies;
+				m_analyticsData.m_numContactManifolds = numManifolds;
+				m_analyticsData.m_remainingLeastSquaresResidual = m_leastSquaresResidual;
+				break;
+			}
+		}
+	}
+	return 0.f;  // always 0; the residual itself is left in m_leastSquaresResidual
 }
 
-void btDeformableMultiBodyConstraintSolver::solveDeformableBodyGroup(btCollisionObject * *bodies, int numBodies, btCollisionObject * *deformableBodies, int numDeformableBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, btMultiBodyConstraint** multiBodyConstraints, int numMultiBodyConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher)
+void btDeformableMultiBodyConstraintSolver::solveDeformableBodyGroup(btCollisionObject** bodies, int numBodies, btCollisionObject** deformableBodies, int numDeformableBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, btMultiBodyConstraint** multiBodyConstraints, int numMultiBodyConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher)
 {
-    m_tmpMultiBodyConstraints = multiBodyConstraints;
-    m_tmpNumMultiBodyConstraints = numMultiBodyConstraints;
-    
-    // inherited from MultiBodyConstraintSolver
-    solveGroupCacheFriendlySetup(bodies, numBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer);
-    
-    // overriden
-    solveDeformableGroupIterations(bodies, numBodies, deformableBodies, numDeformableBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer);
-    
-    // inherited from MultiBodyConstraintSolver
-    solveGroupCacheFriendlyFinish(bodies, numBodies, info);
-    
-    m_tmpMultiBodyConstraints = 0;
-    m_tmpNumMultiBodyConstraints = 0;
+	m_tmpMultiBodyConstraints = multiBodyConstraints;  // held in members only for the duration of this call (reset to 0 below)
+	m_tmpNumMultiBodyConstraints = numMultiBodyConstraints;
+
+	// inherited from MultiBodyConstraintSolver
+	solveGroupCacheFriendlySetup(bodies, numBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer);
+
+	// overridden
+	solveDeformableGroupIterations(bodies, numBodies, deformableBodies, numDeformableBodies, manifold, numManifolds, constraints, numConstraints, info, debugDrawer);
+
+	// inherited from MultiBodyConstraintSolver
+	solveGroupCacheFriendlyFinish(bodies, numBodies, info);
+
+	m_tmpMultiBodyConstraints = 0;  // drop the temporary pointer and count again
+	m_tmpNumMultiBodyConstraints = 0;
 }
 
 void btDeformableMultiBodyConstraintSolver::writeToSolverBody(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal)
 {
-    for (int i = 0; i < numBodies; i++)
-    {
-        int bodyId = getOrInitSolverBody(*bodies[i], infoGlobal.m_timeStep);
+	for (int i = 0; i < numBodies; i++)
+	{
+		int bodyId = getOrInitSolverBody(*bodies[i], infoGlobal.m_timeStep);  // used below as the index into m_tmpSolverBodyPool
 
-        btRigidBody* body = btRigidBody::upcast(bodies[i]);
-        if (body && body->getInvMass())
-        {
-            btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
-            solverBody.m_linearVelocity = body->getLinearVelocity() - solverBody.m_deltaLinearVelocity;
-            solverBody.m_angularVelocity = body->getAngularVelocity() - solverBody.m_deltaAngularVelocity;
-        }
-    }
+		btRigidBody* body = btRigidBody::upcast(bodies[i]);
+		if (body && body->getInvMass())  // skip when the upcast yields null or the inverse mass is zero
+		{
+			btSolverBody& solverBody = m_tmpSolverBodyPool[bodyId];
+			solverBody.m_linearVelocity = body->getLinearVelocity() - solverBody.m_deltaLinearVelocity;  // base velocity = body velocity minus the stored delta
+			solverBody.m_angularVelocity = body->getAngularVelocity() - solverBody.m_deltaAngularVelocity;
+		}
+	}
 }
 
 void btDeformableMultiBodyConstraintSolver::solverBodyWriteBack(const btContactSolverInfo& infoGlobal)
 {
-    for (int i = 0; i < m_tmpSolverBodyPool.size(); i++)
-    {
-        btRigidBody* body = m_tmpSolverBodyPool[i].m_originalBody;
-        if (body)
-        {
-            m_tmpSolverBodyPool[i].m_originalBody->setLinearVelocity(m_tmpSolverBodyPool[i].m_linearVelocity + m_tmpSolverBodyPool[i].m_deltaLinearVelocity);
-            m_tmpSolverBodyPool[i].m_originalBody->setAngularVelocity(m_tmpSolverBodyPool[i].m_angularVelocity+m_tmpSolverBodyPool[i].m_deltaAngularVelocity);
-        }
-    }
+	for (int i = 0; i < m_tmpSolverBodyPool.size(); i++)
+	{
+		btRigidBody* body = m_tmpSolverBodyPool[i].m_originalBody;
+		if (body)  // pool entries without an original body are skipped
+		{
+			m_tmpSolverBodyPool[i].m_originalBody->setLinearVelocity(m_tmpSolverBodyPool[i].m_linearVelocity + m_tmpSolverBodyPool[i].m_deltaLinearVelocity);  // body velocity = base velocity plus stored delta
+			m_tmpSolverBodyPool[i].m_originalBody->setAngularVelocity(m_tmpSolverBodyPool[i].m_angularVelocity + m_tmpSolverBodyPool[i].m_deltaAngularVelocity);
+		}
+	}
 }
 
-void btDeformableMultiBodyConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer)
+void btDeformableMultiBodyConstraintSolver::solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies, int numBodies, btCollisionObject** deformableBodies, int numDeformableBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer)
 {
-    BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations");
-    int iteration;
-    if (infoGlobal.m_splitImpulse)
-    {
-        {
-//            m_deformableSolver->splitImpulseSetup(infoGlobal);
-            for (iteration = 0; iteration < infoGlobal.m_numIterations; iteration++)
-            {
-                btScalar leastSquaresResidual = 0.f;
-                {
-                    int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
-                    int j;
-                    for (j = 0; j < numPoolConstraints; j++)
-                    {
-                        const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
-                        
-                        btScalar residual = resolveSplitPenetrationImpulse(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
-                        leastSquaresResidual = btMax(leastSquaresResidual, residual * residual);
-                    }
-                    // solve the position correction between deformable and rigid/multibody
-//                    btScalar residual = m_deformableSolver->solveSplitImpulse(infoGlobal);
-//                    leastSquaresResidual = btMax(leastSquaresResidual, residual * residual);
-                }
-                if (leastSquaresResidual <= infoGlobal.m_leastSquaresResidualThreshold || iteration >= (infoGlobal.m_numIterations - 1))
-                {
+	BT_PROFILE("solveGroupCacheFriendlySplitImpulseIterations");
+	int iteration;
+	if (infoGlobal.m_splitImpulse)  // the whole routine is a no-op when split impulse is off
+	{
+		{
+			for (iteration = 0; iteration < infoGlobal.m_numIterations; iteration++)
+			{
+				btScalar leastSquaresResidual = 0.f;  // largest squared residual seen in this sweep
+				{
+					int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+					int j;
+					for (j = 0; j < numPoolConstraints; j++)
+					{
+						const btSolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];  // visit constraints in the order given by m_orderTmpConstraintPool
+
+						btScalar residual = resolveSplitPenetrationImpulse(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+						leastSquaresResidual = btMax(leastSquaresResidual, residual * residual);
+					}
+					// solve the position correction between deformable and rigid/multibody
+					//                    btScalar residual = m_deformableSolver->solveSplitImpulse(infoGlobal);
+					btScalar residual = m_deformableSolver->m_objective->m_projection.solveSplitImpulse(deformableBodies, numDeformableBodies, infoGlobal);  // deformable residual, folded into the same maximum below
+					leastSquaresResidual = btMax(leastSquaresResidual, residual * residual);
+				}
+				if (leastSquaresResidual <= infoGlobal.m_leastSquaresResidualThreshold || iteration >= (infoGlobal.m_numIterations - 1))  // residual at or below threshold, or last allowed iteration
+				{
 #ifdef VERBOSE_RESIDUAL_PRINTF
-                    printf("residual = %f at iteration #%d\n", leastSquaresResidual, iteration);
+					if (iteration >= (infoGlobal.m_numIterations - 1))  // print only when the iteration cap was reached
+						printf("split impulse residual = %f at iteration #%d\n", leastSquaresResidual, iteration);
 #endif
-                    break;
-                }
-            }
-        }
-    }
+					break;
+				}
+			}
+		}
+	}
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.h b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.h
index 0c7cc26a83..94aabce838 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyConstraintSolver.h
@@ -13,7 +13,6 @@
  3. This notice may not be removed or altered from any source distribution.
  */
 
-
 #ifndef BT_DEFORMABLE_MULTIBODY_CONSTRAINT_SOLVER_H
 #define BT_DEFORMABLE_MULTIBODY_CONSTRAINT_SOLVER_H
 
@@ -32,30 +31,31 @@ class btDeformableBodySolver;
 ATTRIBUTE_ALIGNED16(class)
 btDeformableMultiBodyConstraintSolver : public btMultiBodyConstraintSolver
 {
-    btDeformableBodySolver* m_deformableSolver;
-    
+	btDeformableBodySolver* m_deformableSolver;  // assigned through setDeformableSolver()
+
 protected:
-    // override the iterations method to include deformable/multibody contact
-//    virtual btScalar solveGroupCacheFriendlyIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
-    
-    // write the velocity of the the solver body to the underlying rigid body
-    void solverBodyWriteBack(const btContactSolverInfo& infoGlobal);
-
-    // write the velocity of the underlying rigid body to the the the solver body
-    void writeToSolverBody(btCollisionObject** bodies, int numBodies, const btContactSolverInfo& infoGlobal);
-    
-    virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject** bodies, int numBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer);
-    
-    virtual btScalar solveDeformableGroupIterations(btCollisionObject** bodies,int numBodies,btCollisionObject** deformableBodies,int numDeformableBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+	// override the iterations method to include deformable/multibody contact
+	//    virtual btScalar solveGroupCacheFriendlyIterations(btCollisionObject** bodies,int numBodies,btPersistentManifold** manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btContactSolverInfo& infoGlobal,btIDebugDraw* debugDrawer);
+
+	// write the velocity of the solver body to the underlying rigid body
+	void solverBodyWriteBack(const btContactSolverInfo& infoGlobal);
+
+	// write the velocity of the underlying rigid body to the solver body
+	void writeToSolverBody(btCollisionObject * *bodies, int numBodies, const btContactSolverInfo& infoGlobal);
+
+	virtual void solveGroupCacheFriendlySplitImpulseIterations(btCollisionObject * *bodies, int numBodies, btCollisionObject** deformableBodies, int numDeformableBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer);
+
+	virtual btScalar solveDeformableGroupIterations(btCollisionObject * *bodies, int numBodies, btCollisionObject** deformableBodies, int numDeformableBodies, btPersistentManifold** manifoldPtr, int numManifolds, btTypedConstraint** constraints, int numConstraints, const btContactSolverInfo& infoGlobal, btIDebugDraw* debugDrawer);
+
 public:
-    BT_DECLARE_ALIGNED_ALLOCATOR();
-    
-    void setDeformableSolver(btDeformableBodySolver* deformableSolver)
-    {
-        m_deformableSolver = deformableSolver;
-    }
-    
-    virtual void solveDeformableBodyGroup(btCollisionObject * *bodies, int numBodies, btCollisionObject * *deformableBodies, int numDeformableBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, btMultiBodyConstraint** multiBodyConstraints, int numMultiBodyConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher);
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	void setDeformableSolver(btDeformableBodySolver * deformableSolver)  // stores the pointer in m_deformableSolver; no other action is taken
+	{
+		m_deformableSolver = deformableSolver;
+	}
+
+	virtual void solveDeformableBodyGroup(btCollisionObject * *bodies, int numBodies, btCollisionObject** deformableBodies, int numDeformableBodies, btPersistentManifold** manifold, int numManifolds, btTypedConstraint** constraints, int numConstraints, btMultiBodyConstraint** multiBodyConstraints, int numMultiBodyConstraints, const btContactSolverInfo& info, btIDebugDraw* debugDrawer, btDispatcher* dispatcher);
 };
 
 #endif /* BT_DEFORMABLE_MULTIBODY_CONSTRAINT_SOLVER_H */
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.cpp b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.cpp
index 6b742978ef..983e622b5f 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.cpp
@@ -40,8 +40,9 @@ The algorithm also closely resembles the one in http://physbam.stanford.edu/~fed
 #include "LinearMath/btQuickprof.h"
 #include "btSoftBodyInternals.h"
 btDeformableMultiBodyDynamicsWorld::btDeformableMultiBodyDynamicsWorld(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btDeformableMultiBodyConstraintSolver* constraintSolver, btCollisionConfiguration* collisionConfiguration, btDeformableBodySolver* deformableBodySolver)
-: btMultiBodyDynamicsWorld(dispatcher, pairCache, (btMultiBodyConstraintSolver*)constraintSolver, collisionConfiguration),
-m_deformableBodySolver(deformableBodySolver), m_solverCallback(0)
+	: btMultiBodyDynamicsWorld(dispatcher, pairCache, (btMultiBodyConstraintSolver*)constraintSolver, collisionConfiguration),
+	  m_deformableBodySolver(deformableBodySolver),
+	  m_solverCallback(0)
 {
 	m_drawFlags = fDrawFlags::Std;
 	m_drawNodeTree = true;
@@ -52,7 +53,7 @@ m_deformableBodySolver(deformableBodySolver), m_solverCallback(0)
 	m_sbi.m_sparsesdf.Initialize();
 	m_sbi.m_sparsesdf.setDefaultVoxelsz(0.005);
 	m_sbi.m_sparsesdf.Reset();
-	
+
 	m_sbi.air_density = (btScalar)1.2;
 	m_sbi.water_density = 0;
 	m_sbi.water_offset = 0;
@@ -61,57 +62,57 @@ m_deformableBodySolver(deformableBodySolver), m_solverCallback(0)
 	m_internalTime = 0.0;
 	m_implicit = false;
 	m_lineSearch = false;
-    m_useProjection = true;
+	m_useProjection = false;
 	m_ccdIterations = 5;
 	m_solverDeformableBodyIslandCallback = new DeformableBodyInplaceSolverIslandCallback(constraintSolver, dispatcher);
 }
 
 btDeformableMultiBodyDynamicsWorld::~btDeformableMultiBodyDynamicsWorld()
 {
-    delete m_solverDeformableBodyIslandCallback;
+	delete m_solverDeformableBodyIslandCallback;  // releases the island callback this world holds
 }
 
 void btDeformableMultiBodyDynamicsWorld::internalSingleStepSimulation(btScalar timeStep)
 {
-    BT_PROFILE("internalSingleStepSimulation");
-    if (0 != m_internalPreTickCallback)
-    {
-        (*m_internalPreTickCallback)(this, timeStep);
-    }
-    reinitialize(timeStep);
-    
-    // add gravity to velocity of rigid and multi bodys
-    applyRigidBodyGravity(timeStep);
-    
-    ///apply gravity and explicit force to velocity, predict motion
-    predictUnconstraintMotion(timeStep);
-    
-    ///perform collision detection that involves rigid/multi bodies
-    btMultiBodyDynamicsWorld::performDiscreteCollisionDetection();
-    
-    btMultiBodyDynamicsWorld::calculateSimulationIslands();
-    
-    beforeSolverCallbacks(timeStep);
-    
-    ///solve contact constraints and then deformable bodies momemtum equation
-    solveConstraints(timeStep);
-    
-    afterSolverCallbacks(timeStep);
+	BT_PROFILE("internalSingleStepSimulation");
+	if (0 != m_internalPreTickCallback)  // invoke the pre-tick callback only when one is set
+	{
+		(*m_internalPreTickCallback)(this, timeStep);
+	}
+	reinitialize(timeStep);
+
+	// add gravity to velocity of rigid and multi bodies
+	applyRigidBodyGravity(timeStep);
+
+	///apply gravity and explicit force to velocity, predict motion
+	predictUnconstraintMotion(timeStep);
+
+	///perform collision detection that involves rigid/multi bodies
+	btMultiBodyDynamicsWorld::performDiscreteCollisionDetection();
+
+	btMultiBodyDynamicsWorld::calculateSimulationIslands();
+
+	beforeSolverCallbacks(timeStep);
+
+	///solve contact constraints and then deformable bodies momentum equation
+	solveConstraints(timeStep);
+
+	afterSolverCallbacks(timeStep);
 
 	performDeformableCollisionDetection();
 
-    applyRepulsionForce(timeStep);
+	applyRepulsionForce(timeStep);
+
+	performGeometricCollisions(timeStep);
+
+	integrateTransforms(timeStep);
 
-    performGeometricCollisions(timeStep);
+	///update vehicle simulation
+	btMultiBodyDynamicsWorld::updateActions(timeStep);
 
-    integrateTransforms(timeStep);
-    
-    ///update vehicle simulation
-    btMultiBodyDynamicsWorld::updateActions(timeStep);
-    
-    updateActivationState(timeStep);
-    // End solver-wise simulation step
-    // ///////////////////////////////
+	updateActivationState(timeStep);  // last call of the step
+	// End solver-wise simulation step
+	// ///////////////////////////////
 }
 
 void btDeformableMultiBodyDynamicsWorld::performDeformableCollisionDetection()
@@ -120,7 +121,7 @@ void btDeformableMultiBodyDynamicsWorld::performDeformableCollisionDetection()
 	{
 		m_softBodies[i]->m_softSoftCollision = true;
 	}
-	
+
 	for (int i = 0; i < m_softBodies.size(); ++i)
 	{
 		for (int j = i; j < m_softBodies.size(); ++j)
@@ -128,7 +129,7 @@ void btDeformableMultiBodyDynamicsWorld::performDeformableCollisionDetection()
 			m_softBodies[i]->defaultCollisionHandler(m_softBodies[j]);
 		}
 	}
-	
+
 	for (int i = 0; i < m_softBodies.size(); ++i)
 	{
 		m_softBodies[i]->m_softSoftCollision = false;
@@ -137,45 +138,45 @@ void btDeformableMultiBodyDynamicsWorld::performDeformableCollisionDetection()
 
 void btDeformableMultiBodyDynamicsWorld::updateActivationState(btScalar timeStep)
 {
-    for (int i = 0; i < m_softBodies.size(); i++)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        psb->updateDeactivation(timeStep);
-        if (psb->wantsSleeping())
-        {
-            if (psb->getActivationState() == ACTIVE_TAG)
-                psb->setActivationState(WANTS_DEACTIVATION);
-            if (psb->getActivationState() == ISLAND_SLEEPING)
-            {
-                psb->setZeroVelocity();
-            }
-        }
-        else
-        {
-            if (psb->getActivationState() != DISABLE_DEACTIVATION)
-                psb->setActivationState(ACTIVE_TAG);
-        }
-    }
-    btMultiBodyDynamicsWorld::updateActivationState(timeStep);
+	for (int i = 0; i < m_softBodies.size(); i++)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		psb->updateDeactivation(timeStep);
+		if (psb->wantsSleeping())
+		{
+			if (psb->getActivationState() == ACTIVE_TAG)
+				psb->setActivationState(WANTS_DEACTIVATION);  // request the transition from ACTIVE_TAG
+			if (psb->getActivationState() == ISLAND_SLEEPING)  // separate check, not an else-if: evaluated after the request above
+			{
+				psb->setZeroVelocity();
+			}
+		}
+		else
+		{
+			if (psb->getActivationState() != DISABLE_DEACTIVATION)  // DISABLE_DEACTIVATION is left as is; any other state becomes ACTIVE_TAG
+				psb->setActivationState(ACTIVE_TAG);
+		}
+	}
+	btMultiBodyDynamicsWorld::updateActivationState(timeStep);  // then run the btMultiBodyDynamicsWorld version
 }
 
 void btDeformableMultiBodyDynamicsWorld::applyRepulsionForce(btScalar timeStep)
 {
-    BT_PROFILE("btDeformableMultiBodyDynamicsWorld::applyRepulsionForce");
-    for (int i = 0; i < m_softBodies.size(); i++)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (psb->isActive())
-        {
+	BT_PROFILE("btDeformableMultiBodyDynamicsWorld::applyRepulsionForce");
+	for (int i = 0; i < m_softBodies.size(); i++)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (psb->isActive())  // soft bodies that are not active are skipped
+		{
 			psb->applyRepulsionForce(timeStep, true);
-        }
-    }
+		}
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::performGeometricCollisions(btScalar timeStep)
 {
 	BT_PROFILE("btDeformableMultiBodyDynamicsWorld::performGeometricCollisions");
-    // refit the BVH tree for CCD
+	// refit the BVH tree for CCD
 	for (int i = 0; i < m_softBodies.size(); ++i)
 	{
 		btSoftBody* psb = m_softBodies[i];
@@ -214,7 +215,7 @@ void btDeformableMultiBodyDynamicsWorld::performGeometricCollisions(btScalar tim
 					f.m_vn = (f.m_n[1]->m_v - f.m_n[0]->m_v).cross(f.m_n[2]->m_v - f.m_n[0]->m_v) * timeStep * timeStep;
 				}
 			}
-        }
+		}
 
 		// apply CCD to register new contact points
 		for (int i = 0; i < m_softBodies.size(); ++i)
@@ -228,7 +229,7 @@ void btDeformableMultiBodyDynamicsWorld::performGeometricCollisions(btScalar tim
 					m_softBodies[i]->geometricCollisionHandler(m_softBodies[j]);
 				}
 			}
-        }
+		}
 
 		int penetration_count = 0;
 		for (int i = 0; i < m_softBodies.size(); ++i)
@@ -258,294 +259,292 @@ void btDeformableMultiBodyDynamicsWorld::performGeometricCollisions(btScalar tim
 
 void btDeformableMultiBodyDynamicsWorld::softBodySelfCollision()
 {
-    BT_PROFILE("btDeformableMultiBodyDynamicsWorld::softBodySelfCollision");
-    for (int i = 0; i < m_softBodies.size(); i++)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        if (psb->isActive())
-        {
-            psb->defaultCollisionHandler(psb);
-        }
-    }
+	BT_PROFILE("btDeformableMultiBodyDynamicsWorld::softBodySelfCollision");
+	for (int i = 0; i < m_softBodies.size(); i++)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		if (psb->isActive())
+		{
+			psb->defaultCollisionHandler(psb);
+		}
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::positionCorrection(btScalar timeStep)
 {
-    // correct the position of rigid bodies with temporary velocity generated from split impulse
-    btContactSolverInfo infoGlobal;
-    btVector3 zero(0,0,0);
-    for (int i = 0; i < m_nonStaticRigidBodies.size(); ++i)
-    {
-        btRigidBody* rb = m_nonStaticRigidBodies[i];
-        //correct the position/orientation based on push/turn recovery
-        btTransform newTransform;
-        btVector3 pushVelocity = rb->getPushVelocity();
-        btVector3 turnVelocity = rb->getTurnVelocity();
-        if (pushVelocity[0] != 0.f || pushVelocity[1] != 0 || pushVelocity[2] != 0 || turnVelocity[0] != 0.f || turnVelocity[1] != 0 || turnVelocity[2] != 0)
-        {
-            btTransformUtil::integrateTransform(rb->getWorldTransform(), pushVelocity, turnVelocity * infoGlobal.m_splitImpulseTurnErp, timeStep, newTransform);
-            rb->setWorldTransform(newTransform);
-            rb->setPushVelocity(zero);
-            rb->setTurnVelocity(zero);
-        }
-    }
+	// correct the position of rigid bodies with temporary velocity generated from split impulse
+	btContactSolverInfo infoGlobal;
+	btVector3 zero(0, 0, 0);
+	for (int i = 0; i < m_nonStaticRigidBodies.size(); ++i)
+	{
+		btRigidBody* rb = m_nonStaticRigidBodies[i];
+		//correct the position/orientation based on push/turn recovery
+		btTransform newTransform;
+		btVector3 pushVelocity = rb->getPushVelocity();
+		btVector3 turnVelocity = rb->getTurnVelocity();
+		if (pushVelocity[0] != 0.f || pushVelocity[1] != 0 || pushVelocity[2] != 0 || turnVelocity[0] != 0.f || turnVelocity[1] != 0 || turnVelocity[2] != 0)
+		{
+			btTransformUtil::integrateTransform(rb->getWorldTransform(), pushVelocity, turnVelocity * infoGlobal.m_splitImpulseTurnErp, timeStep, newTransform);
+			rb->setWorldTransform(newTransform);
+			rb->setPushVelocity(zero);
+			rb->setTurnVelocity(zero);
+		}
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::integrateTransforms(btScalar timeStep)
 {
-    BT_PROFILE("integrateTransforms");
-    positionCorrection(timeStep);
-    btMultiBodyDynamicsWorld::integrateTransforms(timeStep);
-    for (int i = 0; i < m_softBodies.size(); ++i)
-    {
-        btSoftBody* psb = m_softBodies[i];
-        for (int j = 0; j < psb->m_nodes.size(); ++j)
-        {
-            btSoftBody::Node& node = psb->m_nodes[j];
-            btScalar maxDisplacement = psb->getWorldInfo()->m_maxDisplacement;
-            btScalar clampDeltaV = maxDisplacement / timeStep;
-            for (int c = 0; c < 3; c++)
-            {
-                if (node.m_v[c] > clampDeltaV)
-                {
-                    node.m_v[c] = clampDeltaV;
-                }
-                if (node.m_v[c] < -clampDeltaV)
-                {
-                    node.m_v[c] = -clampDeltaV;
-                }
-            }
-            node.m_x  =  node.m_x + timeStep * node.m_v;
-            node.m_q = node.m_x;
-            node.m_vn = node.m_v;
-        }
-        // enforce anchor constraints
-        for (int j = 0; j < psb->m_deformableAnchors.size();++j)
-        {
-            btSoftBody::DeformableNodeRigidAnchor& a = psb->m_deformableAnchors[j];
-            btSoftBody::Node* n = a.m_node;
-            n->m_x = a.m_cti.m_colObj->getWorldTransform() * a.m_local;
-            
-            // update multibody anchor info
-            if (a.m_cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
-            {
-                btMultiBodyLinkCollider* multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(a.m_cti.m_colObj);
-                if (multibodyLinkCol)
-                {
-                    btVector3 nrm;
-                    const btCollisionShape* shp = multibodyLinkCol->getCollisionShape();
-                    const btTransform& wtr = multibodyLinkCol->getWorldTransform();
-                    psb->m_worldInfo->m_sparsesdf.Evaluate(
-                                                      wtr.invXform(n->m_x),
-                                                      shp,
-                                                      nrm,
-                                                      0);
-                    a.m_cti.m_normal = wtr.getBasis() * nrm;
-                    btVector3 normal = a.m_cti.m_normal;
-                    btVector3 t1 = generateUnitOrthogonalVector(normal);
-                    btVector3 t2 = btCross(normal, t1);
-                    btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;
-                    findJacobian(multibodyLinkCol, jacobianData_normal, a.m_node->m_x, normal);
-                    findJacobian(multibodyLinkCol, jacobianData_t1, a.m_node->m_x, t1);
-                    findJacobian(multibodyLinkCol, jacobianData_t2, a.m_node->m_x, t2);
-            
-                    btScalar* J_n = &jacobianData_normal.m_jacobians[0];
-                    btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
-                    btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
-                    
-                    btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
-                    btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
-                    btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
-                    
-                    btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
-                                    t1.getX(), t1.getY(), t1.getZ(),
-                                    t2.getX(), t2.getY(), t2.getZ()); // world frame to local frame
-                    const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
-                    btMatrix3x3 local_impulse_matrix = (Diagonal(n->m_im) + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
-                    a.m_c0 =  rot.transpose() * local_impulse_matrix * rot;
-                    a.jacobianData_normal = jacobianData_normal;
-                    a.jacobianData_t1 = jacobianData_t1;
-                    a.jacobianData_t2 = jacobianData_t2;
-                    a.t1 = t1;
-                    a.t2 = t2;
-                }
-            }
-        }
-        psb->interpolateRenderMesh();
-    }
+	BT_PROFILE("integrateTransforms");
+	positionCorrection(timeStep);
+	btMultiBodyDynamicsWorld::integrateTransforms(timeStep);
+	for (int i = 0; i < m_softBodies.size(); ++i)
+	{
+		btSoftBody* psb = m_softBodies[i];
+		for (int j = 0; j < psb->m_nodes.size(); ++j)
+		{
+			btSoftBody::Node& node = psb->m_nodes[j];
+			btScalar maxDisplacement = psb->getWorldInfo()->m_maxDisplacement;
+			btScalar clampDeltaV = maxDisplacement / timeStep;
+			for (int c = 0; c < 3; c++)
+			{
+				if (node.m_v[c] > clampDeltaV)
+				{
+					node.m_v[c] = clampDeltaV;
+				}
+				if (node.m_v[c] < -clampDeltaV)
+				{
+					node.m_v[c] = -clampDeltaV;
+				}
+			}
+			node.m_x = node.m_x + timeStep * (node.m_v + node.m_splitv);
+			node.m_q = node.m_x;
+			node.m_vn = node.m_v;
+		}
+		// enforce anchor constraints
+		for (int j = 0; j < psb->m_deformableAnchors.size(); ++j)
+		{
+			btSoftBody::DeformableNodeRigidAnchor& a = psb->m_deformableAnchors[j];
+			btSoftBody::Node* n = a.m_node;
+			n->m_x = a.m_cti.m_colObj->getWorldTransform() * a.m_local;
+
+			// update multibody anchor info
+			if (a.m_cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+			{
+				btMultiBodyLinkCollider* multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(a.m_cti.m_colObj);
+				if (multibodyLinkCol)
+				{
+					btVector3 nrm;
+					const btCollisionShape* shp = multibodyLinkCol->getCollisionShape();
+					const btTransform& wtr = multibodyLinkCol->getWorldTransform();
+					psb->m_worldInfo->m_sparsesdf.Evaluate(
+						wtr.invXform(n->m_x),
+						shp,
+						nrm,
+						0);
+					a.m_cti.m_normal = wtr.getBasis() * nrm;
+					btVector3 normal = a.m_cti.m_normal;
+					btVector3 t1 = generateUnitOrthogonalVector(normal);
+					btVector3 t2 = btCross(normal, t1);
+					btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;
+					findJacobian(multibodyLinkCol, jacobianData_normal, a.m_node->m_x, normal);
+					findJacobian(multibodyLinkCol, jacobianData_t1, a.m_node->m_x, t1);
+					findJacobian(multibodyLinkCol, jacobianData_t2, a.m_node->m_x, t2);
+
+					btScalar* J_n = &jacobianData_normal.m_jacobians[0];
+					btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
+					btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
+
+					btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
+					btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
+					btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
+
+					btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
+									t1.getX(), t1.getY(), t1.getZ(),
+									t2.getX(), t2.getY(), t2.getZ());  // world frame to local frame
+					const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
+					btMatrix3x3 local_impulse_matrix = (Diagonal(n->m_im) + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
+					a.m_c0 = rot.transpose() * local_impulse_matrix * rot;
+					a.jacobianData_normal = jacobianData_normal;
+					a.jacobianData_t1 = jacobianData_t1;
+					a.jacobianData_t2 = jacobianData_t2;
+					a.t1 = t1;
+					a.t2 = t2;
+				}
+			}
+		}
+		psb->interpolateRenderMesh();
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::solveConstraints(btScalar timeStep)
 {
-    BT_PROFILE("btDeformableMultiBodyDynamicsWorld::solveConstraints");
-    // save v_{n+1}^* velocity after explicit forces
-    m_deformableBodySolver->backupVelocity();
-    
-    // set up constraints among multibodies and between multibodies and deformable bodies
-    setupConstraints();
-    
-    // solve contact constraints
-    solveContactConstraints();
-    
-    // set up the directions in which the velocity does not change in the momentum solve
-    if (m_useProjection)
-        m_deformableBodySolver->m_objective->m_projection.setProjection();
-    else
-        m_deformableBodySolver->m_objective->m_projection.setLagrangeMultiplier();
-
-    // for explicit scheme, m_backupVelocity = v_{n+1}^*
-    // for implicit scheme, m_backupVelocity = v_n
-    // Here, set dv = v_{n+1} - v_n for nodes in contact
-    m_deformableBodySolver->setupDeformableSolve(m_implicit);
-    
-    // At this point, dv should be golden for nodes in contact
-    // proceed to solve deformable momentum equation
-    m_deformableBodySolver->solveDeformableConstraints(timeStep);
+	BT_PROFILE("btDeformableMultiBodyDynamicsWorld::solveConstraints");
+	// save v_{n+1}^* velocity after explicit forces
+	m_deformableBodySolver->backupVelocity();
+
+	// set up constraints among multibodies and between multibodies and deformable bodies
+	setupConstraints();
+
+	// solve contact constraints
+	solveContactConstraints();
+
+	// set up the directions in which the velocity does not change in the momentum solve
+	if (m_useProjection)
+		m_deformableBodySolver->m_objective->m_projection.setProjection();
+	else
+		m_deformableBodySolver->m_objective->m_projection.setLagrangeMultiplier();
+
+	// for explicit scheme, m_backupVelocity = v_{n+1}^*
+	// for implicit scheme, m_backupVelocity = v_n
+	// Here, set dv = v_{n+1} - v_n for nodes in contact
+	m_deformableBodySolver->setupDeformableSolve(m_implicit);
+
+	// At this point, dv should be golden for nodes in contact
+	// proceed to solve deformable momentum equation
+	m_deformableBodySolver->solveDeformableConstraints(timeStep);
 }
 
 void btDeformableMultiBodyDynamicsWorld::setupConstraints()
 {
-    // set up constraints between multibody and deformable bodies
-    m_deformableBodySolver->setConstraints(m_solverInfo);
-    
-    // set up constraints among multibodies
-    {
-        sortConstraints();
-        // setup the solver callback
-        btMultiBodyConstraint** sortedMultiBodyConstraints = m_sortedMultiBodyConstraints.size() ? &m_sortedMultiBodyConstraints[0] : 0;
-        btTypedConstraint** constraintsPtr = getNumConstraints() ? &m_sortedConstraints[0] : 0;
-        m_solverDeformableBodyIslandCallback->setup(&m_solverInfo, constraintsPtr, m_sortedConstraints.size(), sortedMultiBodyConstraints, m_sortedMultiBodyConstraints.size(), getDebugDrawer());
-        
-        // build islands
-        m_islandManager->buildIslands(getCollisionWorld()->getDispatcher(), getCollisionWorld());
-    }
+	// set up constraints between multibody and deformable bodies
+	m_deformableBodySolver->setConstraints(m_solverInfo);
+
+	// set up constraints among multibodies
+	{
+		sortConstraints();
+		// setup the solver callback
+		btMultiBodyConstraint** sortedMultiBodyConstraints = m_sortedMultiBodyConstraints.size() ? &m_sortedMultiBodyConstraints[0] : 0;
+		btTypedConstraint** constraintsPtr = getNumConstraints() ? &m_sortedConstraints[0] : 0;
+		m_solverDeformableBodyIslandCallback->setup(&m_solverInfo, constraintsPtr, m_sortedConstraints.size(), sortedMultiBodyConstraints, m_sortedMultiBodyConstraints.size(), getDebugDrawer());
+
+		// build islands
+		m_islandManager->buildIslands(getCollisionWorld()->getDispatcher(), getCollisionWorld());
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::sortConstraints()
 {
-    m_sortedConstraints.resize(m_constraints.size());
-    int i;
-    for (i = 0; i < getNumConstraints(); i++)
-    {
-        m_sortedConstraints[i] = m_constraints[i];
-    }
-    m_sortedConstraints.quickSort(btSortConstraintOnIslandPredicate2());
-    
-    m_sortedMultiBodyConstraints.resize(m_multiBodyConstraints.size());
-    for (i = 0; i < m_multiBodyConstraints.size(); i++)
-    {
-        m_sortedMultiBodyConstraints[i] = m_multiBodyConstraints[i];
-    }
-    m_sortedMultiBodyConstraints.quickSort(btSortMultiBodyConstraintOnIslandPredicate());
+	m_sortedConstraints.resize(m_constraints.size());
+	int i;
+	for (i = 0; i < getNumConstraints(); i++)
+	{
+		m_sortedConstraints[i] = m_constraints[i];
+	}
+	m_sortedConstraints.quickSort(btSortConstraintOnIslandPredicate2());
+
+	m_sortedMultiBodyConstraints.resize(m_multiBodyConstraints.size());
+	for (i = 0; i < m_multiBodyConstraints.size(); i++)
+	{
+		m_sortedMultiBodyConstraints[i] = m_multiBodyConstraints[i];
+	}
+	m_sortedMultiBodyConstraints.quickSort(btSortMultiBodyConstraintOnIslandPredicate());
 }
-    
-    
+
 void btDeformableMultiBodyDynamicsWorld::solveContactConstraints()
 {
-    // process constraints on each island
-    m_islandManager->processIslands(getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_solverDeformableBodyIslandCallback);
-    
-    // process deferred
-    m_solverDeformableBodyIslandCallback->processConstraints();
-    m_constraintSolver->allSolved(m_solverInfo, m_debugDrawer);
-    
-    // write joint feedback
-    {
-        for (int i = 0; i < this->m_multiBodies.size(); i++)
-        {
-            btMultiBody* bod = m_multiBodies[i];
-            
-            bool isSleeping = false;
-            
-            if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
-            {
-                isSleeping = true;
-            }
-            for (int b = 0; b < bod->getNumLinks(); b++)
-            {
-                if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState() == ISLAND_SLEEPING)
-                    isSleeping = true;
-            }
-            
-            if (!isSleeping)
-            {
-                //useless? they get resized in stepVelocities once again (AND DIFFERENTLY)
-                m_scratch_r.resize(bod->getNumLinks() + 1);  //multidof? ("Y"s use it and it is used to store qdd)
-                m_scratch_v.resize(bod->getNumLinks() + 1);
-                m_scratch_m.resize(bod->getNumLinks() + 1);
-                
-                if (bod->internalNeedsJointFeedback())
-                {
-                    if (!bod->isUsingRK4Integration())
-                    {
-                        if (bod->internalNeedsJointFeedback())
-                        {
-                            bool isConstraintPass = true;
-                            bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(m_solverInfo.m_timeStep, m_scratch_r, m_scratch_v, m_scratch_m, isConstraintPass,
-                                                                                      getSolverInfo().m_jointFeedbackInWorldSpace,
-                                                                                      getSolverInfo().m_jointFeedbackInJointFrame);
-                        }
-                    }
-                }
-            }
-        }
-    }
-    
-    for (int i = 0; i < this->m_multiBodies.size(); i++)
-    {
-        btMultiBody* bod = m_multiBodies[i];
-        bod->processDeltaVeeMultiDof2();
-    }
+	// process constraints on each island
+	m_islandManager->processIslands(getCollisionWorld()->getDispatcher(), getCollisionWorld(), m_solverDeformableBodyIslandCallback);
+
+	// process deferred
+	m_solverDeformableBodyIslandCallback->processConstraints();
+	m_constraintSolver->allSolved(m_solverInfo, m_debugDrawer);
+
+	// write joint feedback
+	{
+		for (int i = 0; i < this->m_multiBodies.size(); i++)
+		{
+			btMultiBody* bod = m_multiBodies[i];
+
+			bool isSleeping = false;
+
+			if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+			{
+				isSleeping = true;
+			}
+			for (int b = 0; b < bod->getNumLinks(); b++)
+			{
+				if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState() == ISLAND_SLEEPING)
+					isSleeping = true;
+			}
+
+			if (!isSleeping)
+			{
+				//useless? they get resized in stepVelocities once again (AND DIFFERENTLY)
+				m_scratch_r.resize(bod->getNumLinks() + 1);  //multidof? ("Y"s use it and it is used to store qdd)
+				m_scratch_v.resize(bod->getNumLinks() + 1);
+				m_scratch_m.resize(bod->getNumLinks() + 1);
+
+				if (bod->internalNeedsJointFeedback())
+				{
+					if (!bod->isUsingRK4Integration())
+					{
+						if (bod->internalNeedsJointFeedback())
+						{
+							bool isConstraintPass = true;
+							bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(m_solverInfo.m_timeStep, m_scratch_r, m_scratch_v, m_scratch_m, isConstraintPass,
+																					  getSolverInfo().m_jointFeedbackInWorldSpace,
+																					  getSolverInfo().m_jointFeedbackInJointFrame);
+						}
+					}
+				}
+			}
+		}
+	}
+
+	for (int i = 0; i < this->m_multiBodies.size(); i++)
+	{
+		btMultiBody* bod = m_multiBodies[i];
+		bod->processDeltaVeeMultiDof2();
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::addSoftBody(btSoftBody* body, int collisionFilterGroup, int collisionFilterMask)
 {
-    m_softBodies.push_back(body);
-    
-    // Set the soft body solver that will deal with this body
-    // to be the world's solver
-    body->setSoftBodySolver(m_deformableBodySolver);
-    
-    btCollisionWorld::addCollisionObject(body,
-                                         collisionFilterGroup,
-                                         collisionFilterMask);
+	m_softBodies.push_back(body);
+
+	// Set the soft body solver that will deal with this body
+	// to be the world's solver
+	body->setSoftBodySolver(m_deformableBodySolver);
+
+	btCollisionWorld::addCollisionObject(body,
+										 collisionFilterGroup,
+										 collisionFilterMask);
 }
 
 void btDeformableMultiBodyDynamicsWorld::predictUnconstraintMotion(btScalar timeStep)
 {
-    BT_PROFILE("predictUnconstraintMotion");
-    btMultiBodyDynamicsWorld::predictUnconstraintMotion(timeStep);
-    m_deformableBodySolver->predictMotion(timeStep);
+	BT_PROFILE("predictUnconstraintMotion");
+	btMultiBodyDynamicsWorld::predictUnconstraintMotion(timeStep);
+	m_deformableBodySolver->predictMotion(timeStep);
 }
 
 void btDeformableMultiBodyDynamicsWorld::reinitialize(btScalar timeStep)
 {
-    m_internalTime += timeStep;
-    m_deformableBodySolver->setImplicit(m_implicit);
-    m_deformableBodySolver->setLineSearch(m_lineSearch);
-    m_deformableBodySolver->reinitialize(m_softBodies, timeStep);
-    btDispatcherInfo& dispatchInfo = btMultiBodyDynamicsWorld::getDispatchInfo();
-    dispatchInfo.m_timeStep = timeStep;
-    dispatchInfo.m_stepCount = 0;
-    dispatchInfo.m_debugDraw = btMultiBodyDynamicsWorld::getDebugDrawer();
-    btMultiBodyDynamicsWorld::getSolverInfo().m_timeStep = timeStep;
-    if (m_useProjection)
-    {
-        m_deformableBodySolver->m_useProjection = true;
-//        m_deformableBodySolver->m_objective->m_projection.m_useStrainLimiting = true;
-        m_deformableBodySolver->m_objective->m_preconditioner =  m_deformableBodySolver->m_objective->m_massPreconditioner;
-    }
-    else
-    {
-        m_deformableBodySolver->m_objective->m_preconditioner =  m_deformableBodySolver->m_objective->m_KKTPreconditioner;
-    }
-        
+	m_internalTime += timeStep;
+	m_deformableBodySolver->setImplicit(m_implicit);
+	m_deformableBodySolver->setLineSearch(m_lineSearch);
+	m_deformableBodySolver->reinitialize(m_softBodies, timeStep);
+	btDispatcherInfo& dispatchInfo = btMultiBodyDynamicsWorld::getDispatchInfo();
+	dispatchInfo.m_timeStep = timeStep;
+	dispatchInfo.m_stepCount = 0;
+	dispatchInfo.m_debugDraw = btMultiBodyDynamicsWorld::getDebugDrawer();
+	btMultiBodyDynamicsWorld::getSolverInfo().m_timeStep = timeStep;
+	if (m_useProjection)
+	{
+		m_deformableBodySolver->m_useProjection = true;
+		m_deformableBodySolver->m_objective->m_projection.m_useStrainLimiting = true;
+		m_deformableBodySolver->m_objective->m_preconditioner = m_deformableBodySolver->m_objective->m_massPreconditioner;
+	}
+	else
+	{
+		m_deformableBodySolver->m_useProjection = false;
+		m_deformableBodySolver->m_objective->m_projection.m_useStrainLimiting = false;
+		m_deformableBodySolver->m_objective->m_preconditioner = m_deformableBodySolver->m_objective->m_KKTPreconditioner;
+	}
 }
 
-
 void btDeformableMultiBodyDynamicsWorld::debugDrawWorld()
 {
-
 	btMultiBodyDynamicsWorld::debugDrawWorld();
 
 	for (int i = 0; i < getSoftBodyArray().size(); i++)
@@ -556,253 +555,260 @@ void btDeformableMultiBodyDynamicsWorld::debugDrawWorld()
 			btSoftBodyHelpers::Draw(psb, getDebugDrawer(), getDrawFlags());
 		}
 	}
-
-	
 }
 
 void btDeformableMultiBodyDynamicsWorld::applyRigidBodyGravity(btScalar timeStep)
 {
-    // Gravity is applied in stepSimulation and then cleared here and then applied here and then cleared here again
-    // so that 1) gravity is applied to velocity before constraint solve and 2) gravity is applied in each substep
-    // when there are multiple substeps
-    btMultiBodyDynamicsWorld::applyGravity();
-    // integrate rigid body gravity
-    for (int i = 0; i < m_nonStaticRigidBodies.size(); ++i)
-    {
-        btRigidBody* rb = m_nonStaticRigidBodies[i];
-        rb->integrateVelocities(timeStep);
-    }
-    
-    // integrate multibody gravity
-    {
-        forwardKinematics();
-        clearMultiBodyConstraintForces();
-        {
-            for (int i = 0; i < this->m_multiBodies.size(); i++)
-            {
-                btMultiBody* bod = m_multiBodies[i];
-                
-                bool isSleeping = false;
-                
-                if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
-                {
-                    isSleeping = true;
-                }
-                for (int b = 0; b < bod->getNumLinks(); b++)
-                {
-                    if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState() == ISLAND_SLEEPING)
-                        isSleeping = true;
-                }
-                
-                if (!isSleeping)
-                {
-                    m_scratch_r.resize(bod->getNumLinks() + 1);
-                    m_scratch_v.resize(bod->getNumLinks() + 1);
-                    m_scratch_m.resize(bod->getNumLinks() + 1);
-                    bool isConstraintPass = false;
-                    {
-                        if (!bod->isUsingRK4Integration())
-                        {
-                            bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(m_solverInfo.m_timeStep,
-                                                                                      m_scratch_r, m_scratch_v, m_scratch_m,isConstraintPass,
-                                                                                      getSolverInfo().m_jointFeedbackInWorldSpace,
-                                                                                      getSolverInfo().m_jointFeedbackInJointFrame);
-                        }
-                        else
-                        {
-                            btAssert(" RK4Integration is not supported" );
-                        }
-                    }
-                }
-            }
-        }
-    }
-    clearGravity();
+	// Gravity is applied in stepSimulation and then cleared here and then applied here and then cleared here again
+	// so that 1) gravity is applied to velocity before constraint solve and 2) gravity is applied in each substep
+	// when there are multiple substeps
+	btMultiBodyDynamicsWorld::applyGravity();
+	// integrate rigid body gravity
+	for (int i = 0; i < m_nonStaticRigidBodies.size(); ++i)
+	{
+		btRigidBody* rb = m_nonStaticRigidBodies[i];
+		rb->integrateVelocities(timeStep);
+	}
+
+	// integrate multibody gravity
+	{
+		forwardKinematics();
+		clearMultiBodyConstraintForces();
+		{
+			for (int i = 0; i < this->m_multiBodies.size(); i++)
+			{
+				btMultiBody* bod = m_multiBodies[i];
+
+				bool isSleeping = false;
+
+				if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+				{
+					isSleeping = true;
+				}
+				for (int b = 0; b < bod->getNumLinks(); b++)
+				{
+					if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState() == ISLAND_SLEEPING)
+						isSleeping = true;
+				}
+
+				if (!isSleeping)
+				{
+					m_scratch_r.resize(bod->getNumLinks() + 1);
+					m_scratch_v.resize(bod->getNumLinks() + 1);
+					m_scratch_m.resize(bod->getNumLinks() + 1);
+					bool isConstraintPass = false;
+					{
+						if (!bod->isUsingRK4Integration())
+						{
+							bod->computeAccelerationsArticulatedBodyAlgorithmMultiDof(m_solverInfo.m_timeStep,
+																					  m_scratch_r, m_scratch_v, m_scratch_m, isConstraintPass,
+																					  getSolverInfo().m_jointFeedbackInWorldSpace,
+																					  getSolverInfo().m_jointFeedbackInJointFrame);
+						}
+						else
+						{
+							btAssert(" RK4Integration is not supported");
+						}
+					}
+				}
+			}
+		}
+	}
+	clearGravity();
 }
 
 void btDeformableMultiBodyDynamicsWorld::clearGravity()
 {
-    BT_PROFILE("btMultiBody clearGravity");
-    // clear rigid body gravity
-    for (int i = 0; i < m_nonStaticRigidBodies.size(); i++)
-    {
-        btRigidBody* body = m_nonStaticRigidBodies[i];
-        if (body->isActive())
-        {
-            body->clearGravity();
-        }
-    }
-    // clear multibody gravity
-    for (int i = 0; i < this->m_multiBodies.size(); i++)
-    {
-        btMultiBody* bod = m_multiBodies[i];
-        
-        bool isSleeping = false;
-        
-        if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
-        {
-            isSleeping = true;
-        }
-        for (int b = 0; b < bod->getNumLinks(); b++)
-        {
-            if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState() == ISLAND_SLEEPING)
-                isSleeping = true;
-        }
-        
-        if (!isSleeping)
-        {
-            bod->addBaseForce(-m_gravity * bod->getBaseMass());
-            
-            for (int j = 0; j < bod->getNumLinks(); ++j)
-            {
-                bod->addLinkForce(j, -m_gravity * bod->getLinkMass(j));
-            }
-        }
-    }
+	BT_PROFILE("btMultiBody clearGravity");
+	// clear rigid body gravity
+	for (int i = 0; i < m_nonStaticRigidBodies.size(); i++)
+	{
+		btRigidBody* body = m_nonStaticRigidBodies[i];
+		if (body->isActive())
+		{
+			body->clearGravity();
+		}
+	}
+	// clear multibody gravity
+	for (int i = 0; i < this->m_multiBodies.size(); i++)
+	{
+		btMultiBody* bod = m_multiBodies[i];
+
+		bool isSleeping = false;
+
+		if (bod->getBaseCollider() && bod->getBaseCollider()->getActivationState() == ISLAND_SLEEPING)
+		{
+			isSleeping = true;
+		}
+		for (int b = 0; b < bod->getNumLinks(); b++)
+		{
+			if (bod->getLink(b).m_collider && bod->getLink(b).m_collider->getActivationState() == ISLAND_SLEEPING)
+				isSleeping = true;
+		}
+
+		if (!isSleeping)
+		{
+			bod->addBaseForce(-m_gravity * bod->getBaseMass());
+
+			for (int j = 0; j < bod->getNumLinks(); ++j)
+			{
+				bod->addLinkForce(j, -m_gravity * bod->getLinkMass(j));
+			}
+		}
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::beforeSolverCallbacks(btScalar timeStep)
 {
-    if (0 != m_internalTickCallback)
-    {
-        (*m_internalTickCallback)(this, timeStep);
-    }
-    
-    if (0 != m_solverCallback)
-    {
-        (*m_solverCallback)(m_internalTime, this);
-    }
+	if (0 != m_internalTickCallback)
+	{
+		(*m_internalTickCallback)(this, timeStep);
+	}
+
+	if (0 != m_solverCallback)
+	{
+		(*m_solverCallback)(m_internalTime, this);
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::afterSolverCallbacks(btScalar timeStep)
 {
-    if (0 != m_solverCallback)
-    {
-        (*m_solverCallback)(m_internalTime, this);
-    }
+	if (0 != m_solverCallback)
+	{
+		(*m_solverCallback)(m_internalTime, this);
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::addForce(btSoftBody* psb, btDeformableLagrangianForce* force)
 {
-    btAlignedObjectArray<btDeformableLagrangianForce*>& forces = m_deformableBodySolver->m_objective->m_lf;
-    bool added = false;
-    for (int i = 0; i < forces.size(); ++i)
-    {
-        if (forces[i]->getForceType() == force->getForceType())
-        {
-            forces[i]->addSoftBody(psb);
-            added = true;
-            break;
-        }
-    }
-    if (!added)
-    {
-        force->addSoftBody(psb);
-        force->setIndices(m_deformableBodySolver->m_objective->getIndices());
-        forces.push_back(force);
-    }
+	btAlignedObjectArray<btDeformableLagrangianForce*>& forces = m_deformableBodySolver->m_objective->m_lf;
+	bool added = false;
+	for (int i = 0; i < forces.size(); ++i)
+	{
+		if (forces[i]->getForceType() == force->getForceType())
+		{
+			forces[i]->addSoftBody(psb);
+			added = true;
+			break;
+		}
+	}
+	if (!added)
+	{
+		force->addSoftBody(psb);
+		force->setIndices(m_deformableBodySolver->m_objective->getIndices());
+		forces.push_back(force);
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::removeForce(btSoftBody* psb, btDeformableLagrangianForce* force)
 {
-    btAlignedObjectArray<btDeformableLagrangianForce*>& forces = m_deformableBodySolver->m_objective->m_lf;
-    int removed_index = -1;
-    for (int i = 0; i < forces.size(); ++i)
-    {
-        if (forces[i]->getForceType() == force->getForceType())
-        {
-            forces[i]->removeSoftBody(psb);
-            if (forces[i]->m_softBodies.size() == 0)
-                removed_index = i;
-            break;
-        }
-    }
-    if (removed_index >= 0)
-        forces.removeAtIndex(removed_index);
+	btAlignedObjectArray<btDeformableLagrangianForce*>& forces = m_deformableBodySolver->m_objective->m_lf;
+	int removed_index = -1;  // -1 means "no list entry needs to be dropped"
+	for (int i = 0; i < forces.size(); ++i)
+	{
+		if (forces[i]->getForceType() == force->getForceType())  // matched by type, not by pointer identity
+		{
+			forces[i]->removeSoftBody(psb);
+			if (forces[i]->m_softBodies.size() == 0)  // the force no longer has any soft body attached
+				removed_index = i;
+			break;  // only the first entry with a matching type is considered
+		}
+	}
+	if (removed_index >= 0)
+		forces.removeAtIndex(removed_index);  // drops the pointer from the list; no delete is performed here
+}
+
+void btDeformableMultiBodyDynamicsWorld::removeSoftBodyForce(btSoftBody* psb)
+{
+	btAlignedObjectArray<btDeformableLagrangianForce*>& forces = m_deformableBodySolver->m_objective->m_lf;
+	for (int i = 0; i < forces.size(); ++i)
+	{
+		forces[i]->removeSoftBody(psb);  // applied to every force regardless of type; unlike removeForce(), entries are never dropped from the list
+	}
 }
 
 void btDeformableMultiBodyDynamicsWorld::removeSoftBody(btSoftBody* body)
 {
-    m_softBodies.remove(body);
-    btCollisionWorld::removeCollisionObject(body);
-    // force a reinitialize so that node indices get updated.
-    m_deformableBodySolver->reinitialize(m_softBodies, btScalar(-1));
+	removeSoftBodyForce(body);  // new in this sync: detach the body from every registered force before removing it
+	m_softBodies.remove(body);
+	btCollisionWorld::removeCollisionObject(body);  // explicitly qualified call to the btCollisionWorld implementation
+	// force a reinitialize so that node indices get updated.
+	m_deformableBodySolver->reinitialize(m_softBodies, btScalar(-1));
 }
 
 void btDeformableMultiBodyDynamicsWorld::removeCollisionObject(btCollisionObject* collisionObject)
 {
-    btSoftBody* body = btSoftBody::upcast(collisionObject);
-    if (body)
-        removeSoftBody(body);
-    else
-        btDiscreteDynamicsWorld::removeCollisionObject(collisionObject);
+	btSoftBody* body = btSoftBody::upcast(collisionObject);  // a null result is treated as "not a soft body"
+	if (body)
+		removeSoftBody(body);  // soft bodies take the deformable-specific removal path
+	else
+		btDiscreteDynamicsWorld::removeCollisionObject(collisionObject);  // everything else is handled by btDiscreteDynamicsWorld
 }
 
-
 int btDeformableMultiBodyDynamicsWorld::stepSimulation(btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep)
 {
-    startProfiling(timeStep);
-    
-    int numSimulationSubSteps = 0;
-    
-    if (maxSubSteps)
-    {
-        //fixed timestep with interpolation
-        m_fixedTimeStep = fixedTimeStep;
-        m_localTime += timeStep;
-        if (m_localTime >= fixedTimeStep)
-        {
-            numSimulationSubSteps = int(m_localTime / fixedTimeStep);
-            m_localTime -= numSimulationSubSteps * fixedTimeStep;
-        }
-    }
-    else
-    {
-        //variable timestep
-        fixedTimeStep = timeStep;
-        m_localTime = m_latencyMotionStateInterpolation ? 0 : timeStep;
-        m_fixedTimeStep = 0;
-        if (btFuzzyZero(timeStep))
-        {
-            numSimulationSubSteps = 0;
-            maxSubSteps = 0;
-        }
-        else
-        {
-            numSimulationSubSteps = 1;
-            maxSubSteps = 1;
-        }
-    }
-    
-    //process some debugging flags
-    if (getDebugDrawer())
-    {
-        btIDebugDraw* debugDrawer = getDebugDrawer();
-        gDisableDeactivation = (debugDrawer->getDebugMode() & btIDebugDraw::DBG_NoDeactivation) != 0;
-    }
-    if (numSimulationSubSteps)
-    {
-        //clamp the number of substeps, to prevent simulation grinding spiralling down to a halt
-        int clampedSimulationSteps = (numSimulationSubSteps > maxSubSteps) ? maxSubSteps : numSimulationSubSteps;
-        
-        saveKinematicState(fixedTimeStep * clampedSimulationSteps);
-        
-        for (int i = 0; i < clampedSimulationSteps; i++)
-        {
-            internalSingleStepSimulation(fixedTimeStep);
-            synchronizeMotionStates();
-        }
-    }
-    else
-    {
-        synchronizeMotionStates();
-    }
-    
-    clearForces();
-    
+	startProfiling(timeStep);
+
+	int numSimulationSubSteps = 0;
+
+	if (maxSubSteps)  // non-zero maxSubSteps selects the fixed-step path; zero selects the variable-step path
+	{
+		//fixed timestep with interpolation
+		m_fixedTimeStep = fixedTimeStep;
+		m_localTime += timeStep;  // accumulate elapsed time across calls
+		if (m_localTime >= fixedTimeStep)
+		{
+			numSimulationSubSteps = int(m_localTime / fixedTimeStep);  // whole fixed steps contained in the accumulated time
+			m_localTime -= numSimulationSubSteps * fixedTimeStep;  // the remainder stays in m_localTime for the next call
+		}
+	}
+	else
+	{
+		//variable timestep
+		fixedTimeStep = timeStep;  // the caller's step is used directly as the single step size
+		m_localTime = m_latencyMotionStateInterpolation ? 0 : timeStep;
+		m_fixedTimeStep = 0;
+		if (btFuzzyZero(timeStep))  // a (near-)zero step runs no simulation at all
+		{
+			numSimulationSubSteps = 0;
+			maxSubSteps = 0;
+		}
+		else
+		{
+			numSimulationSubSteps = 1;
+			maxSubSteps = 1;
+		}
+	}
+
+	//process some debugging flags
+	if (getDebugDrawer())
+	{
+		btIDebugDraw* debugDrawer = getDebugDrawer();
+		gDisableDeactivation = (debugDrawer->getDebugMode() & btIDebugDraw::DBG_NoDeactivation) != 0;
+	}
+	if (numSimulationSubSteps)
+	{
+		//clamp the number of substeps, to prevent simulation grinding spiralling down to a halt
+		int clampedSimulationSteps = (numSimulationSubSteps > maxSubSteps) ? maxSubSteps : numSimulationSubSteps;
+
+		saveKinematicState(fixedTimeStep * clampedSimulationSteps);  // argument is the total time the loop below will simulate
+
+		for (int i = 0; i < clampedSimulationSteps; i++)
+		{
+			internalSingleStepSimulation(fixedTimeStep);
+			synchronizeMotionStates();  // motion states are refreshed after every sub-step
+		}
+	}
+	else
+	{
+		synchronizeMotionStates();  // no sub-step ran, but motion states are still synchronized
+	}
+
+	clearForces();
+
 #ifndef BT_NO_PROFILE
-    CProfileManager::Increment_Frame_Counter();
+	CProfileManager::Increment_Frame_Counter();
 #endif  //BT_NO_PROFILE
-    
-    return numSimulationSubSteps;
+
+	return numSimulationSubSteps;  // the unclamped count: may be larger than the number of steps actually run
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.h b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.h
index 76b58a0378..4b7069aac7 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableMultiBodyDynamicsWorld.h
@@ -36,185 +36,192 @@ typedef btAlignedObjectArray<btSoftBody*> btSoftBodyArray;
 
 class btDeformableMultiBodyDynamicsWorld : public btMultiBodyDynamicsWorld
 {
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    ///Solver classes that encapsulate multiple deformable bodies for solving
-    btDeformableBodySolver* m_deformableBodySolver;
-    btSoftBodyArray m_softBodies;
-    int m_drawFlags;
-    bool m_drawNodeTree;
-    bool m_drawFaceTree;
-    bool m_drawClusterTree;
-    btSoftBodyWorldInfo m_sbi;
-    btScalar m_internalTime;
-    int m_ccdIterations;
-    bool m_implicit;
-    bool m_lineSearch;
-    bool m_useProjection;
-    DeformableBodyInplaceSolverIslandCallback* m_solverDeformableBodyIslandCallback;
-    
-    typedef void (*btSolverCallback)(btScalar time, btDeformableMultiBodyDynamicsWorld* world);
-    btSolverCallback m_solverCallback;
-    
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	///Solver classes that encapsulate multiple deformable bodies for solving
+	btDeformableBodySolver* m_deformableBodySolver;
+	btSoftBodyArray m_softBodies;
+	int m_drawFlags;
+	bool m_drawNodeTree;
+	bool m_drawFaceTree;
+	bool m_drawClusterTree;
+	btSoftBodyWorldInfo m_sbi;
+	btScalar m_internalTime;
+	int m_ccdIterations;
+	bool m_implicit;
+	bool m_lineSearch;
+	bool m_useProjection;
+	DeformableBodyInplaceSolverIslandCallback* m_solverDeformableBodyIslandCallback;
+
+	typedef void (*btSolverCallback)(btScalar time, btDeformableMultiBodyDynamicsWorld* world);
+	btSolverCallback m_solverCallback;
+
 protected:
-    virtual void internalSingleStepSimulation(btScalar timeStep);
-    
-    virtual void integrateTransforms(btScalar timeStep);
-    
-    void positionCorrection(btScalar timeStep);
-    
-    void solveConstraints(btScalar timeStep);
-    
-    void updateActivationState(btScalar timeStep);
-    
-    void clearGravity();
-    
+	virtual void internalSingleStepSimulation(btScalar timeStep);
+
+	virtual void integrateTransforms(btScalar timeStep);
+
+	void positionCorrection(btScalar timeStep);
+
+	void solveConstraints(btScalar timeStep);
+
+	void updateActivationState(btScalar timeStep);
+
+	void clearGravity();
+
 public:
 	btDeformableMultiBodyDynamicsWorld(btDispatcher* dispatcher, btBroadphaseInterface* pairCache, btDeformableMultiBodyConstraintSolver* constraintSolver, btCollisionConfiguration* collisionConfiguration, btDeformableBodySolver* deformableBodySolver = 0);
 
-    virtual int stepSimulation(btScalar timeStep, int maxSubSteps = 1, btScalar fixedTimeStep = btScalar(1.) / btScalar(60.));
+	virtual int stepSimulation(btScalar timeStep, int maxSubSteps = 1, btScalar fixedTimeStep = btScalar(1.) / btScalar(60.));
 
 	virtual void debugDrawWorld();
 
-    void setSolverCallback(btSolverCallback cb)
-    {
-        m_solverCallback = cb;
-    }
-    
-    virtual ~btDeformableMultiBodyDynamicsWorld();
-    
-    virtual btMultiBodyDynamicsWorld* getMultiBodyDynamicsWorld()
-    {
-        return (btMultiBodyDynamicsWorld*)(this);
-    }
-    
-    virtual const btMultiBodyDynamicsWorld* getMultiBodyDynamicsWorld() const
-    {
-        return (const btMultiBodyDynamicsWorld*)(this);
-    }
-    
-    virtual btDynamicsWorldType getWorldType() const
-    {
-        return BT_DEFORMABLE_MULTIBODY_DYNAMICS_WORLD;
-    }
-    
-    virtual void predictUnconstraintMotion(btScalar timeStep);
-    
-    virtual void addSoftBody(btSoftBody* body, int collisionFilterGroup = btBroadphaseProxy::DefaultFilter, int collisionFilterMask = btBroadphaseProxy::AllFilter);
-    
-    btSoftBodyArray& getSoftBodyArray()
-    {
-        return m_softBodies;
-    }
-    
-    const btSoftBodyArray& getSoftBodyArray() const
-    {
-        return m_softBodies;
-    }
-    
-    btSoftBodyWorldInfo& getWorldInfo()
-    {
-        return m_sbi;
-    }
-    
-    const btSoftBodyWorldInfo& getWorldInfo() const
-    {
-        return m_sbi;
-    }
-    
-    void reinitialize(btScalar timeStep);
-    
-    void applyRigidBodyGravity(btScalar timeStep);
-    
-    void beforeSolverCallbacks(btScalar timeStep);
-    
-    void afterSolverCallbacks(btScalar timeStep);
-    
-    void addForce(btSoftBody* psb, btDeformableLagrangianForce* force);
-    
-    void removeForce(btSoftBody* psb, btDeformableLagrangianForce* force);
-    
-    void removeSoftBody(btSoftBody* body);
-    
-    void removeCollisionObject(btCollisionObject* collisionObject);
-    
-    int getDrawFlags() const { return (m_drawFlags); }
-    void setDrawFlags(int f) { m_drawFlags = f; }
-    
-    void setupConstraints();
-    
-    void performDeformableCollisionDetection();
-    
-    void solveMultiBodyConstraints();
-    
-    void solveContactConstraints();
-    
-    void sortConstraints();
-    
-    void softBodySelfCollision();
-    
-    void setImplicit(bool implicit)
-    {
-        m_implicit = implicit;
-    }
-    
-    void setLineSearch(bool lineSearch)
-    {
-        m_lineSearch = lineSearch;
-    }
-    
-    void applyRepulsionForce(btScalar timeStep);
-    
-    void performGeometricCollisions(btScalar timeStep);
-    
-    struct btDeformableSingleRayCallback : public btBroadphaseRayCallback
-    {
-        btVector3 m_rayFromWorld;
-        btVector3 m_rayToWorld;
-        btTransform m_rayFromTrans;
-        btTransform m_rayToTrans;
-        btVector3 m_hitNormal;
-        
-        const btDeformableMultiBodyDynamicsWorld* m_world;
-        btCollisionWorld::RayResultCallback& m_resultCallback;
-        
-        btDeformableSingleRayCallback(const btVector3& rayFromWorld, const btVector3& rayToWorld, const btDeformableMultiBodyDynamicsWorld* world, btCollisionWorld::RayResultCallback& resultCallback)
-        : m_rayFromWorld(rayFromWorld),
-        m_rayToWorld(rayToWorld),
-        m_world(world),
-        m_resultCallback(resultCallback)
-        {
-            m_rayFromTrans.setIdentity();
-            m_rayFromTrans.setOrigin(m_rayFromWorld);
-            m_rayToTrans.setIdentity();
-            m_rayToTrans.setOrigin(m_rayToWorld);
-            
-            btVector3 rayDir = (rayToWorld - rayFromWorld);
-            
-            rayDir.normalize();
-            ///what about division by zero? --> just set rayDirection[i] to INF/1e30
-            m_rayDirectionInverse[0] = rayDir[0] == btScalar(0.0) ? btScalar(1e30) : btScalar(1.0) / rayDir[0];
-            m_rayDirectionInverse[1] = rayDir[1] == btScalar(0.0) ? btScalar(1e30) : btScalar(1.0) / rayDir[1];
-            m_rayDirectionInverse[2] = rayDir[2] == btScalar(0.0) ? btScalar(1e30) : btScalar(1.0) / rayDir[2];
-            m_signs[0] = m_rayDirectionInverse[0] < 0.0;
-            m_signs[1] = m_rayDirectionInverse[1] < 0.0;
-            m_signs[2] = m_rayDirectionInverse[2] < 0.0;
-            
-            m_lambda_max = rayDir.dot(m_rayToWorld - m_rayFromWorld);
-        }
-        
-        virtual bool process(const btBroadphaseProxy* proxy)
-        {
-            ///terminate further ray tests, once the closestHitFraction reached zero
-            if (m_resultCallback.m_closestHitFraction == btScalar(0.f))
-                return false;
-            
-            btCollisionObject* collisionObject = (btCollisionObject*)proxy->m_clientObject;
-            
-            //only perform raycast if filterMask matches
-            if (m_resultCallback.needsCollision(collisionObject->getBroadphaseHandle()))
-            {
-                //RigidcollisionObject* collisionObject = ctrl->GetRigidcollisionObject();
-                //btVector3 collisionObjectAabbMin,collisionObjectAabbMax;
+	void setSolverCallback(btSolverCallback cb)
+	{
+		m_solverCallback = cb;  // plain function pointer (btSolverCallback typedef); only invoked when non-null
+	}
+
+	virtual ~btDeformableMultiBodyDynamicsWorld();
+
+	virtual btMultiBodyDynamicsWorld* getMultiBodyDynamicsWorld()
+	{
+		return (btMultiBodyDynamicsWorld*)(this);
+	}
+
+	virtual const btMultiBodyDynamicsWorld* getMultiBodyDynamicsWorld() const
+	{
+		return (const btMultiBodyDynamicsWorld*)(this);
+	}
+
+	virtual btDynamicsWorldType getWorldType() const
+	{
+		return BT_DEFORMABLE_MULTIBODY_DYNAMICS_WORLD;
+	}
+
+	virtual void predictUnconstraintMotion(btScalar timeStep);
+
+	virtual void addSoftBody(btSoftBody* body, int collisionFilterGroup = btBroadphaseProxy::DefaultFilter, int collisionFilterMask = btBroadphaseProxy::AllFilter);
+
+	btSoftBodyArray& getSoftBodyArray()
+	{
+		return m_softBodies;
+	}
+
+	const btSoftBodyArray& getSoftBodyArray() const
+	{
+		return m_softBodies;
+	}
+
+	btSoftBodyWorldInfo& getWorldInfo()
+	{
+		return m_sbi;
+	}
+
+	const btSoftBodyWorldInfo& getWorldInfo() const
+	{
+		return m_sbi;
+	}
+
+	void reinitialize(btScalar timeStep);
+
+	void applyRigidBodyGravity(btScalar timeStep);
+
+	void beforeSolverCallbacks(btScalar timeStep);
+
+	void afterSolverCallbacks(btScalar timeStep);
+
+	void addForce(btSoftBody* psb, btDeformableLagrangianForce* force);
+
+	void removeForce(btSoftBody* psb, btDeformableLagrangianForce* force);
+
+	void removeSoftBodyForce(btSoftBody* psb);
+
+	void removeSoftBody(btSoftBody* body);
+
+	void removeCollisionObject(btCollisionObject* collisionObject);
+
+	int getDrawFlags() const { return (m_drawFlags); }
+	void setDrawFlags(int f) { m_drawFlags = f; }
+
+	void setupConstraints();
+
+	void performDeformableCollisionDetection();
+
+	void solveMultiBodyConstraints();
+
+	void solveContactConstraints();
+
+	void sortConstraints();
+
+	void softBodySelfCollision();
+
+	void setImplicit(bool implicit)
+	{
+		m_implicit = implicit;
+	}
+
+	void setLineSearch(bool lineSearch)
+	{
+		m_lineSearch = lineSearch;
+	}
+
+	void setUseProjection(bool useProjection)
+	{
+		m_useProjection = useProjection;  // NOTE(review): where this flag is consumed is not visible in this hunk; confirm in the .cpp
+	}
+
+	void applyRepulsionForce(btScalar timeStep);
+
+	void performGeometricCollisions(btScalar timeStep);
+
+	struct btDeformableSingleRayCallback : public btBroadphaseRayCallback
+	{
+		btVector3 m_rayFromWorld;
+		btVector3 m_rayToWorld;
+		btTransform m_rayFromTrans;
+		btTransform m_rayToTrans;
+		btVector3 m_hitNormal;
+
+		const btDeformableMultiBodyDynamicsWorld* m_world;
+		btCollisionWorld::RayResultCallback& m_resultCallback;
+
+		btDeformableSingleRayCallback(const btVector3& rayFromWorld, const btVector3& rayToWorld, const btDeformableMultiBodyDynamicsWorld* world, btCollisionWorld::RayResultCallback& resultCallback)
+			: m_rayFromWorld(rayFromWorld),
+			  m_rayToWorld(rayToWorld),
+			  m_world(world),
+			  m_resultCallback(resultCallback)
+		{
+			m_rayFromTrans.setIdentity();
+			m_rayFromTrans.setOrigin(m_rayFromWorld);
+			m_rayToTrans.setIdentity();
+			m_rayToTrans.setOrigin(m_rayToWorld);
+
+			btVector3 rayDir = (rayToWorld - rayFromWorld);
+
+			rayDir.normalize();
+			///what about division by zero? --> just set rayDirection[i] to INF/1e30
+			m_rayDirectionInverse[0] = rayDir[0] == btScalar(0.0) ? btScalar(1e30) : btScalar(1.0) / rayDir[0];
+			m_rayDirectionInverse[1] = rayDir[1] == btScalar(0.0) ? btScalar(1e30) : btScalar(1.0) / rayDir[1];
+			m_rayDirectionInverse[2] = rayDir[2] == btScalar(0.0) ? btScalar(1e30) : btScalar(1.0) / rayDir[2];
+			m_signs[0] = m_rayDirectionInverse[0] < 0.0;
+			m_signs[1] = m_rayDirectionInverse[1] < 0.0;
+			m_signs[2] = m_rayDirectionInverse[2] < 0.0;
+
+			m_lambda_max = rayDir.dot(m_rayToWorld - m_rayFromWorld);
+		}
+
+		virtual bool process(const btBroadphaseProxy* proxy)
+		{
+			///terminate further ray tests, once the closestHitFraction reached zero
+			if (m_resultCallback.m_closestHitFraction == btScalar(0.f))
+				return false;
+
+			btCollisionObject* collisionObject = (btCollisionObject*)proxy->m_clientObject;
+
+			//only perform raycast if filterMask matches
+			if (m_resultCallback.needsCollision(collisionObject->getBroadphaseHandle()))
+			{
+				//RigidcollisionObject* collisionObject = ctrl->GetRigidcollisionObject();
+				//btVector3 collisionObjectAabbMin,collisionObjectAabbMax;
 #if 0
 #ifdef RECALCULATE_AABB
                 btVector3 collisionObjectAabbMin,collisionObjectAabbMax;
@@ -225,87 +232,85 @@ public:
                 const btVector3& collisionObjectAabbMax = collisionObject->getBroadphaseHandle()->m_aabbMax;
 #endif
 #endif
-                //btScalar hitLambda = m_resultCallback.m_closestHitFraction;
-                //culling already done by broadphase
-                //if (btRayAabb(m_rayFromWorld,m_rayToWorld,collisionObjectAabbMin,collisionObjectAabbMax,hitLambda,m_hitNormal))
-                {
-                    m_world->rayTestSingle(m_rayFromTrans, m_rayToTrans,
-                                           collisionObject,
-                                           collisionObject->getCollisionShape(),
-                                           collisionObject->getWorldTransform(),
-                                           m_resultCallback);
-                }
-            }
-            return true;
-        }
-    };
-
-    
-    
-    void rayTest(const btVector3& rayFromWorld, const btVector3& rayToWorld, RayResultCallback& resultCallback) const
-    {
-        BT_PROFILE("rayTest");
-        /// use the broadphase to accelerate the search for objects, based on their aabb
-        /// and for each object with ray-aabb overlap, perform an exact ray test
-        btDeformableSingleRayCallback rayCB(rayFromWorld, rayToWorld, this, resultCallback);
-        
+				//btScalar hitLambda = m_resultCallback.m_closestHitFraction;
+				//culling already done by broadphase
+				//if (btRayAabb(m_rayFromWorld,m_rayToWorld,collisionObjectAabbMin,collisionObjectAabbMax,hitLambda,m_hitNormal))
+				{
+					m_world->rayTestSingle(m_rayFromTrans, m_rayToTrans,
+										   collisionObject,
+										   collisionObject->getCollisionShape(),
+										   collisionObject->getWorldTransform(),
+										   m_resultCallback);
+				}
+			}
+			return true;
+		}
+	};
+
+	void rayTest(const btVector3& rayFromWorld, const btVector3& rayToWorld, RayResultCallback& resultCallback) const
+	{
+		BT_PROFILE("rayTest");
+		/// use the broadphase to accelerate the search for objects, based on their aabb
+		/// and for each object with ray-aabb overlap, perform an exact ray test
+		btDeformableSingleRayCallback rayCB(rayFromWorld, rayToWorld, this, resultCallback);  // the callback keeps this world so process() can call rayTestSingle() on it
+
 #ifndef USE_BRUTEFORCE_RAYBROADPHASE
-        m_broadphasePairCache->rayTest(rayFromWorld, rayToWorld, rayCB);
+		m_broadphasePairCache->rayTest(rayFromWorld, rayToWorld, rayCB);
 #else
-        for (int i = 0; i < this->getNumCollisionObjects(); i++)
-        {
-            rayCB.process(m_collisionObjects[i]->getBroadphaseHandle());
-        }
+		for (int i = 0; i < this->getNumCollisionObjects(); i++)
+		{
+			rayCB.process(m_collisionObjects[i]->getBroadphaseHandle());  // brute-force build: every collision object is handed to the callback
+		}
 #endif  //USE_BRUTEFORCE_RAYBROADPHASE
-    }
-    
-    void rayTestSingle(const btTransform& rayFromTrans, const btTransform& rayToTrans,
-                                                     btCollisionObject* collisionObject,
-                                                     const btCollisionShape* collisionShape,
-                                                     const btTransform& colObjWorldTransform,
-                                                     RayResultCallback& resultCallback) const
-    {
-        if (collisionShape->isSoftBody())
-        {
-            btSoftBody* softBody = btSoftBody::upcast(collisionObject);
-            if (softBody)
-            {
-                btSoftBody::sRayCast softResult;
-                if (softBody->rayFaceTest(rayFromTrans.getOrigin(), rayToTrans.getOrigin(), softResult))
-                {
-                    if (softResult.fraction <= resultCallback.m_closestHitFraction)
-                    {
-                        btCollisionWorld::LocalShapeInfo shapeInfo;
-                        shapeInfo.m_shapePart = 0;
-                        shapeInfo.m_triangleIndex = softResult.index;
-                        // get the normal
-                        btVector3 rayDir = rayToTrans.getOrigin() - rayFromTrans.getOrigin();
-                        btVector3 normal = -rayDir;
-                        normal.normalize();
-                        {
-                            normal = softBody->m_faces[softResult.index].m_normal;
-                            if (normal.dot(rayDir) > 0)
-                            {
-                                // normal always point toward origin of the ray
-                                normal = -normal;
-                            }
-                        }
-                        
-                        btCollisionWorld::LocalRayResult rayResult(collisionObject,
-                                                                   &shapeInfo,
-                                                                   normal,
-                                                                   softResult.fraction);
-                        bool normalInWorldSpace = true;
-                        resultCallback.addSingleResult(rayResult, normalInWorldSpace);
-                    }
-                }
-            }
-        }
-        else
-        {
-            btCollisionWorld::rayTestSingle(rayFromTrans, rayToTrans, collisionObject, collisionShape, colObjWorldTransform, resultCallback);
-        }
-    }
+	}
+
+	void rayTestSingle(const btTransform& rayFromTrans, const btTransform& rayToTrans,
+					   btCollisionObject* collisionObject,
+					   const btCollisionShape* collisionShape,
+					   const btTransform& colObjWorldTransform,
+					   RayResultCallback& resultCallback) const
+	{
+		if (collisionShape->isSoftBody())
+		{
+			btSoftBody* softBody = btSoftBody::upcast(collisionObject);
+			if (softBody)  // nothing is reported if the upcast fails
+			{
+				btSoftBody::sRayCast softResult;
+				if (softBody->rayFaceTest(rayFromTrans.getOrigin(), rayToTrans.getOrigin(), softResult))  // only the transforms' origins are used for the soft-body test
+				{
+					if (softResult.fraction <= resultCallback.m_closestHitFraction)  // skip hits farther away than the closest one recorded so far
+					{
+						btCollisionWorld::LocalShapeInfo shapeInfo;
+						shapeInfo.m_shapePart = 0;
+						shapeInfo.m_triangleIndex = softResult.index;  // same index is used for the m_faces lookup below
+						// get the normal
+						btVector3 rayDir = rayToTrans.getOrigin() - rayFromTrans.getOrigin();
+						btVector3 normal = -rayDir;  // provisional value; overwritten with the face normal just below
+						normal.normalize();
+						{
+							normal = softBody->m_faces[softResult.index].m_normal;
+							if (normal.dot(rayDir) > 0)
+							{
+								// normal always point toward origin of the ray
+								normal = -normal;
+							}
+						}
+
+						btCollisionWorld::LocalRayResult rayResult(collisionObject,
+																   &shapeInfo,
+																   normal,
+																   softResult.fraction);
+						bool normalInWorldSpace = true;
+						resultCallback.addSingleResult(rayResult, normalInWorldSpace);
+					}
+				}
+			}
+		}
+		else
+		{
+			btCollisionWorld::rayTestSingle(rayFromTrans, rayToTrans, collisionObject, collisionShape, colObjWorldTransform, resultCallback);  // shapes that are not soft bodies use the btCollisionWorld implementation
+		}
+	}
 };
 
 #endif  //BT_DEFORMABLE_MULTIBODY_DYNAMICS_WORLD_H
diff --git a/thirdparty/bullet/BulletSoftBody/btDeformableNeoHookeanForce.h b/thirdparty/bullet/BulletSoftBody/btDeformableNeoHookeanForce.h
index d89bc4aca4..60798c5bcd 100644
--- a/thirdparty/bullet/BulletSoftBody/btDeformableNeoHookeanForce.h
+++ b/thirdparty/bullet/BulletSoftBody/btDeformableNeoHookeanForce.h
@@ -23,30 +23,30 @@ subject to the following restrictions:
 class btDeformableNeoHookeanForce : public btDeformableLagrangianForce
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-	btScalar m_mu, m_lambda; // Lame Parameters
-	btScalar m_E, m_nu;  // Young's modulus and Poisson ratio
-    btScalar m_mu_damp, m_lambda_damp;
-    btDeformableNeoHookeanForce(): m_mu(1), m_lambda(1)
-    {
-        btScalar damping = 0.05;
-        m_mu_damp = damping * m_mu;
-        m_lambda_damp = damping * m_lambda;
+	typedef btAlignedObjectArray<btVector3> TVStack;
+	btScalar m_mu, m_lambda;  // Lame Parameters
+	btScalar m_E, m_nu;       // Young's modulus and Poisson ratio
+	btScalar m_mu_damp, m_lambda_damp;
+	btDeformableNeoHookeanForce() : m_mu(1), m_lambda(1)
+	{
+		btScalar damping = 0.05;  // same value as the default damping argument of the (mu, lambda, damping) constructor
+		m_mu_damp = damping * m_mu;
+		m_lambda_damp = damping * m_lambda;
 		updateYoungsModulusAndPoissonRatio();
-    }
-    
-    btDeformableNeoHookeanForce(btScalar mu, btScalar lambda, btScalar damping = 0.05): m_mu(mu), m_lambda(lambda)
-    {
-        m_mu_damp = damping * m_mu;
-        m_lambda_damp = damping * m_lambda;
+	}
+
+	btDeformableNeoHookeanForce(btScalar mu, btScalar lambda, btScalar damping = 0.05) : m_mu(mu), m_lambda(lambda)
+	{
+		m_mu_damp = damping * m_mu;
+		m_lambda_damp = damping * m_lambda;
 		updateYoungsModulusAndPoissonRatio();
-    }
+	}
 
 	void updateYoungsModulusAndPoissonRatio()
 	{
 		// conversion from Lame Parameters to Young's modulus and Poisson ratio
 		// https://en.wikipedia.org/wiki/Lam%C3%A9_parameters
-		m_E  = m_mu * (3*m_lambda + 2*m_mu)/(m_lambda + m_mu);
+		m_E = m_mu * (3 * m_lambda + 2 * m_mu) / (m_lambda + m_mu);  // no guard here: divides by zero when m_lambda + m_mu == 0
 		m_nu = m_lambda * 0.5 / (m_mu + m_lambda);
 	}
 
@@ -55,21 +55,21 @@ public:
 		// conversion from Young's modulus and Poisson ratio to Lame Parameters
 		// https://en.wikipedia.org/wiki/Lam%C3%A9_parameters
 		m_mu = m_E * 0.5 / (1 + m_nu);
-		m_lambda = m_E * m_nu / ((1 + m_nu) * (1- 2*m_nu));
+		m_lambda = m_E * m_nu / ((1 + m_nu) * (1 - 2 * m_nu));
 	}
 
-    void setYoungsModulus(btScalar E)
-    {
+	void setYoungsModulus(btScalar E)
+	{
 		m_E = E;
 		updateLameParameters();
-    }
+	}
 
 	void setPoissonRatio(btScalar nu)
 	{
 		m_nu = nu;
 		updateLameParameters();
 	}
-	
+
 	void setDamping(btScalar damping)
 	{
 		m_mu_damp = damping * m_mu;
@@ -83,339 +83,338 @@ public:
 		updateYoungsModulusAndPoissonRatio();
 	}
 
-    virtual void addScaledForces(btScalar scale, TVStack& force)
-    {
-        addScaledDampingForce(scale, force);
-        addScaledElasticForce(scale, force);
-    }
-    
-    virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
-    {
-        addScaledElasticForce(scale, force);
-    }
-    
-    // The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
-    virtual void addScaledDampingForce(btScalar scale, TVStack& force)
-    {
-        if (m_mu_damp == 0 && m_lambda_damp == 0)
-            return;
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                btMatrix3x3 dF = DsFromVelocity(node0, node1, node2, node3) * tetra.m_Dm_inverse;
-                btMatrix3x3 I;
-                I.setIdentity();
-                btMatrix3x3 dP = (dF + dF.transpose()) * m_mu_damp + I * (dF[0][0]+dF[1][1]+dF[2][2]) * m_lambda_damp;
-//                firstPiolaDampingDifferential(psb->m_tetraScratchesTn[j], dF, dP);
-                btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
+	virtual void addScaledForces(btScalar scale, TVStack& force)  // accumulates the damping and then the elastic contribution into force, both scaled by scale
+	{
+		addScaledDampingForce(scale, force);
+		addScaledElasticForce(scale, force);
+	}
+	// Explicit part of the force: only the elastic contribution is accumulated into force, the damping part is left out.
+	virtual void addScaledExplicitForce(btScalar scale, TVStack& force)
+	{
+		addScaledElasticForce(scale, force);
+	}
+
+	// Accumulates (via -=) the damping contribution of every tetra of every active soft body into force, scaled by scale. The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
+	virtual void addScaledDampingForce(btScalar scale, TVStack& force)
+	{
+		if (m_mu_damp == 0 && m_lambda_damp == 0)
+			return;
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+				btMatrix3x3 dF = DsFromVelocity(node0, node1, node2, node3) * tetra.m_Dm_inverse;
+				btMatrix3x3 I;
+				I.setIdentity();
+				btMatrix3x3 dP = (dF + dF.transpose()) * m_mu_damp + I * (dF[0][0] + dF[1][1] + dF[2][2]) * m_lambda_damp;
+				// dP = m_mu_damp * (dF + dF^T) + m_lambda_damp * trace(dF) * I; firstPiolaDampingDifferential() is not used here.
+				btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose() * grad_N_hat_1st_col);
+				btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
+				// node 0 takes dP * Dm^-T * (-1, -1, -1); nodes 1..3 take columns 0..2 of dP * Dm^-T
+				// each contribution is weighted by scale * m_element_measure and subtracted from force
+				btScalar scale1 = scale * tetra.m_element_measure;
+				force[id0] -= scale1 * df_on_node0;
+				force[id1] -= scale1 * df_on_node123.getColumn(0);
+				force[id2] -= scale1 * df_on_node123.getColumn(1);
+				force[id3] -= scale1 * df_on_node123.getColumn(2);
+			}
+		}
+	}
+
+	// Sums m_element_measure * elasticEnergyDensity() over the tetras of every active soft body; dt is not used.
+	virtual double totalElasticEnergy(btScalar dt)
+	{
+		double total = 0;
+		for (int b = 0; b < m_softBodies.size(); ++b)
+		{
+			btSoftBody* body = m_softBodies[b];
+			if (body->isActive())
+			{
+				// assumes m_tetraScratches[t] belongs to m_tetras[t]: both arrays are walked with the same index
+				for (int t = 0; t < body->m_tetraScratches.size(); ++t)
+				{
+					const btSoftBody::TetraScratch& scratch = body->m_tetraScratches[t];
+					total += body->m_tetras[t].m_element_measure * elasticEnergyDensity(scratch);
+				}
+			}
+		}
+		return total;
+	}
+
+	// Damping energy, formulated as in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search:
+	// the negated sum over all nodes of (0.5 * damping force on the node) . m_v / dt.
+	virtual double totalDampingEnergy(btScalar dt)
+	{
+		double energy = 0;
+		int sz = 0;
+		// Size the scratch array from the nodes of ALL soft bodies, not only the active ones: the
+		// accumulation loop below reads dampingForce[node.index] for every body, so sizing from the
+		// active bodies alone could leave the node indices of an inactive body out of range.
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				sz = btMax(sz, psb->m_nodes[j].index);
+			}
+		}
+		TVStack dampingForce;
+		dampingForce.resize(sz + 1);
+		for (int i = 0; i < dampingForce.size(); ++i)
+			dampingForce[i].setZero();
+		addScaledDampingForce(0.5, dampingForce);  // skips inactive bodies, so their entries stay zero
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				const btSoftBody::Node& node = psb->m_nodes[j];
+				energy -= dampingForce[node.index].dot(node.m_v) / dt;
+			}
+		}
+		return energy;
+	}
+	// Elastic energy density of a single tetra, evaluated from the scratch values m_trace and m_J and the Lame parameters.
+	double elasticEnergyDensity(const btSoftBody::TetraScratch& s)
+	{
+		const double volumeTerm = s.m_J - 1. - 0.75 * m_mu / m_lambda;  // appears squared below
+		double psi = 0;
+		psi += m_mu * 0.5 * (s.m_trace - 3.);
+		psi += m_lambda * 0.5 * volumeTerm * volumeTerm;
+		return psi - m_mu * 0.5 * log(s.m_trace + 1);
+	}
 
-                // damping force differential
-                btScalar scale1 = scale * tetra.m_element_measure;
-                force[id0] -= scale1 * df_on_node0;
-                force[id1] -= scale1 * df_on_node123.getColumn(0);
-                force[id2] -= scale1 * df_on_node123.getColumn(1);
-                force[id3] -= scale1 * df_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    virtual double totalElasticEnergy(btScalar dt)
-    {
-        double energy = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetraScratches.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::TetraScratch& s = psb->m_tetraScratches[j];
-                energy += tetra.m_element_measure * elasticEnergyDensity(s);
-            }
-        }
-        return energy;
-    }
-    
-    // The damping energy is formulated as in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
-    virtual double totalDampingEnergy(btScalar dt)
-    {
-        double energy = 0;
-        int sz = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                sz = btMax(sz, psb->m_nodes[j].index);
-            }
-        }
-        TVStack dampingForce;
-        dampingForce.resize(sz+1);
-        for (int i = 0; i < dampingForce.size(); ++i)
-            dampingForce[i].setZero();
-        addScaledDampingForce(0.5, dampingForce);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                const btSoftBody::Node& node = psb->m_nodes[j];
-                energy -= dampingForce[node.index].dot(node.m_v) / dt;
-            }
-        }
-        return energy;
-    }
-    
-    double elasticEnergyDensity(const btSoftBody::TetraScratch& s)
-    {
-        double density = 0;
-        density += m_mu * 0.5 * (s.m_trace - 3.);
-        density += m_lambda * 0.5 * (s.m_J - 1. - 0.75 * m_mu / m_lambda)* (s.m_J - 1. - 0.75 * m_mu / m_lambda);
-        density -= m_mu * 0.5 * log(s.m_trace+1);
-        return density;
-    }
-    
-    virtual void addScaledElasticForce(btScalar scale, TVStack& force)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= force.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            btScalar max_p = psb->m_cfg.m_maxStress;
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btMatrix3x3 P;
-                firstPiola(psb->m_tetraScratches[j],P);
+	virtual void addScaledElasticForce(btScalar scale, TVStack& force)
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= force.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			btScalar max_p = psb->m_cfg.m_maxStress;
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btMatrix3x3 P;
+				firstPiola(psb->m_tetraScratches[j], P);
 #ifdef USE_SVD
-                if (max_p > 0)
-                {
-                    // since we want to clamp the principal stress to max_p, we only need to
-                    // calculate SVD when sigma_0^2 + sigma_1^2 + sigma_2^2 > max_p * max_p
-                    btScalar trPTP = (P[0].length2() + P[1].length2() + P[2].length2());
-                    if (trPTP > max_p * max_p)
-                    {
-                        btMatrix3x3 U, V;
-                        btVector3 sigma;
-                        singularValueDecomposition(P, U, sigma, V);
-                        sigma[0] = btMin(sigma[0], max_p);
-                        sigma[1] = btMin(sigma[1], max_p);
-                        sigma[2] = btMin(sigma[2], max_p);
-                        sigma[0] = btMax(sigma[0], -max_p);
-                        sigma[1] = btMax(sigma[1], -max_p);
-                        sigma[2] = btMax(sigma[2], -max_p);
-                        btMatrix3x3 Sigma;
-                        Sigma.setIdentity();
-                        Sigma[0][0] = sigma[0];
-                        Sigma[1][1] = sigma[1];
-                        Sigma[2][2] = sigma[2];
-                        P = U * Sigma * V.transpose();
-                    }
-                }
+				if (max_p > 0)
+				{
+					// since we want to clamp the principal stress to max_p, we only need to
+					// calculate SVD when sigma_0^2 + sigma_1^2 + sigma_2^2 > max_p * max_p
+					btScalar trPTP = (P[0].length2() + P[1].length2() + P[2].length2());
+					if (trPTP > max_p * max_p)
+					{
+						btMatrix3x3 U, V;
+						btVector3 sigma;
+						singularValueDecomposition(P, U, sigma, V);
+						sigma[0] = btMin(sigma[0], max_p);
+						sigma[1] = btMin(sigma[1], max_p);
+						sigma[2] = btMin(sigma[2], max_p);
+						sigma[0] = btMax(sigma[0], -max_p);
+						sigma[1] = btMax(sigma[1], -max_p);
+						sigma[2] = btMax(sigma[2], -max_p);
+						btMatrix3x3 Sigma;
+						Sigma.setIdentity();
+						Sigma[0][0] = sigma[0];
+						Sigma[1][1] = sigma[1];
+						Sigma[2][2] = sigma[2];
+						P = U * Sigma * V.transpose();
+					}
+				}
 #endif
-//                btVector3 force_on_node0 = P * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 force_on_node123 = P * tetra.m_Dm_inverse.transpose();
-                btVector3 force_on_node0 = force_on_node123 * grad_N_hat_1st_col;
-                
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                
-                // elastic force
-                btScalar scale1 = scale * tetra.m_element_measure;
-                force[id0] -= scale1 * force_on_node0;
-                force[id1] -= scale1 * force_on_node123.getColumn(0);
-                force[id2] -= scale1 * force_on_node123.getColumn(1);
-                force[id3] -= scale1 * force_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    // The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
-    virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
-    {
-        if (m_mu_damp == 0 && m_lambda_damp == 0)
-            return;
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= df.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                btMatrix3x3 dF = Ds(id0, id1, id2, id3, dv) * tetra.m_Dm_inverse;
-                btMatrix3x3 I;
-                I.setIdentity();
-                btMatrix3x3 dP = (dF + dF.transpose()) * m_mu_damp + I * (dF[0][0]+dF[1][1]+dF[2][2]) * m_lambda_damp;
-//                firstPiolaDampingDifferential(psb->m_tetraScratchesTn[j], dF, dP);
-//                btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
-                btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;
+				//                btVector3 force_on_node0 = P * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
+				btMatrix3x3 force_on_node123 = P * tetra.m_Dm_inverse.transpose();
+				btVector3 force_on_node0 = force_on_node123 * grad_N_hat_1st_col;
+
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+
+				// elastic force
+				btScalar scale1 = scale * tetra.m_element_measure;
+				force[id0] -= scale1 * force_on_node0;
+				force[id1] -= scale1 * force_on_node123.getColumn(0);
+				force[id2] -= scale1 * force_on_node123.getColumn(1);
+				force[id3] -= scale1 * force_on_node123.getColumn(2);
+			}
+		}
+	}
+
+	// Accumulates (via -=) into df the damping force differential computed from the per-node vectors in dv, scaled by scale. The damping matrix is calculated using the time n state as described in https://www.math.ucla.edu/~jteran/papers/GSSJT15.pdf to allow line search
+	virtual void addScaledDampingForceDifferential(btScalar scale, const TVStack& dv, TVStack& df)
+	{
+		if (m_mu_damp == 0 && m_lambda_damp == 0)
+			return;
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= df.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+				btMatrix3x3 dF = Ds(id0, id1, id2, id3, dv) * tetra.m_Dm_inverse;
+				btMatrix3x3 I;
+				I.setIdentity();
+				btMatrix3x3 dP = (dF + dF.transpose()) * m_mu_damp + I * (dF[0][0] + dF[1][1] + dF[2][2]) * m_lambda_damp;
+				// dP = m_mu_damp * (dF + dF^T) + m_lambda_damp * trace(dF) * I; firstPiolaDampingDifferential() is not used here.
+				// nodes 1..3 take columns 0..2 of dP * Dm^-T; node 0 takes that same matrix applied to (-1, -1, -1)
+				btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
+				btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;
+
+				// damping force differential, weighted by scale * m_element_measure and subtracted from df
+				btScalar scale1 = scale * tetra.m_element_measure;
+				df[id0] -= scale1 * df_on_node0;
+				df[id1] -= scale1 * df_on_node123.getColumn(0);
+				df[id2] -= scale1 * df_on_node123.getColumn(1);
+				df[id3] -= scale1 * df_on_node123.getColumn(2);
+			}
+		}
+	}
+
+	virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA) {}  // empty body: diagA is left untouched
+	// Accumulates (via -=) into df the elastic force differential computed from the per-node vectors in dx, scaled by scale.
+	virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
+	{
+		int numNodes = getNumNodes();
+		btAssert(numNodes <= df.size());
+		btVector3 grad_N_hat_1st_col = btVector3(-1, -1, -1);
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			if (!psb->isActive())
+			{
+				continue;
+			}
+			for (int j = 0; j < psb->m_tetras.size(); ++j)
+			{
+				btSoftBody::Tetra& tetra = psb->m_tetras[j];
+				btSoftBody::Node* node0 = tetra.m_n[0];
+				btSoftBody::Node* node1 = tetra.m_n[1];
+				btSoftBody::Node* node2 = tetra.m_n[2];
+				btSoftBody::Node* node3 = tetra.m_n[3];
+				size_t id0 = node0->index;
+				size_t id1 = node1->index;
+				size_t id2 = node2->index;
+				size_t id3 = node3->index;
+				btMatrix3x3 dF = Ds(id0, id1, id2, id3, dx) * tetra.m_Dm_inverse;
+				btMatrix3x3 dP;
+				firstPiolaDifferential(psb->m_tetraScratches[j], dF, dP);
+				// nodes 1..3 take columns 0..2 of dP * Dm^-T; node 0 takes that same matrix applied to (-1, -1, -1)
+				btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
+				btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;
+
+				// elastic force differential, weighted by scale * m_element_measure and subtracted from df
+				btScalar scale1 = scale * tetra.m_element_measure;
+				df[id0] -= scale1 * df_on_node0;
+				df[id1] -= scale1 * df_on_node123.getColumn(0);
+				df[id2] -= scale1 * df_on_node123.getColumn(1);
+				df[id3] -= scale1 * df_on_node123.getColumn(2);
+			}
+		}
+	}
+	// Writes P = scaleF * m_F + scaleCofF * m_cofF, with both scalars built from the Lame parameters, m_trace and m_J.
+	void firstPiola(const btSoftBody::TetraScratch& s, btMatrix3x3& P)
+	{
+		const btScalar scaleF = m_mu * (1. - 1. / (s.m_trace + 1.));
+		const btScalar scaleCofF = m_lambda * (s.m_J - 1.) - 0.75 * m_mu;
+		P = s.m_F * scaleF + s.m_cofF * scaleCofF;
+	}
+	// Let P be the first Piola stress.
+	// This function calculates dP = dP/dF * dF for the given dF and overwrites the dP argument with it.
+	void firstPiolaDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF, btMatrix3x3& dP)
+	{
+		const double onePlusTrace = 1. + s.m_trace;
+		const btScalar scaleDF = m_mu * (1. - 1. / onePlusTrace);
+		const btScalar scaleF = (2. * m_mu) * DotProduct(s.m_F, dF) * (1. / (onePlusTrace * onePlusTrace));
+		const btScalar scaleCofF = m_lambda * DotProduct(s.m_cofF, dF);
+		dP = dF * scaleDF + s.m_F * scaleF;
+		addScaledCofactorMatrixDifferential(s.m_F, dF, m_lambda * (s.m_J - 1.) - 0.75 * m_mu, dP);
+		dP += s.m_cofF * scaleCofF;
+	}
 
-                // damping force differential
-                btScalar scale1 = scale * tetra.m_element_measure;
-                df[id0] -= scale1 * df_on_node0;
-                df[id1] -= scale1 * df_on_node123.getColumn(0);
-                df[id2] -= scale1 * df_on_node123.getColumn(1);
-                df[id3] -= scale1 * df_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    virtual void buildDampingForceDifferentialDiagonal(btScalar scale, TVStack& diagA){}
-    
-    virtual void addScaledElasticForceDifferential(btScalar scale, const TVStack& dx, TVStack& df)
-    {
-        int numNodes = getNumNodes();
-        btAssert(numNodes <= df.size());
-        btVector3 grad_N_hat_1st_col = btVector3(-1,-1,-1);
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            if (!psb->isActive())
-            {
-                continue;
-            }
-            for (int j = 0; j < psb->m_tetras.size(); ++j)
-            {
-                btSoftBody::Tetra& tetra = psb->m_tetras[j];
-                btSoftBody::Node* node0 = tetra.m_n[0];
-                btSoftBody::Node* node1 = tetra.m_n[1];
-                btSoftBody::Node* node2 = tetra.m_n[2];
-                btSoftBody::Node* node3 = tetra.m_n[3];
-                size_t id0 = node0->index;
-                size_t id1 = node1->index;
-                size_t id2 = node2->index;
-                size_t id3 = node3->index;
-                btMatrix3x3 dF = Ds(id0, id1, id2, id3, dx) * tetra.m_Dm_inverse;
-                btMatrix3x3 dP;
-                firstPiolaDifferential(psb->m_tetraScratches[j], dF, dP);
-//                btVector3 df_on_node0 = dP * (tetra.m_Dm_inverse.transpose()*grad_N_hat_1st_col);
-                btMatrix3x3 df_on_node123 = dP * tetra.m_Dm_inverse.transpose();
-                btVector3 df_on_node0 = df_on_node123 * grad_N_hat_1st_col;
-                
-                // elastic force differential
-                btScalar scale1 = scale * tetra.m_element_measure;
-                df[id0] -= scale1 * df_on_node0;
-                df[id1] -= scale1 * df_on_node123.getColumn(0);
-                df[id2] -= scale1 * df_on_node123.getColumn(1);
-                df[id3] -= scale1 * df_on_node123.getColumn(2);
-            }
-        }
-    }
-    
-    void firstPiola(const btSoftBody::TetraScratch& s, btMatrix3x3& P)
-    {
-        btScalar c1 = (m_mu * ( 1. - 1. / (s.m_trace + 1.)));
-        btScalar c2 = (m_lambda * (s.m_J - 1.) - 0.75 * m_mu);
-        P = s.m_F * c1 + s.m_cofF * c2;
-    }
-    
-    // Let P be the first piola stress.
-    // This function calculates the dP = dP/dF * dF
-    void firstPiolaDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF,  btMatrix3x3& dP)
-    {
-        btScalar c1 = m_mu * ( 1. - 1. / (s.m_trace + 1.));
-        btScalar c2 = (2.*m_mu) * DotProduct(s.m_F, dF) * (1./((1.+s.m_trace)*(1.+s.m_trace)));
-        btScalar c3 = (m_lambda * DotProduct(s.m_cofF, dF));
-        dP = dF * c1 + s.m_F * c2;
-        addScaledCofactorMatrixDifferential(s.m_F, dF, m_lambda*(s.m_J-1.) - 0.75*m_mu, dP);
-        dP += s.m_cofF * c3;
-    }
-    
-    // Let Q be the damping stress.
-    // This function calculates the dP = dQ/dF * dF
-    void firstPiolaDampingDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF,  btMatrix3x3& dP)
-    {
-        btScalar c1 = (m_mu_damp * ( 1. - 1. / (s.m_trace + 1.)));
-        btScalar c2 = ((2.*m_mu_damp) * DotProduct(s.m_F, dF) *(1./((1.+s.m_trace)*(1.+s.m_trace))));
-        btScalar c3 = (m_lambda_damp * DotProduct(s.m_cofF, dF));
-        dP = dF * c1 + s.m_F * c2;
-        addScaledCofactorMatrixDifferential(s.m_F, dF, m_lambda_damp*(s.m_J-1.) - 0.75*m_mu_damp, dP);
-        dP += s.m_cofF * c3;
-    }
-    
-    btScalar DotProduct(const btMatrix3x3& A, const btMatrix3x3& B)
-    {
-        btScalar ans = 0;
-        for (int i = 0; i < 3; ++i)
-        {
-            ans += A[i].dot(B[i]);
-        }
-        return ans;
-    }
-    
-    // Let C(A) be the cofactor of the matrix A
-    // Let H = the derivative of C(A) with respect to A evaluated at F = A
-    // This function calculates H*dF
-    void addScaledCofactorMatrixDifferential(const btMatrix3x3& F, const btMatrix3x3& dF, btScalar scale, btMatrix3x3& M)
-    {
-        M[0][0] += scale * (dF[1][1] * F[2][2] + F[1][1] * dF[2][2] - dF[2][1] * F[1][2] - F[2][1] * dF[1][2]);
-        M[1][0] += scale * (dF[2][1] * F[0][2] + F[2][1] * dF[0][2] - dF[0][1] * F[2][2] - F[0][1] * dF[2][2]);
-        M[2][0] += scale * (dF[0][1] * F[1][2] + F[0][1] * dF[1][2] - dF[1][1] * F[0][2] - F[1][1] * dF[0][2]);
-        M[0][1] += scale * (dF[2][0] * F[1][2] + F[2][0] * dF[1][2] - dF[1][0] * F[2][2] - F[1][0] * dF[2][2]);
-        M[1][1] += scale * (dF[0][0] * F[2][2] + F[0][0] * dF[2][2] - dF[2][0] * F[0][2] - F[2][0] * dF[0][2]);
-        M[2][1] += scale * (dF[1][0] * F[0][2] + F[1][0] * dF[0][2] - dF[0][0] * F[1][2] - F[0][0] * dF[1][2]);
-        M[0][2] += scale * (dF[1][0] * F[2][1] + F[1][0] * dF[2][1] - dF[2][0] * F[1][1] - F[2][0] * dF[1][1]);
-        M[1][2] += scale * (dF[2][0] * F[0][1] + F[2][0] * dF[0][1] - dF[0][0] * F[2][1] - F[0][0] * dF[2][1]);
-        M[2][2] += scale * (dF[0][0] * F[1][1] + F[0][0] * dF[1][1] - dF[1][0] * F[0][1] - F[1][0] * dF[0][1]);
-    }
-    
-    virtual btDeformableLagrangianForceType getForceType()
-    {
-        return BT_NEOHOOKEAN_FORCE;
-    }
-    
+	// Let Q be the damping stress. This function calculates dP = dQ/dF * dF for the given dF and overwrites the dP argument with it.
+	void firstPiolaDampingDifferential(const btSoftBody::TetraScratch& s, const btMatrix3x3& dF, btMatrix3x3& dP)
+	{
+		const double onePlusTrace = 1. + s.m_trace;
+		const btScalar scaleDF = m_mu_damp * (1. - 1. / onePlusTrace);
+		const btScalar scaleF = (2. * m_mu_damp) * DotProduct(s.m_F, dF) * (1. / (onePlusTrace * onePlusTrace));
+		const btScalar scaleCofF = m_lambda_damp * DotProduct(s.m_cofF, dF);
+		dP = dF * scaleDF + s.m_F * scaleF;
+		addScaledCofactorMatrixDifferential(s.m_F, dF, m_lambda_damp * (s.m_J - 1.) - 0.75 * m_mu_damp, dP);
+		dP += s.m_cofF * scaleCofF;
+	}
+	// Entry-wise inner product of two 3x3 matrices: the sum of the dot products of matching rows.
+	// The three rows are written out explicitly and added in row order 0, 1, 2.
+	btScalar DotProduct(const btMatrix3x3& A, const btMatrix3x3& B)
+	{
+		btScalar sum = 0;
+		sum += A[0].dot(B[0]);
+		sum += A[1].dot(B[1]);
+		sum += A[2].dot(B[2]);
+		return sum;
+	}
+
+	// Let C(A) be the cofactor matrix of A and let H be the derivative of C(A) with respect to A, evaluated at A = F.
+	// This function adds scale * (H * dF) to M entry by entry; M is accumulated into (+=), not overwritten.
+	// Each entry is the differential of one 2x2 minor of F: every product pairs one dF entry with one F entry.
+	void addScaledCofactorMatrixDifferential(const btMatrix3x3& F, const btMatrix3x3& dF, btScalar scale, btMatrix3x3& M)
+	{
+		M[0][0] += scale * (dF[1][1] * F[2][2] + F[1][1] * dF[2][2] - dF[2][1] * F[1][2] - F[2][1] * dF[1][2]);
+		M[1][0] += scale * (dF[2][1] * F[0][2] + F[2][1] * dF[0][2] - dF[0][1] * F[2][2] - F[0][1] * dF[2][2]);
+		M[2][0] += scale * (dF[0][1] * F[1][2] + F[0][1] * dF[1][2] - dF[1][1] * F[0][2] - F[1][1] * dF[0][2]);
+		M[0][1] += scale * (dF[2][0] * F[1][2] + F[2][0] * dF[1][2] - dF[1][0] * F[2][2] - F[1][0] * dF[2][2]);
+		M[1][1] += scale * (dF[0][0] * F[2][2] + F[0][0] * dF[2][2] - dF[2][0] * F[0][2] - F[2][0] * dF[0][2]);
+		M[2][1] += scale * (dF[1][0] * F[0][2] + F[1][0] * dF[0][2] - dF[0][0] * F[1][2] - F[0][0] * dF[1][2]);
+		M[0][2] += scale * (dF[1][0] * F[2][1] + F[1][0] * dF[2][1] - dF[2][0] * F[1][1] - F[2][0] * dF[1][1]);
+		M[1][2] += scale * (dF[2][0] * F[0][1] + F[2][0] * dF[0][1] - dF[0][0] * F[2][1] - F[0][0] * dF[2][1]);
+		M[2][2] += scale * (dF[0][0] * F[1][1] + F[0][0] * dF[1][1] - dF[1][0] * F[0][1] - F[1][0] * dF[0][1]);
+	}
+	// Reports the type tag of this force: always BT_NEOHOOKEAN_FORCE.
+	virtual btDeformableLagrangianForceType getForceType()
+	{
+		return BT_NEOHOOKEAN_FORCE;
+	}
 };
 #endif /* BT_NEOHOOKEAN_H */
diff --git a/thirdparty/bullet/BulletSoftBody/btKrylovSolver.h b/thirdparty/bullet/BulletSoftBody/btKrylovSolver.h
new file mode 100644
index 0000000000..59126b47ae
--- /dev/null
+++ b/thirdparty/bullet/BulletSoftBody/btKrylovSolver.h
@@ -0,0 +1,120 @@
+/*
+ Written by Xuchen Han <xuchenhan2015@u.northwestern.edu>
+ 
+ Bullet Continuous Collision Detection and Physics Library
+ Copyright (c) 2019 Google Inc. http://bulletphysics.org
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+ 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef BT_KRYLOV_SOLVER_H
+#define BT_KRYLOV_SOLVER_H
+#include <iostream>
+#include <cmath>
+#include <limits>
+#include <LinearMath/btAlignedObjectArray.h>
+#include <LinearMath/btVector3.h>
+#include <LinearMath/btScalar.h>
+#include "LinearMath/btQuickprof.h"
+
+// Abstract base class for solvers that expose solve(A, x, b) on arrays of btVector3.
+// It stores the iteration limit and the tolerance, and provides the array helpers below.
+template <class MatrixX>
+class btKrylovSolver
+{
+	typedef btAlignedObjectArray<btVector3> TVStack;
+
+public:
+	int m_maxIterations;
+	btScalar m_tolerance;
+	btKrylovSolver(int maxIterations, btScalar tolerance)
+		: m_maxIterations(maxIterations), m_tolerance(tolerance)
+	{
+	}
+
+	virtual ~btKrylovSolver() {}
+
+	// Implemented by the concrete solver. NOTE(review): the meaning of the returned int is not visible here - confirm in the derived classes.
+	virtual int solve(MatrixX& A, TVStack& x, const TVStack& b, bool verbose = false) = 0;
+
+	// Implemented by the concrete solver; receives the right-hand side array b.
+	virtual void reinitialize(const TVStack& b) = 0;
+
+	// Returns c = a - b, entry by entry. a and b must have the same size.
+	virtual SIMD_FORCE_INLINE TVStack sub(const TVStack& a, const TVStack& b)
+	{
+		btAssert(a.size() == b.size());
+		TVStack c;
+		c.resize(a.size());
+		for (int i = 0; i < a.size(); ++i)
+		{
+			c[i] = a[i] - b[i];
+		}
+		return c;
+	}
+
+	// Returns dot(a, a), i.e. the sum of the squared components of a.
+	virtual SIMD_FORCE_INLINE btScalar squaredNorm(const TVStack& a)
+	{
+		return dot(a, a);
+	}
+
+	// Returns the largest absolute component found in a (0 for an empty array).
+	// Note that this is a max norm: it is NOT the square root of squaredNorm().
+	virtual SIMD_FORCE_INLINE btScalar norm(const TVStack& a)
+	{
+		btScalar ret = 0;
+		for (int i = 0; i < a.size(); ++i)
+		{
+			for (int d = 0; d < 3; ++d)
+			{
+				ret = btMax(ret, btFabs(a[i][d]));
+			}
+		}
+		return ret;
+	}
+
+	// Returns the sum over i of a[i].dot(b[i]). a and b must have the same size.
+	virtual SIMD_FORCE_INLINE btScalar dot(const TVStack& a, const TVStack& b)
+	{
+		// b is indexed over a's range, so a shorter b would be read out of range
+		btAssert(a.size() == b.size());
+		btScalar ans(0);
+		for (int i = 0; i < a.size(); ++i)
+			ans += a[i].dot(b[i]);
+		return ans;
+	}
+
+	// In-place update result += s * a. a and result must have the same size.
+	virtual SIMD_FORCE_INLINE void multAndAddTo(btScalar s, const TVStack& a, TVStack& result)
+	{
+		btAssert(a.size() == result.size());
+		for (int i = 0; i < a.size(); ++i)
+			result[i] += s * a[i];
+	}
+
+	// Returns s * a + b as a new array. a and b must have the same size.
+	virtual SIMD_FORCE_INLINE TVStack multAndAdd(btScalar s, const TVStack& a, const TVStack& b)
+	{
+		// same precondition as sub() and multAndAddTo(): b is indexed over a's range
+		btAssert(a.size() == b.size());
+		TVStack result;
+		result.resize(a.size());
+		for (int i = 0; i < a.size(); ++i)
+			result[i] = s * a[i] + b[i];
+		return result;
+	}
+
+	// Replaces the tolerance that was passed to the constructor.
+	virtual SIMD_FORCE_INLINE void setTolerance(btScalar tolerance)
+	{
+		m_tolerance = tolerance;
+	}
+};
+#endif /* BT_KRYLOV_SOLVER_H */
diff --git a/thirdparty/bullet/BulletSoftBody/btPreconditioner.h b/thirdparty/bullet/BulletSoftBody/btPreconditioner.h
index c2db448ef8..21c1106a42 100644
--- a/thirdparty/bullet/BulletSoftBody/btPreconditioner.h
+++ b/thirdparty/bullet/BulletSoftBody/btPreconditioner.h
@@ -19,269 +19,266 @@
 class Preconditioner
 {
 public:
-    typedef btAlignedObjectArray<btVector3> TVStack;
-    virtual void operator()(const TVStack& x, TVStack& b) = 0;
-    virtual void reinitialize(bool nodeUpdated) = 0;
-    virtual ~Preconditioner(){}
+	typedef btAlignedObjectArray<btVector3> TVStack;  // array with one btVector3 per entry
+	virtual void operator()(const TVStack& x, TVStack& b) = 0;  // applies the preconditioner: reads x, writes the result into b
+	virtual void reinitialize(bool nodeUpdated) = 0;  // refreshes cached data; NOTE(review): nodeUpdated presumably means the node set changed - confirm with callers
+	virtual ~Preconditioner() {}
 };
 
 class DefaultPreconditioner : public Preconditioner
 {
 public:
-    virtual void operator()(const TVStack& x, TVStack& b)
-    {
-        btAssert(b.size() == x.size());
-        for (int i = 0; i < b.size(); ++i)
-            b[i] = x[i];
-    }
-    virtual void reinitialize(bool nodeUpdated)
-    {
-    }
-    
-    virtual ~DefaultPreconditioner(){}
+	virtual void operator()(const TVStack& x, TVStack& b)  // identity preconditioner: b = x
+	{
+		btAssert(b.size() == x.size());
+		for (int i = 0; i < b.size(); ++i)
+			b[i] = x[i];
+	}
+	virtual void reinitialize(bool nodeUpdated)  // nothing is cached, so there is nothing to rebuild
+	{
+	}
+
+	virtual ~DefaultPreconditioner() {}
 };
 
 class MassPreconditioner : public Preconditioner
 {
-    btAlignedObjectArray<btScalar> m_inv_mass;
-    const btAlignedObjectArray<btSoftBody *>& m_softBodies;
+	btAlignedObjectArray<btScalar> m_inv_mass;  // Node::m_im of every node, soft body by soft body, filled by reinitialize()
+	const btAlignedObjectArray<btSoftBody*>& m_softBodies;  // held by reference, not copied
+
 public:
-    MassPreconditioner(const btAlignedObjectArray<btSoftBody *>& softBodies)
-    : m_softBodies(softBodies)
-    {
-    }
-    
-    virtual void reinitialize(bool nodeUpdated)
-    {
-        if (nodeUpdated)
-        {
-            m_inv_mass.clear();
-            for (int i = 0; i < m_softBodies.size(); ++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                for (int j = 0; j < psb->m_nodes.size(); ++j)
-                    m_inv_mass.push_back(psb->m_nodes[j].m_im);
-            }
-        }
-    }
-    
-    virtual void operator()(const TVStack& x, TVStack& b)
-    {
-        btAssert(b.size() == x.size());
-        btAssert(m_inv_mass.size() <= x.size());
-        for (int i = 0; i < m_inv_mass.size(); ++i)
-        {
-            b[i] = x[i] * m_inv_mass[i];
-        }
-        for (int i = m_inv_mass.size(); i < b.size(); ++i)
-        {
-            b[i] = x[i];
-        }
-    }
-};
+	MassPreconditioner(const btAlignedObjectArray<btSoftBody*>& softBodies)  // m_inv_mass stays empty until reinitialize(true) is called
+		: m_softBodies(softBodies)
+	{
+	}
 
+	virtual void reinitialize(bool nodeUpdated)  // rebuilds m_inv_mass, but only when nodeUpdated is true
+	{
+		if (nodeUpdated)
+		{
+			m_inv_mass.clear();
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				for (int j = 0; j < psb->m_nodes.size(); ++j)
+					m_inv_mass.push_back(psb->m_nodes[j].m_im);
+			}
+		}
+	}
+
+	virtual void operator()(const TVStack& x, TVStack& b)  // b = x scaled by the cached inverse masses
+	{
+		btAssert(b.size() == x.size());
+		btAssert(m_inv_mass.size() <= x.size());
+		for (int i = 0; i < m_inv_mass.size(); ++i)
+		{
+			b[i] = x[i] * m_inv_mass[i];
+		}
+		for (int i = m_inv_mass.size(); i < b.size(); ++i)  // entries past the cached node range are copied through unchanged
+		{
+			b[i] = x[i];
+		}
+	}
+};
 
 class KKTPreconditioner : public Preconditioner
 {
-    const btAlignedObjectArray<btSoftBody *>& m_softBodies;
-    const btDeformableContactProjection& m_projections;
-    const btAlignedObjectArray<btDeformableLagrangianForce*>& m_lf;
-    TVStack m_inv_A, m_inv_S;
-    const btScalar& m_dt;
-    const bool& m_implicit;
+	const btAlignedObjectArray<btSoftBody*>& m_softBodies;
+	const btDeformableContactProjection& m_projections;
+	const btAlignedObjectArray<btDeformableLagrangianForce*>& m_lf;
+	TVStack m_inv_A, m_inv_S;  // inverted diagonals, rebuilt by reinitialize()
+	const btScalar& m_dt;  // reference, not a copy: the value is read again on every buildDiagonalA()
+	const bool& m_implicit;  // reference, not a copy
+
 public:
-    KKTPreconditioner(const btAlignedObjectArray<btSoftBody *>& softBodies, const btDeformableContactProjection& projections, const btAlignedObjectArray<btDeformableLagrangianForce*>& lf, const btScalar& dt, const bool& implicit)
-    : m_softBodies(softBodies)
-    , m_projections(projections)
-    , m_lf(lf)
-    , m_dt(dt)
-    , m_implicit(implicit)
-    {
-    }
-    
-    virtual void reinitialize(bool nodeUpdated)
-    {
-        if (nodeUpdated)
-        {
-            int num_nodes = 0;
-            for (int i = 0; i < m_softBodies.size(); ++i)
-            {
-                btSoftBody* psb = m_softBodies[i];
-                num_nodes += psb->m_nodes.size();
-            }
-            m_inv_A.resize(num_nodes);
-        }
-        buildDiagonalA(m_inv_A);
-        for (int i = 0; i < m_inv_A.size(); ++i)
-        {
-//            printf("A[%d] = %f, %f, %f \n", i, m_inv_A[i][0], m_inv_A[i][1], m_inv_A[i][2]);
-            for (int d = 0; d < 3; ++d)
-            {
-                m_inv_A[i][d] = (m_inv_A[i][d] == 0) ? 0.0 : 1.0/ m_inv_A[i][d];
-            }
-        }
-        m_inv_S.resize(m_projections.m_lagrangeMultipliers.size());
-//        printf("S.size() = %d \n", m_inv_S.size());
-        buildDiagonalS(m_inv_A, m_inv_S);
-        for (int i = 0; i < m_inv_S.size(); ++i)
-        {
-//            printf("S[%d] = %f, %f, %f \n", i, m_inv_S[i][0], m_inv_S[i][1], m_inv_S[i][2]);
-            for (int d = 0; d < 3; ++d)
-            {
-                m_inv_S[i][d] = (m_inv_S[i][d] == 0) ? 0.0 : 1.0/ m_inv_S[i][d];
-            }
-        }
-    }
-    
-    void buildDiagonalA(TVStack& diagA) const
-    {
-        size_t counter = 0;
-        for (int i = 0; i < m_softBodies.size(); ++i)
-        {
-            btSoftBody* psb = m_softBodies[i];
-            for (int j = 0; j < psb->m_nodes.size(); ++j)
-            {
-                const btSoftBody::Node& node = psb->m_nodes[j];
-                diagA[counter] = (node.m_im == 0) ? btVector3(0,0,0) : btVector3(1.0/node.m_im, 1.0 / node.m_im, 1.0 / node.m_im);
-                ++counter;
-            }
-        }
-        if (m_implicit)
-        {
-            printf("implicit not implemented\n");
-            btAssert(false);
-        }
-        for (int i = 0; i < m_lf.size(); ++i)
-        {
-            // add damping matrix
-            m_lf[i]->buildDampingForceDifferentialDiagonal(-m_dt, diagA);
-        }
-    }
-    
-    void buildDiagonalS(const TVStack& inv_A, TVStack& diagS)
-    {
-        for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
-        {
-            // S[k,k] = e_k^T * C A_d^-1 C^T * e_k
-            const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
-            btVector3& t = diagS[c];
-            t.setZero();
-            for (int j = 0; j < lm.m_num_constraints; ++j)
-            {
-                for (int i = 0; i < lm.m_num_nodes; ++i)
-                {
-                    for (int d = 0; d < 3; ++d)
-                    {
-                        t[j] += inv_A[lm.m_indices[i]][d] * lm.m_dirs[j][d] * lm.m_dirs[j][d] * lm.m_weights[i] * lm.m_weights[i];
-                    }
-                }
-            }
-        }
-    }
-#define USE_FULL_PRECONDITIONER
+	KKTPreconditioner(const btAlignedObjectArray<btSoftBody*>& softBodies, const btDeformableContactProjection& projections, const btAlignedObjectArray<btDeformableLagrangianForce*>& lf, const btScalar& dt, const bool& implicit)  // binds all five arguments as references; nothing is copied
+		: m_softBodies(softBodies), m_projections(projections), m_lf(lf), m_dt(dt), m_implicit(implicit)
+	{
+	}
+
+	virtual void reinitialize(bool nodeUpdated)  // rebuilds m_inv_A and m_inv_S; m_inv_A is only resized when nodeUpdated is true
+	{
+		if (nodeUpdated)
+		{
+			int num_nodes = 0;
+			for (int i = 0; i < m_softBodies.size(); ++i)
+			{
+				btSoftBody* psb = m_softBodies[i];
+				num_nodes += psb->m_nodes.size();
+			}
+			m_inv_A.resize(num_nodes);
+		}
+		buildDiagonalA(m_inv_A);  // m_inv_A holds the un-inverted diagonal until the loop below
+		for (int i = 0; i < m_inv_A.size(); ++i)
+		{
+			//            printf("A[%d] = %f, %f, %f \n", i, m_inv_A[i][0], m_inv_A[i][1], m_inv_A[i][2]);
+			for (int d = 0; d < 3; ++d)
+			{
+				m_inv_A[i][d] = (m_inv_A[i][d] == 0) ? 0.0 : 1.0 / m_inv_A[i][d];  // invert in place; zero entries stay zero (no division by zero)
+			}
+		}
+		m_inv_S.resize(m_projections.m_lagrangeMultipliers.size());  // one btVector3 per Lagrange multiplier
+		//        printf("S.size() = %d \n", m_inv_S.size());
+		buildDiagonalS(m_inv_A, m_inv_S);
+		for (int i = 0; i < m_inv_S.size(); ++i)
+		{
+			//            printf("S[%d] = %f, %f, %f \n", i, m_inv_S[i][0], m_inv_S[i][1], m_inv_S[i][2]);
+			for (int d = 0; d < 3; ++d)
+			{
+				m_inv_S[i][d] = (m_inv_S[i][d] == 0) ? 0.0 : 1.0 / m_inv_S[i][d];  // same guarded in-place inversion as for m_inv_A
+			}
+		}
+	}
+
+	void buildDiagonalA(TVStack& diagA) const  // writes 1 / m_im per node (zero when m_im == 0), then lets every force add its damping diagonal
+	{
+		size_t counter = 0;  // running node index across all soft bodies
+		for (int i = 0; i < m_softBodies.size(); ++i)
+		{
+			btSoftBody* psb = m_softBodies[i];
+			for (int j = 0; j < psb->m_nodes.size(); ++j)
+			{
+				const btSoftBody::Node& node = psb->m_nodes[j];
+				diagA[counter] = (node.m_im == 0) ? btVector3(0, 0, 0) : btVector3(1.0 / node.m_im, 1.0 / node.m_im, 1.0 / node.m_im);
+				++counter;
+			}
+		}
+		if (m_implicit)  // implicit mode is not supported here: a message is printed and the assert fires
+		{
+			printf("implicit not implemented\n");
+			btAssert(false);
+		}
+		for (int i = 0; i < m_lf.size(); ++i)
+		{
+			// add damping matrix
+			m_lf[i]->buildDampingForceDifferentialDiagonal(-m_dt, diagA);
+		}
+	}
+
+	void buildDiagonalS(const TVStack& inv_A, TVStack& diagS)  // fills diagS[c][j] for constraint j of multiplier c; diagS must already be sized
+	{
+		for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
+		{
+			// S[k,k] = e_k^T * C A_d^-1 C^T * e_k
+			const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
+			btVector3& t = diagS[c];
+			t.setZero();
+			for (int j = 0; j < lm.m_num_constraints; ++j)
+			{
+				for (int i = 0; i < lm.m_num_nodes; ++i)
+				{
+					for (int d = 0; d < 3; ++d)
+					{
+						t[j] += inv_A[lm.m_indices[i]][d] * lm.m_dirs[j][d] * lm.m_dirs[j][d] * lm.m_weights[i] * lm.m_weights[i];
+					}
+				}
+			}
+		}
+	}
+//#define USE_FULL_PRECONDITIONER
 #ifndef USE_FULL_PRECONDITIONER
-    virtual void operator()(const TVStack& x, TVStack& b)
-    {
-        btAssert(b.size() == x.size());
-        for (int i = 0; i < m_inv_A.size(); ++i)
-        {
-            b[i] = x[i] * m_inv_A[i];
-        }
-        int offset = m_inv_A.size();
-        for (int i = 0; i < m_inv_S.size(); ++i)
-        {
-            b[i+offset] = x[i+offset] * m_inv_S[i];
-        }
-    }
+	virtual void operator()(const TVStack& x, TVStack& b)  // diagonal variant: compiled unless USE_FULL_PRECONDITIONER is defined
+	{
+		btAssert(b.size() == x.size());
+		for (int i = 0; i < m_inv_A.size(); ++i)
+		{
+			b[i] = x[i] * m_inv_A[i];
+		}
+		int offset = m_inv_A.size();  // multiplier entries follow the m_inv_A.size() node entries
+		for (int i = 0; i < m_inv_S.size(); ++i)
+		{
+			b[i + offset] = x[i + offset] * m_inv_S[i];
+		}
+	}
 #else
-    virtual void operator()(const TVStack& x, TVStack& b)
-    {
-        btAssert(b.size() == x.size());
-        int offset = m_inv_A.size();
+	virtual void operator()(const TVStack& x, TVStack& b)  // full variant: compiled only when USE_FULL_PRECONDITIONER is defined
+	{
+		btAssert(b.size() == x.size());
+		int offset = m_inv_A.size();  // multiplier entries follow the m_inv_A.size() node entries
 
-        for (int i = 0; i < m_inv_A.size(); ++i)
-        {
-            b[i] = x[i] * m_inv_A[i];
-        }
+		for (int i = 0; i < m_inv_A.size(); ++i)
+		{
+			b[i] = x[i] * m_inv_A[i];
+		}
 
-        for (int i = 0; i < m_inv_S.size(); ++i)
-        {
-            b[i+offset].setZero();
-        }
+		for (int i = 0; i < m_inv_S.size(); ++i)
+		{
+			b[i + offset].setZero();
+		}
 
-        for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
-        {
-            const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
-            // C * x
-            for (int d = 0; d < lm.m_num_constraints; ++d)
-            {
-                for (int i = 0; i < lm.m_num_nodes; ++i)
-                {
-                    b[offset+c][d] += lm.m_weights[i] * b[lm.m_indices[i]].dot(lm.m_dirs[d]);
-                }
-            }
-        }
+		for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
+		{
+			const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
+			// C * x
+			for (int d = 0; d < lm.m_num_constraints; ++d)
+			{
+				for (int i = 0; i < lm.m_num_nodes; ++i)
+				{
+					b[offset + c][d] += lm.m_weights[i] * b[lm.m_indices[i]].dot(lm.m_dirs[d]);
+				}
+			}
+		}
 
-        for (int i = 0; i < m_inv_S.size(); ++i)
-        {
-            b[i+offset] = b[i+offset] * m_inv_S[i];
-        }
+		for (int i = 0; i < m_inv_S.size(); ++i)
+		{
+			b[i + offset] = b[i + offset] * m_inv_S[i];
+		}
 
-        for (int i = 0; i < m_inv_A.size(); ++i)
-        {
-            b[i].setZero();
-        }
+		for (int i = 0; i < m_inv_A.size(); ++i)
+		{
+			b[i].setZero();
+		}
 
-        for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
-        {
-            // C^T * lambda
-            const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
-            for (int i = 0; i < lm.m_num_nodes; ++i)
-            {
-                for (int j = 0; j < lm.m_num_constraints; ++j)
-                {
-                    b[lm.m_indices[i]] += b[offset+c][j] * lm.m_weights[i] * lm.m_dirs[j];
-                }
-            }
-        }
+		for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
+		{
+			// C^T * lambda
+			const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
+			for (int i = 0; i < lm.m_num_nodes; ++i)
+			{
+				for (int j = 0; j < lm.m_num_constraints; ++j)
+				{
+					b[lm.m_indices[i]] += b[offset + c][j] * lm.m_weights[i] * lm.m_dirs[j];
+				}
+			}
+		}
 
-        for (int i = 0; i < m_inv_A.size(); ++i)
-        {
-            b[i] = (x[i] - b[i]) * m_inv_A[i];
-        }
+		for (int i = 0; i < m_inv_A.size(); ++i)
+		{
+			b[i] = (x[i] - b[i]) * m_inv_A[i];
+		}
 
-        TVStack t;
-        t.resize(b.size());
-        for (int i = 0; i < m_inv_S.size(); ++i)
-        {
-            t[i+offset] = x[i+offset] * m_inv_S[i];
-        }
-        for (int i = 0; i < m_inv_A.size(); ++i)
-        {
-            t[i].setZero();
-        }
-        for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
-        {
-            // C^T * lambda
-            const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
-            for (int i = 0; i < lm.m_num_nodes; ++i)
-            {
-                for (int j = 0; j < lm.m_num_constraints; ++j)
-                {
-                    t[lm.m_indices[i]] += t[offset+c][j] * lm.m_weights[i] * lm.m_dirs[j];
-                }
-            }
-        }
-        for (int i = 0; i < m_inv_A.size(); ++i)
-        {
-            b[i] += t[i] * m_inv_A[i];
-        }
+		TVStack t;  // scratch stack with the same layout as b, allocated on every call
+		t.resize(b.size());
+		for (int i = 0; i < m_inv_S.size(); ++i)
+		{
+			t[i + offset] = x[i + offset] * m_inv_S[i];
+		}
+		for (int i = 0; i < m_inv_A.size(); ++i)
+		{
+			t[i].setZero();
+		}
+		for (int c = 0; c < m_projections.m_lagrangeMultipliers.size(); ++c)
+		{
+			// C^T * lambda
+			const LagrangeMultiplier& lm = m_projections.m_lagrangeMultipliers[c];
+			for (int i = 0; i < lm.m_num_nodes; ++i)
+			{
+				for (int j = 0; j < lm.m_num_constraints; ++j)
+				{
+					t[lm.m_indices[i]] += t[offset + c][j] * lm.m_weights[i] * lm.m_dirs[j];
+				}
+			}
+		}
+		for (int i = 0; i < m_inv_A.size(); ++i)
+		{
+			b[i] += t[i] * m_inv_A[i];
+		}
 
-        for (int i = 0; i < m_inv_S.size(); ++i)
-        {
-            b[i+offset] -= x[i+offset] * m_inv_S[i];
-        }
-    }
+		for (int i = 0; i < m_inv_S.size(); ++i)
+		{
+			b[i + offset] -= x[i + offset] * m_inv_S[i];
+		}
+	}
 #endif
 };
 
diff --git a/thirdparty/bullet/BulletSoftBody/btSoftBody.cpp b/thirdparty/bullet/BulletSoftBody/btSoftBody.cpp
index 81b846d7f8..d1980ea6c5 100644
--- a/thirdparty/bullet/BulletSoftBody/btSoftBody.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btSoftBody.cpp
@@ -37,12 +37,12 @@ static inline btDbvtNode* buildTreeBottomUp(btAlignedObjectArray<btDbvtNode*>& l
 	{
 		btAlignedObjectArray<bool> marked;
 		btAlignedObjectArray<btDbvtNode*> newLeafNodes;
-		btAlignedObjectArray<std::pair<int,int> > childIds;
+		btAlignedObjectArray<std::pair<int, int> > childIds;
 		btAlignedObjectArray<btAlignedObjectArray<int> > newAdj;
 		marked.resize(N);
 		for (int i = 0; i < N; ++i)
 			marked[i] = false;
-		
+
 		// pair adjacent nodes into new(parent) node
 		for (int i = 0; i < N; ++i)
 		{
@@ -61,7 +61,7 @@ static inline btDbvtNode* buildTreeBottomUp(btAlignedObjectArray<btDbvtNode*>& l
 					leafNodes[i]->parent = node;
 					leafNodes[n]->parent = node;
 					newLeafNodes.push_back(node);
-					childIds.push_back(std::make_pair(i,n));
+					childIds.push_back(std::make_pair(i, n));
 					merged = true;
 					marked[n] = true;
 					break;
@@ -70,7 +70,7 @@ static inline btDbvtNode* buildTreeBottomUp(btAlignedObjectArray<btDbvtNode*>& l
 			if (!merged)
 			{
 				newLeafNodes.push_back(leafNodes[i]);
-				childIds.push_back(std::make_pair(i,-1));
+				childIds.push_back(std::make_pair(i, -1));
 			}
 			marked[i] = true;
 		}
@@ -78,7 +78,7 @@ static inline btDbvtNode* buildTreeBottomUp(btAlignedObjectArray<btDbvtNode*>& l
 		newAdj.resize(newLeafNodes.size());
 		for (int i = 0; i < newLeafNodes.size(); ++i)
 		{
-			for (int j = i+1; j < newLeafNodes.size(); ++j)
+			for (int j = i + 1; j < newLeafNodes.size(); ++j)
 			{
 				bool neighbor = false;
 				const btAlignedObjectArray<int>& leftChildNeighbors = adj[childIds[i].first];
@@ -143,7 +143,7 @@ btSoftBody::btSoftBody(btSoftBodyWorldInfo* worldInfo, int node_count, const btV
 	/* Nodes			*/
 	const btScalar margin = getCollisionShape()->getMargin();
 	m_nodes.resize(node_count);
-    m_X.resize(node_count);
+	m_X.resize(node_count);
 	for (int i = 0, ni = node_count; i < ni; ++i)
 	{
 		Node& n = m_nodes[i];
@@ -154,7 +154,7 @@ btSoftBody::btSoftBody(btSoftBodyWorldInfo* worldInfo, int node_count, const btV
 		n.m_im = n.m_im > 0 ? 1 / n.m_im : 0;
 		n.m_leaf = m_ndbvt.insert(btDbvtVolume::FromCR(n.m_x, margin), &n);
 		n.m_material = pm;
-        m_X[i] = n.m_x;
+		m_X[i] = n.m_x;
 	}
 	updateBounds();
 	setCollisionQuadrature(3);
@@ -195,8 +195,8 @@ void btSoftBody::initDefaults()
 	m_cfg.piterations = 1;
 	m_cfg.diterations = 0;
 	m_cfg.citerations = 4;
-    m_cfg.drag = 0;
-    m_cfg.m_maxStress = 0;
+	m_cfg.drag = 0;
+	m_cfg.m_maxStress = 0;
 	m_cfg.collisions = fCollision::Default;
 	m_pose.m_bvolume = false;
 	m_pose.m_bframe = false;
@@ -222,12 +222,14 @@ void btSoftBody::initDefaults()
 	m_windVelocity = btVector3(0, 0, 0);
 	m_restLengthScale = btScalar(1.0);
 	m_dampingCoefficient = 1.0;
-	m_sleepingThreshold = .4;
+	m_sleepingThreshold = .04;
 	m_useSelfCollision = false;
 	m_collisionFlags = 0;
 	m_softSoftCollision = false;
 	m_maxSpeedSquared = 0;
 	m_repulsionStiffness = 0.5;
+	m_gravityFactor = 1;
+	m_cacheBarycenter = false;
 	m_fdbvnt = 0;
 }
 
@@ -436,7 +438,7 @@ void btSoftBody::appendFace(int model, Material* mat)
 		ZeroInitialize(f);
 		f.m_material = mat ? mat : m_materials[0];
 	}
-    m_faces.push_back(f);
+	m_faces.push_back(f);
 }
 
 //
@@ -525,94 +527,111 @@ void btSoftBody::appendAnchor(int node, btRigidBody* body, const btVector3& loca
 //
 void btSoftBody::appendDeformableAnchor(int node, btRigidBody* body)
 {
-    DeformableNodeRigidAnchor c;
-    btSoftBody::Node& n = m_nodes[node];
-    const btScalar ima = n.m_im;
-    const btScalar imb = body->getInvMass();
-    btVector3 nrm;
-    const btCollisionShape* shp = body->getCollisionShape();
-    const btTransform& wtr = body->getWorldTransform();
-    btScalar dst =
-    m_worldInfo->m_sparsesdf.Evaluate(
-                                      wtr.invXform(m_nodes[node].m_x),
-                                      shp,
-                                      nrm,
-                                      0);
-
-    c.m_cti.m_colObj = body;
-    c.m_cti.m_normal = wtr.getBasis() * nrm;
-    c.m_cti.m_offset = dst;
-    c.m_node = &m_nodes[node];
-    const btScalar fc = m_cfg.kDF * body->getFriction();
-    c.m_c2 = ima;
-    c.m_c3 = fc;
-    c.m_c4 = body->isStaticOrKinematicObject() ? m_cfg.kKHR : m_cfg.kCHR;
-    static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);
-    const btMatrix3x3& iwi = body->getInvInertiaTensorWorld();
-    const btVector3 ra = n.m_x - wtr.getOrigin();
-
-    c.m_c0 = ImpulseMatrix(1, ima, imb, iwi, ra);
-    c.m_c1 = ra;
-    c.m_local = body->getWorldTransform().inverse() * m_nodes[node].m_x;
-    c.m_node->m_battach = 1;
-    m_deformableAnchors.push_back(c);
+	DeformableNodeRigidAnchor c;  // anchor record filled in below and appended to m_deformableAnchors
+	btSoftBody::Node& n = m_nodes[node];
+	const btScalar ima = n.m_im;  // inverse mass of the node
+	const btScalar imb = body->getInvMass();  // inverse mass of the rigid body
+	btVector3 nrm;
+	const btCollisionShape* shp = body->getCollisionShape();
+	const btTransform& wtr = body->getWorldTransform();
+	btScalar dst =
+		m_worldInfo->m_sparsesdf.Evaluate(
+			wtr.invXform(m_nodes[node].m_x),
+			shp,
+			nrm,
+			0);
+
+	c.m_cti.m_colObj = body;
+	c.m_cti.m_normal = wtr.getBasis() * nrm;  // nrm rotated by the body's world basis
+	c.m_cti.m_offset = dst;
+	c.m_node = &m_nodes[node];  // raw pointer into m_nodes
+	const btScalar fc = m_cfg.kDF * body->getFriction();
+	c.m_c2 = ima;
+	c.m_c3 = fc;
+	c.m_c4 = body->isStaticOrKinematicObject() ? m_cfg.kKHR : m_cfg.kCHR;
+	static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);  // not referenced anywhere else in this function
+	const btMatrix3x3& iwi = body->getInvInertiaTensorWorld();
+	const btVector3 ra = n.m_x - wtr.getOrigin();  // offset from the body origin to the node, in world axes
+
+	c.m_c0 = ImpulseMatrix(1, ima, imb, iwi, ra);
+	c.m_c1 = ra;
+	c.m_local = body->getWorldTransform().inverse() * m_nodes[node].m_x;  // node position expressed in the body's frame
+	c.m_node->m_battach = 1;  // sets the node's m_battach flag
+	m_deformableAnchors.push_back(c);
+}
+
+void btSoftBody::removeAnchor(int node)  // removes every entry of m_deformableAnchors whose m_node is m_nodes[node]; no other container or node field is touched here
+{
+	const btSoftBody::Node& n = m_nodes[node];
+	for (int i = 0; i < m_deformableAnchors.size();)  // no increment here: i only advances when nothing was removed
+	{
+		const DeformableNodeRigidAnchor& c = m_deformableAnchors[i];
+		if (c.m_node == &n)  // anchors are matched by node address
+		{
+			m_deformableAnchors.removeAtIndex(i);  // NOTE(review): presumably shifts later entries down, so slot i must be re-checked - confirm
+		}
+		else
+		{
+			i++;
+		}
+	}
 }
 
 //
 void btSoftBody::appendDeformableAnchor(int node, btMultiBodyLinkCollider* link)
 {
-    DeformableNodeRigidAnchor c;
-    btSoftBody::Node& n = m_nodes[node];
-    const btScalar ima = n.m_im;
-    btVector3 nrm;
-    const btCollisionShape* shp = link->getCollisionShape();
-    const btTransform& wtr = link->getWorldTransform();
-    btScalar dst =
-    m_worldInfo->m_sparsesdf.Evaluate(
-                                      wtr.invXform(m_nodes[node].m_x),
-                                      shp,
-                                      nrm,
-                                      0);
-    c.m_cti.m_colObj = link;
-    c.m_cti.m_normal = wtr.getBasis() * nrm;
-    c.m_cti.m_offset = dst;
-    c.m_node = &m_nodes[node];
-    const btScalar fc = m_cfg.kDF * link->getFriction();
-    c.m_c2 = ima;
-    c.m_c3 = fc;
-    c.m_c4 = link->isStaticOrKinematicObject() ? m_cfg.kKHR : m_cfg.kCHR;
-    btVector3 normal = c.m_cti.m_normal;
-    btVector3 t1 = generateUnitOrthogonalVector(normal);
-    btVector3 t2 = btCross(normal, t1);
-    btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;
-    findJacobian(link, jacobianData_normal, c.m_node->m_x, normal);
-    findJacobian(link, jacobianData_t1, c.m_node->m_x, t1);
-    findJacobian(link, jacobianData_t2, c.m_node->m_x, t2);
-    
-    btScalar* J_n = &jacobianData_normal.m_jacobians[0];
-    btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
-    btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
-    
-    btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
-    btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
-    btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
-    
-    btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
-                    t1.getX(), t1.getY(), t1.getZ(),
-                    t2.getX(), t2.getY(), t2.getZ()); // world frame to local frame
-    const int ndof = link->m_multiBody->getNumDofs() + 6;
-    btMatrix3x3 local_impulse_matrix = (Diagonal(n.m_im) + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
-    c.m_c0 =  rot.transpose() * local_impulse_matrix * rot;
-    c.jacobianData_normal = jacobianData_normal;
-    c.jacobianData_t1 = jacobianData_t1;
-    c.jacobianData_t2 = jacobianData_t2;
-    c.t1 = t1;
-    c.t2 = t2;
-    const btVector3 ra = n.m_x - wtr.getOrigin();
-    c.m_c1 = ra;
-    c.m_local = link->getWorldTransform().inverse() * m_nodes[node].m_x;
-    c.m_node->m_battach = 1;
-    m_deformableAnchors.push_back(c);
+	DeformableNodeRigidAnchor c;  // anchor record filled in below and appended to m_deformableAnchors
+	btSoftBody::Node& n = m_nodes[node];
+	const btScalar ima = n.m_im;  // inverse mass of the node
+	btVector3 nrm;
+	const btCollisionShape* shp = link->getCollisionShape();
+	const btTransform& wtr = link->getWorldTransform();
+	btScalar dst =
+		m_worldInfo->m_sparsesdf.Evaluate(
+			wtr.invXform(m_nodes[node].m_x),
+			shp,
+			nrm,
+			0);
+	c.m_cti.m_colObj = link;
+	c.m_cti.m_normal = wtr.getBasis() * nrm;  // nrm rotated by the link's world basis
+	c.m_cti.m_offset = dst;
+	c.m_node = &m_nodes[node];  // raw pointer into m_nodes
+	const btScalar fc = m_cfg.kDF * link->getFriction();
+	c.m_c2 = ima;
+	c.m_c3 = fc;
+	c.m_c4 = link->isStaticOrKinematicObject() ? m_cfg.kKHR : m_cfg.kCHR;
+	btVector3 normal = c.m_cti.m_normal;
+	btVector3 t1 = generateUnitOrthogonalVector(normal);  // t1 and t2 become rows 2 and 3 of rot below, next to the normal
+	btVector3 t2 = btCross(normal, t1);
+	btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;  // one Jacobian data set per direction
+	findJacobian(link, jacobianData_normal, c.m_node->m_x, normal);
+	findJacobian(link, jacobianData_t1, c.m_node->m_x, t1);
+	findJacobian(link, jacobianData_t2, c.m_node->m_x, t2);
+
+	btScalar* J_n = &jacobianData_normal.m_jacobians[0];
+	btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
+	btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
+
+	btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
+	btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
+	btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
+
+	btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
+					t1.getX(), t1.getY(), t1.getZ(),
+					t2.getX(), t2.getY(), t2.getZ());  // world frame to local (normal, t1, t2) frame
+	const int ndof = link->m_multiBody->getNumDofs() + 6;  // NOTE(review): the +6 is presumably the floating-base dofs - confirm
+	btMatrix3x3 local_impulse_matrix = (Diagonal(n.m_im) + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
+	c.m_c0 = rot.transpose() * local_impulse_matrix * rot;  // local impulse matrix conjugated by rot
+	c.jacobianData_normal = jacobianData_normal;
+	c.jacobianData_t1 = jacobianData_t1;
+	c.jacobianData_t2 = jacobianData_t2;
+	c.t1 = t1;
+	c.t2 = t2;
+	const btVector3 ra = n.m_x - wtr.getOrigin();  // offset from the link origin to the node, in world axes
+	c.m_c1 = ra;
+	c.m_local = link->getWorldTransform().inverse() * m_nodes[node].m_x;  // node position expressed in the link's frame
+	c.m_node->m_battach = 1;  // sets the node's m_battach flag
+	m_deformableAnchors.push_back(c);
 }
 //
 void btSoftBody::appendLinearJoint(const LJoint::Specs& specs, Cluster* body0, Body body1)
@@ -731,7 +750,7 @@ void btSoftBody::addAeroForceToNode(const btVector3& windVelocity, int nodeIndex
 					fDrag = 0.5f * kDG * medium.m_density * rel_v2 * tri_area * n_dot_v * (-rel_v_nrm);
 
 					// Check angle of attack
-					// cos(10�) = 0.98480
+					// cos(10º) = 0.98480
 					if (0 < n_dot_v && n_dot_v < 0.98480f)
 						fLift = 0.5f * kLF * medium.m_density * rel_v_len * tri_area * btSqrt(1.0f - n_dot_v * n_dot_v) * (nrm.cross(rel_v_nrm).cross(rel_v_nrm));
 
@@ -817,7 +836,7 @@ void btSoftBody::addAeroForceToFace(const btVector3& windVelocity, int faceIndex
 				fDrag = 0.5f * kDG * medium.m_density * rel_v2 * tri_area * n_dot_v * (-rel_v_nrm);
 
 				// Check angle of attack
-				// cos(10�) = 0.98480
+				// cos(10º) = 0.98480
 				if (0 < n_dot_v && n_dot_v < 0.98480f)
 					fLift = 0.5f * kLF * medium.m_density * rel_v_len * tri_area * btSqrt(1.0f - n_dot_v * n_dot_v) * (nrm.cross(rel_v_nrm).cross(rel_v_nrm));
 
@@ -882,6 +901,7 @@ void btSoftBody::setVelocity(const btVector3& velocity)
 		if (n.m_im > 0)
 		{
 			n.m_v = velocity;
+			n.m_vn = velocity;
 		}
 	}
 }
@@ -1010,66 +1030,70 @@ void btSoftBody::setVolumeDensity(btScalar density)
 //
 btVector3 btSoftBody::getLinearVelocity()
 {
-    btVector3 total_momentum = btVector3(0,0,0);
-    for (int i = 0; i < m_nodes.size(); ++i)
-    {
-        btScalar mass = m_nodes[i].m_im == 0 ? 0 : 1.0/m_nodes[i].m_im;
-        total_momentum += mass * m_nodes[i].m_v;
-    }
-    btScalar total_mass = getTotalMass();
-    return total_mass == 0 ? total_momentum : total_momentum / total_mass;
+	btVector3 total_momentum = btVector3(0, 0, 0);  // accumulates mass * m_v over all nodes
+	for (int i = 0; i < m_nodes.size(); ++i)
+	{
+		btScalar mass = m_nodes[i].m_im == 0 ? 0 : 1.0 / m_nodes[i].m_im;  // nodes with m_im == 0 contribute no momentum
+		total_momentum += mass * m_nodes[i].m_v;
+	}
+	btScalar total_mass = getTotalMass();
+	return total_mass == 0 ? total_momentum : total_momentum / total_mass;  // momentum / mass; the division is skipped when the total mass is zero
 }
 
 //
 void btSoftBody::setLinearVelocity(const btVector3& linVel)
 {
-    btVector3 old_vel = getLinearVelocity();
-    btVector3 diff = linVel - old_vel;
-    for (int i = 0; i < m_nodes.size(); ++i)
-        m_nodes[i].m_v += diff;
+	btVector3 old_vel = getLinearVelocity();  // current mass-weighted mean velocity
+	btVector3 diff = linVel - old_vel;  // uniform offset applied to every node
+	for (int i = 0; i < m_nodes.size(); ++i)
+		m_nodes[i].m_v += diff;  // applied to all nodes regardless of m_im; only m_v is written
 }
 
 //
 void btSoftBody::setAngularVelocity(const btVector3& angVel)
 {
-    btVector3 old_vel = getLinearVelocity();
-    btVector3 com = getCenterOfMass();
-    for (int i = 0; i < m_nodes.size(); ++i)
-    {
-        m_nodes[i].m_v = angVel.cross(m_nodes[i].m_x - com) + old_vel;
-    }
+	btVector3 old_vel = getLinearVelocity();  // mean linear velocity, re-added to every node below
+	btVector3 com = getCenterOfMass();
+	for (int i = 0; i < m_nodes.size(); ++i)
+	{
+		m_nodes[i].m_v = angVel.cross(m_nodes[i].m_x - com) + old_vel;  // overwrites m_v: rotation about com plus the previous mean velocity
+	}
 }
 
 //
 btTransform btSoftBody::getRigidTransform()
 {
-    btVector3 t = getCenterOfMass();
-    btMatrix3x3 S;
-    S.setZero();
-    // get rotation that minimizes L2 difference: \sum_i || RX_i + t - x_i ||
-    for (int i = 0; i < m_nodes.size(); ++i)
-    {
-        S += OuterProduct(m_X[i], t-m_nodes[i].m_x);
-    }
-    btVector3 sigma;
-    btMatrix3x3 U,V;
-    singularValueDecomposition(S,U,sigma,V);
-    btMatrix3x3 R = V * U.transpose();
-    btTransform trs;
-    trs.setIdentity();
-    trs.setOrigin(t);
-    trs.setBasis(R);
-    return trs;
+	btVector3 t = getCenterOfMass();  // becomes the origin of the returned transform
+	btMatrix3x3 S;
+	S.setZero();
+	// Get rotation that minimizes L2 difference: \sum_i || RX_i + t - x_i ||
+	// It's important to make sure that S has the correct signs.
+	// SVD is only unique up to the ordering of singular values.
+	// SVD will manipulate U and V to ensure the ordering of singular values. If all three singular
+	// values are negative, SVD will permute columns of U to make two of them positive.
+	for (int i = 0; i < m_nodes.size(); ++i)
+	{
+		S -= OuterProduct(m_X[i], t - m_nodes[i].m_x);  // subtracts the outer product of m_X[i] with (t - x_i)
+	}
+	btVector3 sigma;
+	btMatrix3x3 U, V;
+	singularValueDecomposition(S, U, sigma, V);
+	btMatrix3x3 R = V * U.transpose();  // rotation part of the fit; sigma is not used afterwards
+	btTransform trs;
+	trs.setIdentity();
+	trs.setOrigin(t);
+	trs.setBasis(R);
+	return trs;
 }
 
 //
 void btSoftBody::transformTo(const btTransform& trs)
 {
-    // get the current best rigid fit
-    btTransform current_transform = getRigidTransform();
-    // apply transform in material space
-    btTransform new_transform = trs * current_transform.inverse();
-    transform(new_transform);
+	// get the current best rigid fit (see getRigidTransform())
+	btTransform current_transform = getRigidTransform();
+	// build the delta that maps the current fit onto trs, then apply it via transform()
+	btTransform new_transform = trs * current_transform.inverse();
+	transform(new_transform);
 }
 
 //
@@ -1130,7 +1154,7 @@ void btSoftBody::scale(const btVector3& scl)
 	updateNormals();
 	updateBounds();
 	updateConstants();
-    initializeDmInverse();
+	initializeDmInverse();
 }
 
 //
@@ -2010,22 +2034,22 @@ bool btSoftBody::rayTest(const btVector3& rayFrom,
 }
 
 bool btSoftBody::rayFaceTest(const btVector3& rayFrom,
-                         const btVector3& rayTo,
-                         sRayCast& results)
+							 const btVector3& rayTo,
+							 sRayCast& results)  // results.body / fraction / index are only written when the body has faces
 {
 	if (m_faces.size() == 0)
 		return false;
 	else
 	{
-    	if (m_fdbvt.empty())
-        	initializeFaceTree();
+		if (m_fdbvt.empty())  // build the face tree on demand
+			initializeFaceTree();
 	}
-    
-    results.body = this;
-    results.fraction = 1.f;
-    results.index = -1;
-    
-    return (rayFaceTest(rayFrom, rayTo, results.fraction,  results.index) != 0);
+
+	results.body = this;
+	results.fraction = 1.f;  // initial values handed to the four-argument overload below
+	results.index = -1;
+
+	return (rayFaceTest(rayFrom, rayTo, results.fraction, results.index) != 0);  // true when the four-argument overload returns non-zero
 }
 
 //
@@ -2056,112 +2080,111 @@ void btSoftBody::setSolver(eSolverPresets::_ preset)
 
 void btSoftBody::predictMotion(btScalar dt)
 {
-    int i, ni;
-    
-    /* Update                */
-    if (m_bUpdateRtCst)
-    {
-        m_bUpdateRtCst = false;
-        updateConstants();
-        m_fdbvt.clear();
-        if (m_cfg.collisions & fCollision::VF_SS)
-        {
-            initializeFaceTree();
-        }
-    }
-    
-    /* Prepare                */
-    m_sst.sdt = dt * m_cfg.timescale;
-    m_sst.isdt = 1 / m_sst.sdt;
-    m_sst.velmrg = m_sst.sdt * 3;
-    m_sst.radmrg = getCollisionShape()->getMargin();
-    m_sst.updmrg = m_sst.radmrg * (btScalar)0.25;
-    /* Forces                */
-    addVelocity(m_worldInfo->m_gravity * m_sst.sdt);
-    applyForces();
-    /* Integrate            */
-    for (i = 0, ni = m_nodes.size(); i < ni; ++i)
-    {
-        Node& n = m_nodes[i];
-        n.m_q = n.m_x;
-        btVector3 deltaV = n.m_f * n.m_im * m_sst.sdt;
-        {
-            btScalar maxDisplacement = m_worldInfo->m_maxDisplacement;
-            btScalar clampDeltaV = maxDisplacement / m_sst.sdt;
-            for (int c = 0; c < 3; c++)
-            {
-                if (deltaV[c] > clampDeltaV)
-                {
-                    deltaV[c] = clampDeltaV;
-                }
-                if (deltaV[c] < -clampDeltaV)
-                {
-                    deltaV[c] = -clampDeltaV;
-                }
-            }
-        }
-        n.m_v += deltaV;
-        n.m_x += n.m_v * m_sst.sdt;
-        n.m_f = btVector3(0, 0, 0);
-    }
-    /* Clusters                */
-    updateClusters();
-    /* Bounds                */
-    updateBounds();
-    /* Nodes                */
-    ATTRIBUTE_ALIGNED16(btDbvtVolume)
-    vol;
-    for (i = 0, ni = m_nodes.size(); i < ni; ++i)
-    {
-        Node& n = m_nodes[i];
-        vol = btDbvtVolume::FromCR(n.m_x, m_sst.radmrg);
-        m_ndbvt.update(n.m_leaf,
-                       vol,
-                       n.m_v * m_sst.velmrg,
-                       m_sst.updmrg);
-    }
-    /* Faces                */
-    if (!m_fdbvt.empty())
-    {
-        for (int i = 0; i < m_faces.size(); ++i)
-        {
-            Face& f = m_faces[i];
-            const btVector3 v = (f.m_n[0]->m_v +
-                                 f.m_n[1]->m_v +
-                                 f.m_n[2]->m_v) /
-            3;
-            vol = VolumeOf(f, m_sst.radmrg);
-            m_fdbvt.update(f.m_leaf,
-                           vol,
-                           v * m_sst.velmrg,
-                           m_sst.updmrg);
-        }
-    }
-    /* Pose                    */
-    updatePose();
-    /* Match                */
-    if (m_pose.m_bframe && (m_cfg.kMT > 0))
-    {
-        const btMatrix3x3 posetrs = m_pose.m_rot;
-        for (int i = 0, ni = m_nodes.size(); i < ni; ++i)
-        {
-            Node& n = m_nodes[i];
-            if (n.m_im > 0)
-            {
-                const btVector3 x = posetrs * m_pose.m_pos[i] + m_pose.m_com;
-                n.m_x = Lerp(n.m_x, x, m_cfg.kMT);
-            }
-        }
-    }
-    /* Clear contacts        */
-    m_rcontacts.resize(0);
-    m_scontacts.resize(0);
-    /* Optimize dbvt's        */
-    m_ndbvt.optimizeIncremental(1);
-    m_fdbvt.optimizeIncremental(1);
-    m_cdbvt.optimizeIncremental(1);
-}
+	int i, ni;
 
+	/* Update                */
+	if (m_bUpdateRtCst)
+	{
+		m_bUpdateRtCst = false;
+		updateConstants();
+		m_fdbvt.clear();
+		if (m_cfg.collisions & fCollision::VF_SS)
+		{
+			initializeFaceTree();
+		}
+	}
+
+	/* Prepare                */
+	m_sst.sdt = dt * m_cfg.timescale;
+	m_sst.isdt = 1 / m_sst.sdt;
+	m_sst.velmrg = m_sst.sdt * 3;
+	m_sst.radmrg = getCollisionShape()->getMargin();
+	m_sst.updmrg = m_sst.radmrg * (btScalar)0.25;
+	/* Forces                */
+	addVelocity(m_worldInfo->m_gravity * m_sst.sdt);
+	applyForces();
+	/* Integrate            */
+	for (i = 0, ni = m_nodes.size(); i < ni; ++i)
+	{
+		Node& n = m_nodes[i];
+		n.m_q = n.m_x;
+		btVector3 deltaV = n.m_f * n.m_im * m_sst.sdt;
+		{
+			btScalar maxDisplacement = m_worldInfo->m_maxDisplacement;
+			btScalar clampDeltaV = maxDisplacement / m_sst.sdt;
+			for (int c = 0; c < 3; c++)
+			{
+				if (deltaV[c] > clampDeltaV)
+				{
+					deltaV[c] = clampDeltaV;
+				}
+				if (deltaV[c] < -clampDeltaV)
+				{
+					deltaV[c] = -clampDeltaV;
+				}
+			}
+		}
+		n.m_v += deltaV;
+		n.m_x += n.m_v * m_sst.sdt;
+		n.m_f = btVector3(0, 0, 0);
+	}
+	/* Clusters                */
+	updateClusters();
+	/* Bounds                */
+	updateBounds();
+	/* Nodes                */
+	ATTRIBUTE_ALIGNED16(btDbvtVolume)
+	vol;
+	for (i = 0, ni = m_nodes.size(); i < ni; ++i)
+	{
+		Node& n = m_nodes[i];
+		vol = btDbvtVolume::FromCR(n.m_x, m_sst.radmrg);
+		m_ndbvt.update(n.m_leaf,
+					   vol,
+					   n.m_v * m_sst.velmrg,
+					   m_sst.updmrg);
+	}
+	/* Faces                */
+	if (!m_fdbvt.empty())
+	{
+		for (int i = 0; i < m_faces.size(); ++i)
+		{
+			Face& f = m_faces[i];
+			const btVector3 v = (f.m_n[0]->m_v +
+								 f.m_n[1]->m_v +
+								 f.m_n[2]->m_v) /
+								3;
+			vol = VolumeOf(f, m_sst.radmrg);
+			m_fdbvt.update(f.m_leaf,
+						   vol,
+						   v * m_sst.velmrg,
+						   m_sst.updmrg);
+		}
+	}
+	/* Pose                    */
+	updatePose();
+	/* Match                */
+	if (m_pose.m_bframe && (m_cfg.kMT > 0))
+	{
+		const btMatrix3x3 posetrs = m_pose.m_rot;
+		for (int i = 0, ni = m_nodes.size(); i < ni; ++i)
+		{
+			Node& n = m_nodes[i];
+			if (n.m_im > 0)
+			{
+				const btVector3 x = posetrs * m_pose.m_pos[i] + m_pose.m_com;
+				n.m_x = Lerp(n.m_x, x, m_cfg.kMT);
+			}
+		}
+	}
+	/* Clear contacts        */
+	m_rcontacts.resize(0);
+	m_scontacts.resize(0);
+	/* Optimize dbvt's        */
+	m_ndbvt.optimizeIncremental(1);
+	m_fdbvt.optimizeIncremental(1);
+	m_cdbvt.optimizeIncremental(1);
+}
 
 //
 void btSoftBody::solveConstraints()
@@ -2534,12 +2557,12 @@ int btSoftBody::rayTest(const btVector3& rayFrom, const btVector3& rayTo,
 }
 
 int btSoftBody::rayFaceTest(const btVector3& rayFrom, const btVector3& rayTo,
-						btScalar& mint, int& index) const
+							btScalar& mint, int& index) const
 {
 	int cnt = 0;
 	{ /* Use dbvt	*/
 		RayFromToCaster collider(rayFrom, rayTo, mint);
-		
+
 		btDbvt::rayTest(m_fdbvt.m_root, rayFrom, rayTo, collider);
 		if (collider.m_face)
 		{
@@ -2551,7 +2574,6 @@ int btSoftBody::rayFaceTest(const btVector3& rayFrom, const btVector3& rayTo,
 	return (cnt);
 }
 
-
 //
 static inline btDbvntNode* copyToDbvnt(const btDbvtNode* n)
 {
@@ -2580,7 +2602,7 @@ static inline void calculateNormalCone(btDbvntNode* root)
 	}
 	else
 	{
-		btVector3 n0(0,0,0), n1(0,0,0);
+		btVector3 n0(0, 0, 0), n1(0, 0, 0);
 		btScalar a0 = 0, a1 = 0;
 		if (root->childs[0])
 		{
@@ -2594,8 +2616,8 @@ static inline void calculateNormalCone(btDbvntNode* root)
 			n1 = root->childs[1]->normal;
 			a1 = root->childs[1]->angle;
 		}
-		root->normal = (n0+n1).safeNormalize();
-		root->angle = btMax(a0,a1) + btAngle(n0, n1)*0.5;
+		root->normal = (n0 + n1).safeNormalize();
+		root->angle = btMax(a0, a1) + btAngle(n0, n1) * 0.5;
 	}
 }
 
@@ -2609,7 +2631,8 @@ void btSoftBody::initializeFaceTree()
 	for (int i = 0; i < m_faces.size(); ++i)
 	{
 		Face& f = m_faces[i];
-		ATTRIBUTE_ALIGNED16(btDbvtVolume) vol = VolumeOf(f, 0);
+		ATTRIBUTE_ALIGNED16(btDbvtVolume)
+		vol = VolumeOf(f, 0);
 		btDbvtNode* node = new (btAlignedAlloc(sizeof(btDbvtNode), 16)) btDbvtNode();
 		node->parent = NULL;
 		node->data = &f;
@@ -2623,7 +2646,7 @@ void btSoftBody::initializeFaceTree()
 	// construct the adjacency list for triangles
 	for (int i = 0; i < adj.size(); ++i)
 	{
-		for (int j = i+1; j < adj.size(); ++j)
+		for (int j = i + 1; j < adj.size(); ++j)
 		{
 			int dup = 0;
 			for (int k = 0; k < 3; ++k)
@@ -2661,7 +2684,8 @@ void btSoftBody::rebuildNodeTree()
 	for (int i = 0; i < m_nodes.size(); ++i)
 	{
 		Node& n = m_nodes[i];
-		ATTRIBUTE_ALIGNED16(btDbvtVolume) vol = btDbvtVolume::FromCR(n.m_x, 0);
+		ATTRIBUTE_ALIGNED16(btDbvtVolume)
+		vol = btDbvtVolume::FromCR(n.m_x, 0);
 		btDbvtNode* node = new (btAlignedAlloc(sizeof(btDbvtNode), 16)) btDbvtNode();
 		node->parent = NULL;
 		node->data = &n;
@@ -2704,61 +2728,61 @@ btVector3 btSoftBody::evaluateCom() const
 }
 
 bool btSoftBody::checkContact(const btCollisionObjectWrapper* colObjWrap,
-                              const btVector3& x,
-                              btScalar margin,
-                              btSoftBody::sCti& cti) const
-{
-    btVector3 nrm;
-    const btCollisionShape* shp = colObjWrap->getCollisionShape();
-    //    const btRigidBody *tmpRigid = btRigidBody::upcast(colObjWrap->getCollisionObject());
-    //const btTransform &wtr = tmpRigid ? tmpRigid->getWorldTransform() : colObjWrap->getWorldTransform();
-    const btTransform& wtr = colObjWrap->getWorldTransform();
-    //todo: check which transform is needed here
-    
-    btScalar dst =
-    m_worldInfo->m_sparsesdf.Evaluate(
-                                      wtr.invXform(x),
-                                      shp,
-                                      nrm,
-                                      margin);
-    if (dst < 0)
-    {
-        cti.m_colObj = colObjWrap->getCollisionObject();
-        cti.m_normal = wtr.getBasis() * nrm;
-        cti.m_offset = -btDot(cti.m_normal, x - cti.m_normal * dst);
-        return (true);
-    }
-    return (false);
+							  const btVector3& x,
+							  btScalar margin,
+							  btSoftBody::sCti& cti) const
+{
+	btVector3 nrm;
+	const btCollisionShape* shp = colObjWrap->getCollisionShape();
+	//    const btRigidBody *tmpRigid = btRigidBody::upcast(colObjWrap->getCollisionObject());
+	//const btTransform &wtr = tmpRigid ? tmpRigid->getWorldTransform() : colObjWrap->getWorldTransform();
+	const btTransform& wtr = colObjWrap->getWorldTransform();
+	//todo: check which transform is needed here
+
+	btScalar dst =
+		m_worldInfo->m_sparsesdf.Evaluate(
+			wtr.invXform(x),
+			shp,
+			nrm,
+			margin);
+	if (dst < 0)
+	{
+		cti.m_colObj = colObjWrap->getCollisionObject();
+		cti.m_normal = wtr.getBasis() * nrm;
+		cti.m_offset = -btDot(cti.m_normal, x - cti.m_normal * dst);
+		return (true);
+	}
+	return (false);
 }
 
 //
 bool btSoftBody::checkDeformableContact(const btCollisionObjectWrapper* colObjWrap,
-							  const btVector3& x,
-							  btScalar margin,
-							  btSoftBody::sCti& cti, bool predict) const
+										const btVector3& x,
+										btScalar margin,
+										btSoftBody::sCti& cti, bool predict) const
 {
 	btVector3 nrm;
 	const btCollisionShape* shp = colObjWrap->getCollisionShape();
-    const btCollisionObject* tmpCollisionObj = colObjWrap->getCollisionObject();
-    // use the position x_{n+1}^* = x_n + dt * v_{n+1}^* where v_{n+1}^* = v_n + dtg for collision detect
-    // but resolve contact at x_n
-    btTransform wtr = (predict) ?
-    (colObjWrap->m_preTransform != NULL ? tmpCollisionObj->getInterpolationWorldTransform()*(*colObjWrap->m_preTransform) : tmpCollisionObj->getInterpolationWorldTransform())
-                 : colObjWrap->getWorldTransform();
+	const btCollisionObject* tmpCollisionObj = colObjWrap->getCollisionObject();
+	// use the position x_{n+1}^* = x_n + dt * v_{n+1}^* where v_{n+1}^* = v_n + dtg for collision detect
+	// but resolve contact at x_n
+	btTransform wtr = (predict) ? (colObjWrap->m_preTransform != NULL ? tmpCollisionObj->getInterpolationWorldTransform() * (*colObjWrap->m_preTransform) : tmpCollisionObj->getInterpolationWorldTransform())
+								: colObjWrap->getWorldTransform();
 	btScalar dst =
 		m_worldInfo->m_sparsesdf.Evaluate(
 			wtr.invXform(x),
 			shp,
 			nrm,
 			margin);
+
 	if (!predict)
 	{
 		cti.m_colObj = colObjWrap->getCollisionObject();
 		cti.m_normal = wtr.getBasis() * nrm;
-        cti.m_offset = dst;
+		cti.m_offset = dst;
 	}
-    if (dst < 0)
-        return true;
+	if (dst < 0)
+		return true;
 	return (false);
 }
 
@@ -2767,175 +2791,131 @@ bool btSoftBody::checkDeformableContact(const btCollisionObjectWrapper* colObjWr
 // point p with respect to triangle (a, b, c)
 static void getBarycentric(const btVector3& p, btVector3& a, btVector3& b, btVector3& c, btVector3& bary)
 {
-    btVector3 v0 = b - a, v1 = c - a, v2 = p - a;
-    btScalar d00 = v0.dot(v0);
-    btScalar d01 = v0.dot(v1);
-    btScalar d11 = v1.dot(v1);
-    btScalar d20 = v2.dot(v0);
-    btScalar d21 = v2.dot(v1);
-    btScalar denom = d00 * d11 - d01 * d01;
-    bary.setY((d11 * d20 - d01 * d21) / denom);
-    bary.setZ((d00 * d21 - d01 * d20) / denom);
-    bary.setX(btScalar(1) - bary.getY() - bary.getZ());
+	btVector3 v0 = b - a, v1 = c - a, v2 = p - a;
+	btScalar d00 = v0.dot(v0);
+	btScalar d01 = v0.dot(v1);
+	btScalar d11 = v1.dot(v1);
+	btScalar d20 = v2.dot(v0);
+	btScalar d21 = v2.dot(v1);
+	btScalar denom = d00 * d11 - d01 * d01;
+	bary.setY((d11 * d20 - d01 * d21) / denom);
+	bary.setZ((d00 * d21 - d01 * d20) / denom);
+	bary.setX(btScalar(1) - bary.getY() - bary.getZ());
 }
 
 //
 bool btSoftBody::checkDeformableFaceContact(const btCollisionObjectWrapper* colObjWrap,
-                                        Face& f,
-                                        btVector3& contact_point,
-                                        btVector3& bary,
-                                        btScalar margin,
-                                        btSoftBody::sCti& cti, bool predict) const
-{
-    btVector3 nrm;
-    const btCollisionShape* shp = colObjWrap->getCollisionShape();
-    const btCollisionObject* tmpCollisionObj = colObjWrap->getCollisionObject();
-    // use the position x_{n+1}^* = x_n + dt * v_{n+1}^* where v_{n+1}^* = v_n + dtg for collision detect
-    // but resolve contact at x_n
-    btTransform wtr = (predict) ?
-    (colObjWrap->m_preTransform != NULL ? tmpCollisionObj->getInterpolationWorldTransform()*(*colObjWrap->m_preTransform) : tmpCollisionObj->getInterpolationWorldTransform())
-    : colObjWrap->getWorldTransform();
-    btScalar dst;
-    
-//#define USE_QUADRATURE 1
-//#define CACHE_PREV_COLLISION
-    
-    // use the contact position of the previous collision
-#ifdef CACHE_PREV_COLLISION
-    if (f.m_pcontact[3] != 0)
-    {
-        for (int i = 0; i < 3; ++i)
-            bary[i] = f.m_pcontact[i];
-        contact_point = BaryEval(f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, bary);
-        dst = m_worldInfo->m_sparsesdf.Evaluate(
-                                          wtr.invXform(contact_point),
-                                          shp,
-                                          nrm,
-                                          margin);
-        nrm = wtr.getBasis() * nrm;
-        cti.m_colObj = colObjWrap->getCollisionObject();
-        // use cached contact point
-    }
-    else
-    {
-        btGjkEpaSolver2::sResults results;
-        btTransform triangle_transform;
-        triangle_transform.setIdentity();
-        triangle_transform.setOrigin(f.m_n[0]->m_x);
-        btTriangleShape triangle(btVector3(0,0,0), f.m_n[1]->m_x-f.m_n[0]->m_x, f.m_n[2]->m_x-f.m_n[0]->m_x);
-        btVector3 guess(0,0,0);
-        const btConvexShape* csh = static_cast<const btConvexShape*>(shp);
-        btGjkEpaSolver2::SignedDistance(&triangle, triangle_transform, csh, wtr, guess, results);
-        dst = results.distance - margin;
-        contact_point = results.witnesses[0];
-        getBarycentric(contact_point, f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, bary);
-        nrm = results.normal;
-        cti.m_colObj = colObjWrap->getCollisionObject();
-        for (int i = 0; i < 3; ++i)
-            f.m_pcontact[i] = bary[i];
-    }
-    return (dst < 0);
-#endif
+											Face& f,
+											btVector3& contact_point,
+											btVector3& bary,
+											btScalar margin,
+											btSoftBody::sCti& cti, bool predict) const
+{
+	btVector3 nrm;
+	const btCollisionShape* shp = colObjWrap->getCollisionShape();
+	const btCollisionObject* tmpCollisionObj = colObjWrap->getCollisionObject();
+	// use the position x_{n+1}^* = x_n + dt * v_{n+1}^* where v_{n+1}^* = v_n + dtg for collision detect
+	// but resolve contact at x_n
+	btTransform wtr = (predict) ? (colObjWrap->m_preTransform != NULL ? tmpCollisionObj->getInterpolationWorldTransform() * (*colObjWrap->m_preTransform) : tmpCollisionObj->getInterpolationWorldTransform())
+								: colObjWrap->getWorldTransform();
+	btScalar dst;
+	btGjkEpaSolver2::sResults results;
+
+//	#define USE_QUADRATURE 1
 
-    // use collision quadrature point
+	// use collision quadrature point
 #ifdef USE_QUADRATURE
-    {
-        dst = SIMD_INFINITY;
-        btVector3 local_nrm;
-        for (int q = 0; q < m_quads.size(); ++q)
-        {
-            btVector3 p;
-            if (predict)
-                p = BaryEval(f.m_n[0]->m_q, f.m_n[1]->m_q, f.m_n[2]->m_q, m_quads[q]);
-            else
-                p = BaryEval(f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, m_quads[q]);
-            btScalar local_dst = m_worldInfo->m_sparsesdf.Evaluate(
-                                                    wtr.invXform(p),
-                                                    shp,
-                                                    local_nrm,
-                                                    margin);
-            if (local_dst < dst)
-            {
-                if (local_dst < 0 && predict)
-                    return true;
-                dst = local_dst;
-                contact_point = p;
-                bary = m_quads[q];
-                nrm = local_nrm;
-            }
-            if (!predict)
-            {
-                cti.m_colObj = colObjWrap->getCollisionObject();
-                cti.m_normal = wtr.getBasis() * nrm;
-                cti.m_offset = dst;
-            }
-        }
-        return (dst < 0);
-    }
+	{
+		dst = SIMD_INFINITY;
+		btVector3 local_nrm;
+		for (int q = 0; q < m_quads.size(); ++q)
+		{
+			btVector3 p;
+			if (predict)
+				p = BaryEval(f.m_n[0]->m_q, f.m_n[1]->m_q, f.m_n[2]->m_q, m_quads[q]);
+			else
+				p = BaryEval(f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, m_quads[q]);
+			btScalar local_dst = m_worldInfo->m_sparsesdf.Evaluate(
+				wtr.invXform(p),
+				shp,
+				local_nrm,
+				margin);
+			if (local_dst < dst)
+			{
+				if (local_dst < 0 && predict)
+					return true;
+				dst = local_dst;
+				contact_point = p;
+				bary = m_quads[q];
+				nrm = local_nrm;
+			}
+			if (!predict)
+			{
+				cti.m_colObj = colObjWrap->getCollisionObject();
+				cti.m_normal = wtr.getBasis() * nrm;
+				cti.m_offset = dst;
+			}
+		}
+		return (dst < 0);
+	}
 #endif
-    
-//    // regular face contact
-//    {
-//        btGjkEpaSolver2::sResults results;
-//        btTransform triangle_transform;
-//        triangle_transform.setIdentity();
-//        triangle_transform.setOrigin(f.m_n[0]->m_x);
-//        btTriangleShape triangle(btVector3(0,0,0), f.m_n[1]->m_x-f.m_n[0]->m_x, f.m_n[2]->m_x-f.m_n[0]->m_x);
-//        btVector3 guess(0,0,0);
-//        if (predict)
-//        {
-//            triangle_transform.setOrigin(f.m_n[0]->m_q);
-//            triangle = btTriangleShape(btVector3(0,0,0), f.m_n[1]->m_q-f.m_n[0]->m_q, f.m_n[2]->m_q-f.m_n[0]->m_q);
-//        }
-//        const btConvexShape* csh = static_cast<const btConvexShape*>(shp);
-////        btGjkEpaSolver2::SignedDistance(&triangle, triangle_transform, csh, wtr, guess, results);
-////        dst = results.distance - margin;
-////        contact_point = results.witnesses[0];
-//        btGjkEpaSolver2::Penetration(&triangle, triangle_transform, csh, wtr, guess, results);
-//        if (results.status == btGjkEpaSolver2::sResults::Separated)
-//            return false;
-//        dst = results.distance - margin;
-//        contact_point = results.witnesses[1];
-//        getBarycentric(contact_point, f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, bary);
-//        nrm = results.normal;
-//        for (int i = 0; i < 3; ++i)
-//            f.m_pcontact[i] = bary[i];
-//    }
-//
-//    if (!predict)
-//    {
-//        cti.m_colObj = colObjWrap->getCollisionObject();
-//        cti.m_normal = nrm;
-//        cti.m_offset = dst;
-//    }
-//
-    
-    // regular face contact
-    {
-        btGjkEpaSolver2::sResults results;
-        btTransform triangle_transform;
-        triangle_transform.setIdentity();
-        triangle_transform.setOrigin(f.m_n[0]->m_q);
-        btTriangleShape triangle(btVector3(0,0,0), f.m_n[1]->m_q-f.m_n[0]->m_q, f.m_n[2]->m_q-f.m_n[0]->m_q);
-        btVector3 guess(0,0,0);
-        const btConvexShape* csh = static_cast<const btConvexShape*>(shp);
-        btGjkEpaSolver2::SignedDistance(&triangle, triangle_transform, csh, wtr, guess, results);
-        dst = results.distance-csh->getMargin();
-        dst -= margin;
-        if (dst >= 0)
-            return false;
-        contact_point = results.witnesses[0];
-        getBarycentric(contact_point, f.m_n[0]->m_q, f.m_n[1]->m_q, f.m_n[2]->m_q, bary);
-        btVector3 curr = BaryEval(f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, bary);
-        nrm = results.normal;
-        cti.m_colObj = colObjWrap->getCollisionObject();
-        cti.m_normal = nrm;
-        cti.m_offset = dst + (curr - contact_point).dot(nrm);
-    }
-    return (dst < 0);
+
+	// collision detection using x*
+	btTransform triangle_transform;
+	triangle_transform.setIdentity();
+	triangle_transform.setOrigin(f.m_n[0]->m_q);
+	btTriangleShape triangle(btVector3(0, 0, 0), f.m_n[1]->m_q - f.m_n[0]->m_q, f.m_n[2]->m_q - f.m_n[0]->m_q);
+	btVector3 guess(0, 0, 0);
+	const btConvexShape* csh = static_cast<const btConvexShape*>(shp);
+	btGjkEpaSolver2::SignedDistance(&triangle, triangle_transform, csh, wtr, guess, results);
+	dst = results.distance - 2.0 * csh->getMargin() - margin;  // margin padding so that the distance = the actual distance between face and rigid - margin of rigid - margin of deformable
+	if (dst >= 0)
+		return false;
+
+	// Use consistent barycenter to recalculate distance.
+	if (this->m_cacheBarycenter)
+	{
+		if (f.m_pcontact[3] != 0)
+		{
+			for (int i = 0; i < 3; ++i)
+				bary[i] = f.m_pcontact[i];
+			contact_point = BaryEval(f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, bary);
+			const btConvexShape* csh = static_cast<const btConvexShape*>(shp);
+			btGjkEpaSolver2::SignedDistance(contact_point, margin, csh, wtr, results);
+			cti.m_colObj = colObjWrap->getCollisionObject();
+			dst = results.distance;
+			cti.m_normal = results.normal;
+			cti.m_offset = dst;
+
+			//point-convex CD
+			wtr = colObjWrap->getWorldTransform();
+			btTriangleShape triangle2(btVector3(0, 0, 0), f.m_n[1]->m_x - f.m_n[0]->m_x, f.m_n[2]->m_x - f.m_n[0]->m_x);
+			triangle_transform.setOrigin(f.m_n[0]->m_x);
+			btGjkEpaSolver2::SignedDistance(&triangle2, triangle_transform, csh, wtr, guess, results);
+
+			dst = results.distance - csh->getMargin() - margin;
+			return true;
+		}
+	}
+
+	// Use triangle-convex CD.
+	wtr = colObjWrap->getWorldTransform();
+	btTriangleShape triangle2(btVector3(0, 0, 0), f.m_n[1]->m_x - f.m_n[0]->m_x, f.m_n[2]->m_x - f.m_n[0]->m_x);
+	triangle_transform.setOrigin(f.m_n[0]->m_x);
+	btGjkEpaSolver2::SignedDistance(&triangle2, triangle_transform, csh, wtr, guess, results);
+	contact_point = results.witnesses[0];
+	getBarycentric(contact_point, f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, bary);
+
+	for (int i = 0; i < 3; ++i)
+		f.m_pcontact[i] = bary[i];
+
+	dst = results.distance - csh->getMargin() - margin;
+	cti.m_colObj = colObjWrap->getCollisionObject();
+	cti.m_normal = results.normal;
+	cti.m_offset = dst;
+	return true;
 }
 
-//
 void btSoftBody::updateNormals()
 {
 	const btVector3 zv(0, 0, 0);
@@ -2979,63 +2959,63 @@ void btSoftBody::updateBounds()
 		m_bounds[1] = btVector3(1000, 1000, 1000);
 
 	} else {*/
-//    if (m_ndbvt.m_root)
-//    {
-//        const btVector3& mins = m_ndbvt.m_root->volume.Mins();
-//        const btVector3& maxs = m_ndbvt.m_root->volume.Maxs();
-//        const btScalar csm = getCollisionShape()->getMargin();
-//        const btVector3 mrg = btVector3(csm,
-//                                        csm,
-//                                        csm) *
-//                              1;  // ??? to investigate...
-//        m_bounds[0] = mins - mrg;
-//        m_bounds[1] = maxs + mrg;
-//        if (0 != getBroadphaseHandle())
-//        {
-//            m_worldInfo->m_broadphase->setAabb(getBroadphaseHandle(),
-//                                               m_bounds[0],
-//                                               m_bounds[1],
-//                                               m_worldInfo->m_dispatcher);
-//        }
-//    }
-//    else
-//    {
-//        m_bounds[0] =
-//            m_bounds[1] = btVector3(0, 0, 0);
-//    }
-    if (m_nodes.size())
-    {
-        btVector3 mins = m_nodes[0].m_x;
-        btVector3 maxs = m_nodes[0].m_x;
-        for (int i = 1; i < m_nodes.size(); ++i)
-        {
-            for (int d = 0; d < 3; ++d)
-            {
-                if (m_nodes[i].m_x[d] > maxs[d])
-                    maxs[d] = m_nodes[i].m_x[d];
-                if (m_nodes[i].m_x[d] < mins[d])
-                    mins[d] = m_nodes[i].m_x[d];
-            }
-        }
-        const btScalar csm = getCollisionShape()->getMargin();
-        const btVector3 mrg = btVector3(csm,
-                                        csm,
-                                        csm);
-        m_bounds[0] = mins - mrg;
-        m_bounds[1] = maxs + mrg;
-        if (0 != getBroadphaseHandle())
-        {
-            m_worldInfo->m_broadphase->setAabb(getBroadphaseHandle(),
-                                               m_bounds[0],
-                                               m_bounds[1],
-                                               m_worldInfo->m_dispatcher);
-        }
-    }
-    else
-    {
-        m_bounds[0] =
-        m_bounds[1] = btVector3(0, 0, 0);
-    }
+	//    if (m_ndbvt.m_root)
+	//    {
+	//        const btVector3& mins = m_ndbvt.m_root->volume.Mins();
+	//        const btVector3& maxs = m_ndbvt.m_root->volume.Maxs();
+	//        const btScalar csm = getCollisionShape()->getMargin();
+	//        const btVector3 mrg = btVector3(csm,
+	//                                        csm,
+	//                                        csm) *
+	//                              1;  // ??? to investigate...
+	//        m_bounds[0] = mins - mrg;
+	//        m_bounds[1] = maxs + mrg;
+	//        if (0 != getBroadphaseHandle())
+	//        {
+	//            m_worldInfo->m_broadphase->setAabb(getBroadphaseHandle(),
+	//                                               m_bounds[0],
+	//                                               m_bounds[1],
+	//                                               m_worldInfo->m_dispatcher);
+	//        }
+	//    }
+	//    else
+	//    {
+	//        m_bounds[0] =
+	//            m_bounds[1] = btVector3(0, 0, 0);
+	//    }
+	if (m_nodes.size())
+	{
+		btVector3 mins = m_nodes[0].m_x;
+		btVector3 maxs = m_nodes[0].m_x;
+		for (int i = 1; i < m_nodes.size(); ++i)
+		{
+			for (int d = 0; d < 3; ++d)
+			{
+				if (m_nodes[i].m_x[d] > maxs[d])
+					maxs[d] = m_nodes[i].m_x[d];
+				if (m_nodes[i].m_x[d] < mins[d])
+					mins[d] = m_nodes[i].m_x[d];
+			}
+		}
+		const btScalar csm = getCollisionShape()->getMargin();
+		const btVector3 mrg = btVector3(csm,
+										csm,
+										csm);
+		m_bounds[0] = mins - mrg;
+		m_bounds[1] = maxs + mrg;
+		if (0 != getBroadphaseHandle())
+		{
+			m_worldInfo->m_broadphase->setAabb(getBroadphaseHandle(),
+											   m_bounds[0],
+											   m_bounds[1],
+											   m_worldInfo->m_dispatcher);
+		}
+	}
+	else
+	{
+		m_bounds[0] =
+			m_bounds[1] = btVector3(0, 0, 0);
+	}
 }
 
 //
@@ -3454,60 +3434,120 @@ void btSoftBody::dampClusters()
 
 void btSoftBody::setSpringStiffness(btScalar k)
 {
-    for (int i = 0; i < m_links.size(); ++i)
-    {
-        m_links[i].Feature::m_material->m_kLST = k;
-    }
-    m_repulsionStiffness = k;
+	for (int i = 0; i < m_links.size(); ++i)
+	{
+		m_links[i].Feature::m_material->m_kLST = k;
+	}
+	m_repulsionStiffness = k;
+}
+
+void btSoftBody::setGravityFactor(btScalar gravFactor)
+{
+	m_gravityFactor = gravFactor;
+}
+
+void btSoftBody::setCacheBarycenter(bool cacheBarycenter)
+{
+	m_cacheBarycenter = cacheBarycenter;
 }
 
 void btSoftBody::initializeDmInverse()
 {
-    btScalar unit_simplex_measure = 1./6.;
-    
-    for (int i = 0; i < m_tetras.size(); ++i)
-    {
-        Tetra &t = m_tetras[i];
-        btVector3 c1 = t.m_n[1]->m_x - t.m_n[0]->m_x;
-        btVector3 c2 = t.m_n[2]->m_x - t.m_n[0]->m_x;
-        btVector3 c3 = t.m_n[3]->m_x - t.m_n[0]->m_x;
-        btMatrix3x3 Dm(c1.getX(), c2.getX(), c3.getX(),
-                       c1.getY(), c2.getY(), c3.getY(),
-                       c1.getZ(), c2.getZ(), c3.getZ());
-        t.m_element_measure = Dm.determinant() * unit_simplex_measure;
-        t.m_Dm_inverse = Dm.inverse();
-    }
+	btScalar unit_simplex_measure = 1. / 6.;
+
+	for (int i = 0; i < m_tetras.size(); ++i)
+	{
+		Tetra& t = m_tetras[i];
+		btVector3 c1 = t.m_n[1]->m_x - t.m_n[0]->m_x;
+		btVector3 c2 = t.m_n[2]->m_x - t.m_n[0]->m_x;
+		btVector3 c3 = t.m_n[3]->m_x - t.m_n[0]->m_x;
+		btMatrix3x3 Dm(c1.getX(), c2.getX(), c3.getX(),
+					   c1.getY(), c2.getY(), c3.getY(),
+					   c1.getZ(), c2.getZ(), c3.getZ());
+		t.m_element_measure = Dm.determinant() * unit_simplex_measure;
+		t.m_Dm_inverse = Dm.inverse();
+
+		// calculate the first three columns of P^{-1}
+		btVector3 a = t.m_n[0]->m_x;
+		btVector3 b = t.m_n[1]->m_x;
+		btVector3 c = t.m_n[2]->m_x;
+		btVector3 d = t.m_n[3]->m_x;
+
+		btScalar det = 1 / (a[0] * b[1] * c[2] - a[0] * b[1] * d[2] - a[0] * b[2] * c[1] + a[0] * b[2] * d[1] + a[0] * c[1] * d[2] - a[0] * c[2] * d[1] + a[1] * (-b[0] * c[2] + b[0] * d[2] + b[2] * c[0] - b[2] * d[0] - c[0] * d[2] + c[2] * d[0]) + a[2] * (b[0] * c[1] - b[0] * d[1] + b[1] * (d[0] - c[0]) + c[0] * d[1] - c[1] * d[0]) - b[0] * c[1] * d[2] + b[0] * c[2] * d[1] + b[1] * c[0] * d[2] - b[1] * c[2] * d[0] - b[2] * c[0] * d[1] + b[2] * c[1] * d[0]);
+
+		btScalar P11 = -b[2] * c[1] + d[2] * c[1] + b[1] * c[2] + b[2] * d[1] - c[2] * d[1] - b[1] * d[2];
+		btScalar P12 = b[2] * c[0] - d[2] * c[0] - b[0] * c[2] - b[2] * d[0] + c[2] * d[0] + b[0] * d[2];
+		btScalar P13 = -b[1] * c[0] + d[1] * c[0] + b[0] * c[1] + b[1] * d[0] - c[1] * d[0] - b[0] * d[1];
+		btScalar P21 = a[2] * c[1] - d[2] * c[1] - a[1] * c[2] - a[2] * d[1] + c[2] * d[1] + a[1] * d[2];
+		btScalar P22 = -a[2] * c[0] + d[2] * c[0] + a[0] * c[2] + a[2] * d[0] - c[2] * d[0] - a[0] * d[2];
+		btScalar P23 = a[1] * c[0] - d[1] * c[0] - a[0] * c[1] - a[1] * d[0] + c[1] * d[0] + a[0] * d[1];
+		btScalar P31 = -a[2] * b[1] + d[2] * b[1] + a[1] * b[2] + a[2] * d[1] - b[2] * d[1] - a[1] * d[2];
+		btScalar P32 = a[2] * b[0] - d[2] * b[0] - a[0] * b[2] - a[2] * d[0] + b[2] * d[0] + a[0] * d[2];
+		btScalar P33 = -a[1] * b[0] + d[1] * b[0] + a[0] * b[1] + a[1] * d[0] - b[1] * d[0] - a[0] * d[1];
+		btScalar P41 = a[2] * b[1] - c[2] * b[1] - a[1] * b[2] - a[2] * c[1] + b[2] * c[1] + a[1] * c[2];
+		btScalar P42 = -a[2] * b[0] + c[2] * b[0] + a[0] * b[2] + a[2] * c[0] - b[2] * c[0] - a[0] * c[2];
+		btScalar P43 = a[1] * b[0] - c[1] * b[0] - a[0] * b[1] - a[1] * c[0] + b[1] * c[0] + a[0] * c[1];
+
+		btVector4 p1(P11 * det, P21 * det, P31 * det, P41 * det);
+		btVector4 p2(P12 * det, P22 * det, P32 * det, P42 * det);
+		btVector4 p3(P13 * det, P23 * det, P33 * det, P43 * det);
+
+		t.m_P_inv[0] = p1;
+		t.m_P_inv[1] = p2;
+		t.m_P_inv[2] = p3;
+	}
+}
+
+static btScalar Dot4(const btVector4& a, const btVector4& b)
+{
+	return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 }
 
 void btSoftBody::updateDeformation()
 {
-    for (int i = 0; i < m_tetras.size(); ++i)
-    {
-        btSoftBody::Tetra& t = m_tetras[i];
-        btVector3 c1 = t.m_n[1]->m_q - t.m_n[0]->m_q;
-        btVector3 c2 = t.m_n[2]->m_q - t.m_n[0]->m_q;
-        btVector3 c3 = t.m_n[3]->m_q - t.m_n[0]->m_q;
-        btMatrix3x3 Ds(c1.getX(), c2.getX(), c3.getX(),
-                       c1.getY(), c2.getY(), c3.getY(),
-                       c1.getZ(), c2.getZ(), c3.getZ());
-        t.m_F = Ds * t.m_Dm_inverse;
-        
-        btSoftBody::TetraScratch& s = m_tetraScratches[i];
-        s.m_F = t.m_F;
-        s.m_J = t.m_F.determinant();
-        btMatrix3x3 C = t.m_F.transpose()*t.m_F;
-        s.m_trace = C[0].getX() + C[1].getY() + C[2].getZ();
-        s.m_cofF = t.m_F.adjoint().transpose();
-    }
+	btQuaternion q;
+	for (int i = 0; i < m_tetras.size(); ++i)
+	{
+		btSoftBody::Tetra& t = m_tetras[i];
+		btVector3 c1 = t.m_n[1]->m_q - t.m_n[0]->m_q;
+		btVector3 c2 = t.m_n[2]->m_q - t.m_n[0]->m_q;
+		btVector3 c3 = t.m_n[3]->m_q - t.m_n[0]->m_q;
+		btMatrix3x3 Ds(c1.getX(), c2.getX(), c3.getX(),
+					   c1.getY(), c2.getY(), c3.getY(),
+					   c1.getZ(), c2.getZ(), c3.getZ());
+		t.m_F = Ds * t.m_Dm_inverse;
+
+		btSoftBody::TetraScratch& s = m_tetraScratches[i];
+		s.m_F = t.m_F;
+		s.m_J = t.m_F.determinant();
+		btMatrix3x3 C = t.m_F.transpose() * t.m_F;
+		s.m_trace = C[0].getX() + C[1].getY() + C[2].getZ();
+		s.m_cofF = t.m_F.adjoint().transpose();
+
+		btVector3 a = t.m_n[0]->m_q;
+		btVector3 b = t.m_n[1]->m_q;
+		btVector3 c = t.m_n[2]->m_q;
+		btVector3 d = t.m_n[3]->m_q;
+		btVector4 q1(a[0], b[0], c[0], d[0]);
+		btVector4 q2(a[1], b[1], c[1], d[1]);
+		btVector4 q3(a[2], b[2], c[2], d[2]);
+		btMatrix3x3 B(Dot4(q1, t.m_P_inv[0]), Dot4(q1, t.m_P_inv[1]), Dot4(q1, t.m_P_inv[2]),
+					  Dot4(q2, t.m_P_inv[0]), Dot4(q2, t.m_P_inv[1]), Dot4(q2, t.m_P_inv[2]),
+					  Dot4(q3, t.m_P_inv[0]), Dot4(q3, t.m_P_inv[1]), Dot4(q3, t.m_P_inv[2]));
+		q.setRotation(btVector3(0, 0, 1), 0);
+		B.extractRotation(q, 0.01);  // precision of the rotation is not very important for visual correctness.
+		btMatrix3x3 Q(q);
+		s.m_corotation = Q;
+	}
 }
 
 void btSoftBody::advanceDeformation()
 {
-    updateDeformation();
-    for (int i = 0; i < m_tetras.size(); ++i)
-    {
-        m_tetraScratchesTn[i] = m_tetraScratches[i];
-    }
+	updateDeformation();
+	for (int i = 0; i < m_tetras.size(); ++i)
+	{
+		m_tetraScratchesTn[i] = m_tetraScratches[i];
+	}
 }
 //
 void btSoftBody::Joint::Prepare(btScalar dt, int)
@@ -3750,7 +3790,7 @@ void btSoftBody::applyForces()
 //
 void btSoftBody::setMaxStress(btScalar maxStress)
 {
-    m_cfg.m_maxStress = maxStress;
+	m_cfg.m_maxStress = maxStress;
 }
 
 //
@@ -3765,7 +3805,7 @@ void btSoftBody::interpolateRenderMesh()
 			const Node* p2 = m_renderNodesParents[i][2];
 			btVector3 normal = btCross(p1->m_x - p0->m_x, p2->m_x - p0->m_x);
 			btVector3 unit_normal = normal.normalized();
-			Node& n = m_renderNodes[i];
+			RenderNode& n = m_renderNodes[i];
 			n.m_x.setZero();
 			for (int j = 0; j < 3; ++j)
 			{
@@ -3778,7 +3818,7 @@ void btSoftBody::interpolateRenderMesh()
 	{
 		for (int i = 0; i < m_renderNodes.size(); ++i)
 		{
-			Node& n = m_renderNodes[i];
+			RenderNode& n = m_renderNodes[i];
 			n.m_x.setZero();
 			for (int j = 0; j < 4; ++j)
 			{
@@ -3793,13 +3833,13 @@ void btSoftBody::interpolateRenderMesh()
 
 void btSoftBody::setCollisionQuadrature(int N)
 {
-    for (int i = 0; i <= N; ++i)
-    {
-        for (int j = 0; i+j <= N; ++j)
-        {
-            m_quads.push_back(btVector3(btScalar(i)/btScalar(N), btScalar(j)/btScalar(N), btScalar(N-i-j)/btScalar(N)));
-        }
-    }
+	for (int i = 0; i <= N; ++i)
+	{
+		for (int j = 0; i + j <= N; ++j)
+		{
+			m_quads.push_back(btVector3(btScalar(i) / btScalar(N), btScalar(j) / btScalar(N), btScalar(N - i - j) / btScalar(N)));
+		}
+	}
 }
 
 //
@@ -4006,12 +4046,12 @@ btSoftBody::vsolver_t btSoftBody::getSolver(eVSolver::_ solver)
 
 void btSoftBody::setSelfCollision(bool useSelfCollision)
 {
-    m_useSelfCollision = useSelfCollision;
+	m_useSelfCollision = useSelfCollision;
 }
 
 bool btSoftBody::useSelfCollision()
 {
-   return m_useSelfCollision;
+	return m_useSelfCollision;
 }
 
 //
@@ -4052,23 +4092,23 @@ void btSoftBody::defaultCollisionHandler(const btCollisionObjectWrapper* pcoWrap
 			collider.ProcessColObj(this, pcoWrap);
 		}
 		break;
-        case fCollision::SDF_RD:
-        {
-            btRigidBody* prb1 = (btRigidBody*)btRigidBody::upcast(pcoWrap->getCollisionObject());
-            if (pcoWrap->getCollisionObject()->isActive() || this->isActive())
-            {
-                const btTransform wtr = pcoWrap->getWorldTransform();
-                const btScalar timemargin = 0;
-                const btScalar basemargin = getCollisionShape()->getMargin();
-                btVector3 mins;
-                btVector3 maxs;
-                ATTRIBUTE_ALIGNED16(btDbvtVolume)
-                volume;
-                pcoWrap->getCollisionShape()->getAabb(wtr,
-                                                      mins,
-                                                      maxs);
-                volume = btDbvtVolume::FromMM(mins, maxs);
-                volume.Expand(btVector3(basemargin, basemargin, basemargin));
+		case fCollision::SDF_RD:
+		{
+			btRigidBody* prb1 = (btRigidBody*)btRigidBody::upcast(pcoWrap->getCollisionObject());
+			if (pcoWrap->getCollisionObject()->isActive() || this->isActive())
+			{
+				const btTransform wtr = pcoWrap->getWorldTransform();
+				const btScalar timemargin = 0;
+				const btScalar basemargin = getCollisionShape()->getMargin();
+				btVector3 mins;
+				btVector3 maxs;
+				ATTRIBUTE_ALIGNED16(btDbvtVolume)
+				volume;
+				pcoWrap->getCollisionShape()->getAabb(wtr,
+													  mins,
+													  maxs);
+				volume = btDbvtVolume::FromMM(mins, maxs);
+				volume.Expand(btVector3(basemargin, basemargin, basemargin));
 				if (m_cfg.collisions & fCollision::SDF_RDN)
 				{
 					btSoftColliders::CollideSDF_RD docollideNode;
@@ -4080,26 +4120,26 @@ void btSoftBody::defaultCollisionHandler(const btCollisionObjectWrapper* pcoWrap
 					m_ndbvt.collideTV(m_ndbvt.m_root, volume, docollideNode);
 				}
 
-                if (((pcoWrap->getCollisionObject()->getInternalType() == CO_RIGID_BODY) && (m_cfg.collisions & fCollision::SDF_RDF)) || ((pcoWrap->getCollisionObject()->getInternalType() == CO_FEATHERSTONE_LINK) && (m_cfg.collisions & fCollision::SDF_MDF)))
-                {
-                    btSoftColliders::CollideSDF_RDF docollideFace;
-                    docollideFace.psb = this;
-                    docollideFace.m_colObj1Wrap = pcoWrap;
-                    docollideFace.m_rigidBody = prb1;
+				if (((pcoWrap->getCollisionObject()->getInternalType() == CO_RIGID_BODY) && (m_cfg.collisions & fCollision::SDF_RDF)) || ((pcoWrap->getCollisionObject()->getInternalType() == CO_FEATHERSTONE_LINK) && (m_cfg.collisions & fCollision::SDF_MDF)))
+				{
+					btSoftColliders::CollideSDF_RDF docollideFace;
+					docollideFace.psb = this;
+					docollideFace.m_colObj1Wrap = pcoWrap;
+					docollideFace.m_rigidBody = prb1;
 					docollideFace.dynmargin = basemargin + timemargin;
 					docollideFace.stamargin = basemargin;
-                    m_fdbvt.collideTV(m_fdbvt.m_root, volume, docollideFace);
-                }
-            }
-        }
-        break;
+					m_fdbvt.collideTV(m_fdbvt.m_root, volume, docollideFace);
+				}
+			}
+		}
+		break;
 	}
 }
 
 //
 void btSoftBody::defaultCollisionHandler(btSoftBody* psb)
 {
-    BT_PROFILE("Deformable Collision");
+	BT_PROFILE("Deformable Collision");
 	const int cf = m_cfg.collisions & psb->m_cfg.collisions;
 	switch (cf & fCollision::SVSmask)
 	{
@@ -4137,60 +4177,60 @@ void btSoftBody::defaultCollisionHandler(btSoftBody* psb)
 			}
 		}
 		break;
-        case fCollision::VF_DD:
-        {
-            if (!psb->m_softSoftCollision)
-                return;
-            if (psb->isActive() || this->isActive())
-            {
-                if (this != psb)
-                {
-                    btSoftColliders::CollideVF_DD docollide;
-                    /* common                    */
-                    docollide.mrg = getCollisionShape()->getMargin() +
-                    psb->getCollisionShape()->getMargin();
-                    /* psb0 nodes vs psb1 faces    */
-                    if (psb->m_tetras.size() > 0)
-                        docollide.useFaceNormal = true;
-                    else
-                        docollide.useFaceNormal = false;
-                    docollide.psb[0] = this;
-                    docollide.psb[1] = psb;
-                    docollide.psb[0]->m_ndbvt.collideTT(docollide.psb[0]->m_ndbvt.m_root,
-                                                        docollide.psb[1]->m_fdbvt.m_root,
-                                                        docollide);
-
-                    /* psb1 nodes vs psb0 faces    */
-                    if (this->m_tetras.size() > 0)
-                        docollide.useFaceNormal = true;
-                    else
-                        docollide.useFaceNormal = false;
-                    docollide.psb[0] = psb;
-                    docollide.psb[1] = this;
-                    docollide.psb[0]->m_ndbvt.collideTT(docollide.psb[0]->m_ndbvt.m_root,
-                                                        docollide.psb[1]->m_fdbvt.m_root,
-                                                        docollide);
-                }
-                else
-                {
-                    if (psb->useSelfCollision())
-                    {
-                        btSoftColliders::CollideFF_DD docollide;
-                        docollide.mrg = 2*getCollisionShape()->getMargin();
-                        docollide.psb[0] = this;
-                        docollide.psb[1] = psb;
-                        if (this->m_tetras.size() > 0)
-                            docollide.useFaceNormal = true;
-                        else
-                            docollide.useFaceNormal = false;
-                        /* psb0 faces vs psb0 faces    */
-                        calculateNormalCone(this->m_fdbvnt);
-                        this->m_fdbvt.selfCollideT(m_fdbvnt,docollide);
-                    }
-                }
-            }
-        }
-        break;
+		case fCollision::VF_DD:
+		{
+			if (!psb->m_softSoftCollision)
+				return;
+			if (psb->isActive() || this->isActive())
+			{
+				if (this != psb)
+				{
+					btSoftColliders::CollideVF_DD docollide;
+					/* common                    */
+					docollide.mrg = getCollisionShape()->getMargin() +
+									psb->getCollisionShape()->getMargin();
+					/* psb0 nodes vs psb1 faces    */
+					if (psb->m_tetras.size() > 0)
+						docollide.useFaceNormal = true;
+					else
+						docollide.useFaceNormal = false;
+					docollide.psb[0] = this;
+					docollide.psb[1] = psb;
+					docollide.psb[0]->m_ndbvt.collideTT(docollide.psb[0]->m_ndbvt.m_root,
+														docollide.psb[1]->m_fdbvt.m_root,
+														docollide);
+
+					/* psb1 nodes vs psb0 faces    */
+					if (this->m_tetras.size() > 0)
+						docollide.useFaceNormal = true;
+					else
+						docollide.useFaceNormal = false;
+					docollide.psb[0] = psb;
+					docollide.psb[1] = this;
+					docollide.psb[0]->m_ndbvt.collideTT(docollide.psb[0]->m_ndbvt.m_root,
+														docollide.psb[1]->m_fdbvt.m_root,
+														docollide);
+				}
+				else
+				{
+					if (psb->useSelfCollision())
+					{
+						btSoftColliders::CollideFF_DD docollide;
+						docollide.mrg = 2 * getCollisionShape()->getMargin();
+						docollide.psb[0] = this;
+						docollide.psb[1] = psb;
+						if (this->m_tetras.size() > 0)
+							docollide.useFaceNormal = true;
+						else
+							docollide.useFaceNormal = false;
+						/* psb0 faces vs psb0 faces    */
+						calculateNormalCone(this->m_fdbvnt);
+						this->m_fdbvt.selfCollideT(m_fdbvnt, docollide);
+					}
+				}
+			}
+		}
+		break;
 		default:
 		{
 		}
@@ -4205,7 +4245,7 @@ void btSoftBody::geometricCollisionHandler(btSoftBody* psb)
 		{
 			btSoftColliders::CollideCCD docollide;
 			/* common                    */
-			docollide.mrg = SAFE_EPSILON; // for rounding error instead of actual margin
+			docollide.mrg = SAFE_EPSILON;  // for rounding error instead of actual margin
 			docollide.dt = psb->m_sst.sdt;
 			/* psb0 nodes vs psb1 faces    */
 			if (psb->m_tetras.size() > 0)
@@ -4215,8 +4255,8 @@ void btSoftBody::geometricCollisionHandler(btSoftBody* psb)
 			docollide.psb[0] = this;
 			docollide.psb[1] = psb;
 			docollide.psb[0]->m_ndbvt.collideTT(docollide.psb[0]->m_ndbvt.m_root,
-												 docollide.psb[1]->m_fdbvt.m_root,
-												 docollide);
+												docollide.psb[1]->m_fdbvt.m_root,
+												docollide);
 			/* psb1 nodes vs psb0 faces    */
 			if (this->m_tetras.size() > 0)
 				docollide.useFaceNormal = true;
@@ -4225,8 +4265,8 @@ void btSoftBody::geometricCollisionHandler(btSoftBody* psb)
 			docollide.psb[0] = psb;
 			docollide.psb[1] = this;
 			docollide.psb[0]->m_ndbvt.collideTT(docollide.psb[0]->m_ndbvt.m_root,
-												 docollide.psb[1]->m_fdbvt.m_root,
-												 docollide);
+												docollide.psb[1]->m_fdbvt.m_root,
+												docollide);
 		}
 		else
 		{
@@ -4236,14 +4276,14 @@ void btSoftBody::geometricCollisionHandler(btSoftBody* psb)
 				docollide.mrg = SAFE_EPSILON;
 				docollide.psb[0] = this;
 				docollide.psb[1] = psb;
-                docollide.dt = psb->m_sst.sdt;
+				docollide.dt = psb->m_sst.sdt;
 				if (this->m_tetras.size() > 0)
 					docollide.useFaceNormal = true;
 				else
 					docollide.useFaceNormal = false;
 				/* psb0 faces vs psb0 faces    */
 				calculateNormalCone(this->m_fdbvnt);  // should compute this outside of this scope
-				this->m_fdbvt.selfCollideT(m_fdbvnt,docollide);
+				this->m_fdbvt.selfCollideT(m_fdbvnt, docollide);
 			}
 		}
 	}
@@ -4648,44 +4688,43 @@ const char* btSoftBody::serialize(void* dataBuffer, class btSerializer* serializ
 
 void btSoftBody::updateDeactivation(btScalar timeStep)
 {
-    if ((getActivationState() == ISLAND_SLEEPING) || (getActivationState() == DISABLE_DEACTIVATION))
-        return;
+	if ((getActivationState() == ISLAND_SLEEPING) || (getActivationState() == DISABLE_DEACTIVATION))
+		return;
 
-    if (m_maxSpeedSquared < m_sleepingThreshold * m_sleepingThreshold)
-    {
-        m_deactivationTime += timeStep;
-    }
-    else
-    {
-        m_deactivationTime = btScalar(0.);
-        setActivationState(0);
-    }
+	if (m_maxSpeedSquared < m_sleepingThreshold * m_sleepingThreshold)
+	{
+		m_deactivationTime += timeStep;
+	}
+	else
+	{
+		m_deactivationTime = btScalar(0.);
+		setActivationState(0);
+	}
 }
 
-
 void btSoftBody::setZeroVelocity()
 {
-    for (int i = 0; i < m_nodes.size(); ++i)
-    {
-        m_nodes[i].m_v.setZero();
-    }
+	for (int i = 0; i < m_nodes.size(); ++i)
+	{
+		m_nodes[i].m_v.setZero();
+	}
 }
 
 bool btSoftBody::wantsSleeping()
 {
-    if (getActivationState() == DISABLE_DEACTIVATION)
-        return false;
+	if (getActivationState() == DISABLE_DEACTIVATION)
+		return false;
 
-    //disable deactivation
-    if (gDisableDeactivation || (gDeactivationTime == btScalar(0.)))
-        return false;
+	//disable deactivation
+	if (gDisableDeactivation || (gDeactivationTime == btScalar(0.)))
+		return false;
 
-    if ((getActivationState() == ISLAND_SLEEPING) || (getActivationState() == WANTS_DEACTIVATION))
-        return true;
+	if ((getActivationState() == ISLAND_SLEEPING) || (getActivationState() == WANTS_DEACTIVATION))
+		return true;
 
-    if (m_deactivationTime > gDeactivationTime)
-    {
-        return true;
-    }
-    return false;
+	if (m_deactivationTime > gDeactivationTime)
+	{
+		return true;
+	}
+	return false;
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btSoftBody.h b/thirdparty/bullet/BulletSoftBody/btSoftBody.h
index 6a55eccbd2..f578487b8c 100644
--- a/thirdparty/bullet/BulletSoftBody/btSoftBody.h
+++ b/thirdparty/bullet/BulletSoftBody/btSoftBody.h
@@ -35,7 +35,7 @@ subject to the following restrictions:
 //#else
 #define btSoftBodyData btSoftBodyFloatData
 #define btSoftBodyDataName "btSoftBodyFloatData"
-static const btScalar  OVERLAP_REDUCTION_FACTOR = 0.1;
+static const btScalar OVERLAP_REDUCTION_FACTOR = 0.1;
 static unsigned long seed = 243703;
 //#endif //BT_USE_DOUBLE_PRECISION
 
@@ -171,10 +171,10 @@ public:
 			CL_SELF = 0x0040,  ///Cluster soft body self collision
 			VF_DD = 0x0080,    ///Vertex vs face soft vs soft handling
 
-			RVDFmask = 0x0f00, /// Rigid versus deformable face mask
-			SDF_RDF = 0x0100,  /// GJK based Rigid vs. deformable face
-			SDF_MDF = 0x0200,  /// GJK based Multibody vs. deformable face
-            SDF_RDN = 0x0400,  /// SDF based Rigid vs. deformable node
+			RVDFmask = 0x0f00,  /// Rigid versus deformable face mask
+			SDF_RDF = 0x0100,   /// GJK based Rigid vs. deformable face
+			SDF_MDF = 0x0200,   /// GJK based Multibody vs. deformable face
+			SDF_RDN = 0x0400,   /// SDF based Rigid vs. deformable node
 			/* presets	*/
 			Default = SDF_RS,
 			END
@@ -226,7 +226,7 @@ public:
 		const btCollisionObject* m_colObj; /* Rigid body			*/
 		btVector3 m_normal;                /* Outward normal		*/
 		btScalar m_offset;                 /* Offset from origin	*/
-        btVector3 m_bary;                  /* Barycentric weights for faces */
+		btVector3 m_bary;                  /* Barycentric weights for faces */
 	};
 
 	/* sMedium		*/
@@ -258,20 +258,29 @@ public:
 		Material* m_material;  // Material
 	};
 	/* Node			*/
+	struct RenderNode
+	{
+		btVector3 m_x;
+		btVector3 m_uv1;
+		btVector3 m_normal;
+	};
 	struct Node : Feature
 	{
 		btVector3 m_x;       // Position
 		btVector3 m_q;       // Previous step position/Test position
 		btVector3 m_v;       // Velocity
-        btVector3 m_vn;      // Previous step velocity
+		btVector3 m_vn;      // Previous step velocity
 		btVector3 m_f;       // Force accumulator
 		btVector3 m_n;       // Normal
 		btScalar m_im;       // 1/mass
 		btScalar m_area;     // Area
 		btDbvtNode* m_leaf;  // Leaf data
-		btScalar m_penetration;   // depth of penetration
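+		int m_constrained;   // NOTE(review): old "depth of penetration" comment belonged to the removed m_penetration; presumably a constraint flag/count -- confirm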
 		int m_battach : 1;   // Attached
-        int index;
+		int index;
+		btVector3 m_splitv;               // velocity associated with split impulse
+		btMatrix3x3 m_effectiveMass;      // effective mass in contact
+		btMatrix3x3 m_effectiveMass_inv;  // inverse of effective mass
 	};
 	/* Link			*/
 	ATTRIBUTE_ALIGNED16(struct)
@@ -287,40 +296,47 @@ public:
 
 		BT_DECLARE_ALIGNED_ALLOCATOR();
 	};
+	struct RenderFace
+	{
+		RenderNode* m_n[3];          // Node pointers
+	};
+
 	/* Face			*/
 	struct Face : Feature
 	{
-		Node* m_n[3];        // Node pointers
-		btVector3 m_normal;  // Normal
-		btScalar m_ra;       // Rest area
-		btDbvtNode* m_leaf;  // Leaf data
-        btVector4 m_pcontact; // barycentric weights of the persistent contact
-        btVector3 m_n0, m_n1, m_vn;
-        int m_index;
+		Node* m_n[3];          // Node pointers
+		btVector3 m_normal;    // Normal
+		btScalar m_ra;         // Rest area
+		btDbvtNode* m_leaf;    // Leaf data
+		btVector4 m_pcontact;  // barycentric weights of the persistent contact
+		btVector3 m_n0, m_n1, m_vn;
+		int m_index;
 	};
 	/* Tetra		*/
 	struct Tetra : Feature
 	{
-		Node* m_n[4];        // Node pointers
-		btScalar m_rv;       // Rest volume
-		btDbvtNode* m_leaf;  // Leaf data
-		btVector3 m_c0[4];   // gradients
-		btScalar m_c1;       // (4*kVST)/(im0+im1+im2+im3)
-		btScalar m_c2;       // m_c1/sum(|g0..3|^2)
-        btMatrix3x3 m_Dm_inverse; // rest Dm^-1
-        btMatrix3x3 m_F;
-        btScalar m_element_measure;
+		Node* m_n[4];              // Node pointers
+		btScalar m_rv;             // Rest volume
+		btDbvtNode* m_leaf;        // Leaf data
+		btVector3 m_c0[4];         // gradients
+		btScalar m_c1;             // (4*kVST)/(im0+im1+im2+im3)
+		btScalar m_c2;             // m_c1/sum(|g0..3|^2)
+		btMatrix3x3 m_Dm_inverse;  // rest Dm^-1
+		btMatrix3x3 m_F;
+		btScalar m_element_measure;
+		btVector4 m_P_inv[3];  // first three columns of P_inv matrix
+	};
+
+	/*  TetraScratch  */
+	struct TetraScratch
+	{
+		btMatrix3x3 m_F;           // deformation gradient F
+		btScalar m_trace;          // trace of F^T * F
+		btScalar m_J;              // det(F)
+		btMatrix3x3 m_cofF;        // cofactor of F
+		btMatrix3x3 m_corotation;  // corotation of the tetra
 	};
-    
-    /*  TetraScratch  */
-    struct TetraScratch
-    {
-        btMatrix3x3 m_F;                // deformation gradient F
-        btScalar m_trace;               // trace of F^T * F
-        btScalar m_J;                   // det(F)
-        btMatrix3x3 m_cofF;             // cofactor of F
-    };
-    
+
 	/* RContact		*/
 	struct RContact
 	{
@@ -331,67 +347,68 @@ public:
 		btScalar m_c2;     // ima*dt
 		btScalar m_c3;     // Friction
 		btScalar m_c4;     // Hardness
-        
-        // jacobians and unit impulse responses for multibody
-        btMultiBodyJacobianData jacobianData_normal;
-        btMultiBodyJacobianData jacobianData_t1;
-        btMultiBodyJacobianData jacobianData_t2;
-        btVector3 t1;
-        btVector3 t2;
+
+		// jacobians and unit impulse responses for multibody
+		btMultiBodyJacobianData jacobianData_normal;
+		btMultiBodyJacobianData jacobianData_t1;
+		btMultiBodyJacobianData jacobianData_t2;
+		btVector3 t1;
+		btVector3 t2;
 	};
-    
-    class DeformableRigidContact
-    {
-    public:
-        sCti m_cti;        // Contact infos
-        btMatrix3x3 m_c0;  // Impulse matrix
-        btVector3 m_c1;    // Relative anchor
-        btScalar m_c2;     // inverse mass of node/face
-        btScalar m_c3;     // Friction
-        btScalar m_c4;     // Hardness
-        
-        // jacobians and unit impulse responses for multibody
-        btMultiBodyJacobianData jacobianData_normal;
-        btMultiBodyJacobianData jacobianData_t1;
-        btMultiBodyJacobianData jacobianData_t2;
-        btVector3 t1;
-        btVector3 t2;
-    };
-    
-    class DeformableNodeRigidContact : public DeformableRigidContact
-    {
-    public:
-        Node* m_node;      // Owner node
-    };
-    
-    class DeformableNodeRigidAnchor : public DeformableNodeRigidContact
-    {
-    public:
-        btVector3 m_local;    // Anchor position in body space
-    };
-    
-    class DeformableFaceRigidContact : public DeformableRigidContact
-    {
-    public:
-        Face* m_face;                   // Owner face
-        btVector3 m_contactPoint;       // Contact point
-        btVector3 m_bary;               // Barycentric weights
-        btVector3 m_weights;            // v_contactPoint * m_weights[i] = m_face->m_node[i]->m_v;
-    };
-    
-    struct DeformableFaceNodeContact
-    {
-        Node* m_node;         // Node
-        Face* m_face;         // Face
-        btVector3 m_bary;     // Barycentric weights
-        btVector3 m_weights;  // v_contactPoint * m_weights[i] = m_face->m_node[i]->m_v;
-        btVector3 m_normal;   // Normal
-        btScalar m_margin;    // Margin
-        btScalar m_friction;  // Friction
-        btScalar m_imf;       // inverse mass of the face at contact point
-        btScalar m_c0;        // scale of the impulse matrix;
-    };
-    
+
+	class DeformableRigidContact
+	{
+	public:
+		sCti m_cti;        // Contact infos
+		btMatrix3x3 m_c0;  // Impulse matrix
+		btVector3 m_c1;    // Relative anchor
+		btScalar m_c2;     // inverse mass of node/face
+		btScalar m_c3;     // Friction
+		btScalar m_c4;     // Hardness
+		btMatrix3x3 m_c5;  // inverse effective mass
+
+		// jacobians and unit impulse responses for multibody
+		btMultiBodyJacobianData jacobianData_normal;
+		btMultiBodyJacobianData jacobianData_t1;
+		btMultiBodyJacobianData jacobianData_t2;
+		btVector3 t1;
+		btVector3 t2;
+	};
+
+	class DeformableNodeRigidContact : public DeformableRigidContact
+	{
+	public:
+		Node* m_node;  // Owner node
+	};
+
+	class DeformableNodeRigidAnchor : public DeformableNodeRigidContact
+	{
+	public:
+		btVector3 m_local;  // Anchor position in body space
+	};
+
+	class DeformableFaceRigidContact : public DeformableRigidContact
+	{
+	public:
+		Face* m_face;              // Owner face
+		btVector3 m_contactPoint;  // Contact point
+		btVector3 m_bary;          // Barycentric weights
+		btVector3 m_weights;       // v_contactPoint * m_weights[i] = m_face->m_node[i]->m_v;
+	};
+
+	struct DeformableFaceNodeContact
+	{
+		Node* m_node;         // Node
+		Face* m_face;         // Face
+		btVector3 m_bary;     // Barycentric weights
+		btVector3 m_weights;  // v_contactPoint * m_weights[i] = m_face->m_node[i]->m_v;
+		btVector3 m_normal;   // Normal
+		btScalar m_margin;    // Margin
+		btScalar m_friction;  // Friction
+		btScalar m_imf;       // inverse mass of the face at contact point
+		btScalar m_c0;        // scale of the impulse matrix;
+	};
+
 	/* SContact		*/
 	struct SContact
 	{
@@ -718,19 +735,19 @@ public:
 		tVSolverArray m_vsequence;  // Velocity solvers sequence
 		tPSolverArray m_psequence;  // Position solvers sequence
 		tPSolverArray m_dsequence;  // Drift solvers sequence
-        btScalar drag;           // deformable air drag
-        btScalar m_maxStress;       // Maximum principle first Piola stress
+		btScalar drag;              // deformable air drag
+		btScalar m_maxStress;       // Maximum principal first Piola stress
 	};
 	/* SolverState	*/
 	struct SolverState
 	{
 		//if you add new variables, always initialize them!
 		SolverState()
-			:sdt(0),
-			isdt(0),
-			velmrg(0),
-			radmrg(0),
-			updmrg(0)
+			: sdt(0),
+			  isdt(0),
+			  velmrg(0),
+			  radmrg(0),
+			  updmrg(0)
 		{
 		}
 		btScalar sdt;     // dt*timescale
@@ -769,9 +786,11 @@ public:
 	typedef btAlignedObjectArray<Cluster*> tClusterArray;
 	typedef btAlignedObjectArray<Note> tNoteArray;
 	typedef btAlignedObjectArray<Node> tNodeArray;
+	typedef btAlignedObjectArray< RenderNode> tRenderNodeArray;
 	typedef btAlignedObjectArray<btDbvtNode*> tLeafArray;
 	typedef btAlignedObjectArray<Link> tLinkArray;
 	typedef btAlignedObjectArray<Face> tFaceArray;
+	typedef btAlignedObjectArray<RenderFace> tRenderFaceArray;
 	typedef btAlignedObjectArray<Tetra> tTetraArray;
 	typedef btAlignedObjectArray<Anchor> tAnchorArray;
 	typedef btAlignedObjectArray<RContact> tRContactArray;
@@ -791,40 +810,42 @@ public:
 	btSoftBodyWorldInfo* m_worldInfo;  // World info
 	tNoteArray m_notes;                // Notes
 	tNodeArray m_nodes;                // Nodes
-    tNodeArray m_renderNodes;                // Nodes
+	tRenderNodeArray m_renderNodes;    // Render Nodes
 	tLinkArray m_links;                // Links
 	tFaceArray m_faces;                // Faces
-    tFaceArray m_renderFaces;                // Faces
+	tRenderFaceArray m_renderFaces;    // Render Faces
 	tTetraArray m_tetras;              // Tetras
-    btAlignedObjectArray<TetraScratch> m_tetraScratches;
-    btAlignedObjectArray<TetraScratch> m_tetraScratchesTn;
-	tAnchorArray m_anchors;            // Anchors
-    btAlignedObjectArray<DeformableNodeRigidAnchor> m_deformableAnchors;
-	tRContactArray m_rcontacts;        // Rigid contacts
-    btAlignedObjectArray<DeformableNodeRigidContact> m_nodeRigidContacts;
-    btAlignedObjectArray<DeformableFaceNodeContact> m_faceNodeContacts;
-    btAlignedObjectArray<DeformableFaceRigidContact> m_faceRigidContacts;
-	tSContactArray m_scontacts;        // Soft contacts
-	tJointArray m_joints;              // Joints
-	tMaterialArray m_materials;        // Materials
-	btScalar m_timeacc;                // Time accumulator
-	btVector3 m_bounds[2];             // Spatial bounds
-	bool m_bUpdateRtCst;               // Update runtime constants
-	btDbvt m_ndbvt;                    // Nodes tree
-	btDbvt m_fdbvt;                    // Faces tree
-	btDbvntNode* m_fdbvnt;              // Faces tree with normals
-	btDbvt m_cdbvt;                    // Clusters tree
-	tClusterArray m_clusters;          // Clusters
-	btScalar m_dampingCoefficient;     // Damping Coefficient
+	btAlignedObjectArray<TetraScratch> m_tetraScratches;
+	btAlignedObjectArray<TetraScratch> m_tetraScratchesTn;
+	tAnchorArray m_anchors;  // Anchors
+	btAlignedObjectArray<DeformableNodeRigidAnchor> m_deformableAnchors;
+	tRContactArray m_rcontacts;  // Rigid contacts
+	btAlignedObjectArray<DeformableNodeRigidContact> m_nodeRigidContacts;
+	btAlignedObjectArray<DeformableFaceNodeContact> m_faceNodeContacts;
+	btAlignedObjectArray<DeformableFaceRigidContact> m_faceRigidContacts;
+	tSContactArray m_scontacts;     // Soft contacts
+	tJointArray m_joints;           // Joints
+	tMaterialArray m_materials;     // Materials
+	btScalar m_timeacc;             // Time accumulator
+	btVector3 m_bounds[2];          // Spatial bounds
+	bool m_bUpdateRtCst;            // Update runtime constants
+	btDbvt m_ndbvt;                 // Nodes tree
+	btDbvt m_fdbvt;                 // Faces tree
+	btDbvntNode* m_fdbvnt;          // Faces tree with normals
+	btDbvt m_cdbvt;                 // Clusters tree
+	tClusterArray m_clusters;       // Clusters
+	btScalar m_dampingCoefficient;  // Damping Coefficient
 	btScalar m_sleepingThreshold;
 	btScalar m_maxSpeedSquared;
-	btAlignedObjectArray<btVector3> m_quads; // quadrature points for collision detection
+	btAlignedObjectArray<btVector3> m_quads;  // quadrature points for collision detection
 	btScalar m_repulsionStiffness;
-    btAlignedObjectArray<btVector3> m_X;   // initial positions
+	btScalar m_gravityFactor;
+	bool m_cacheBarycenter;
+	btAlignedObjectArray<btVector3> m_X;  // initial positions
 
 	btAlignedObjectArray<btVector4> m_renderNodesInterpolationWeights;
 	btAlignedObjectArray<btAlignedObjectArray<const btSoftBody::Node*> > m_renderNodesParents;
-	btAlignedObjectArray<btScalar> m_z; // vertical distance used in extrapolation
+	btAlignedObjectArray<btScalar> m_z;  // vertical distance used in extrapolation
 	bool m_useSelfCollision;
 	bool m_softSoftCollision;
 
@@ -856,11 +877,11 @@ public:
 	{
 		return m_worldInfo;
 	}
-    
-    void setDampingCoefficient(btScalar damping_coeff)
-    {
-        m_dampingCoefficient = damping_coeff;
-    }
+
+	void setDampingCoefficient(btScalar damping_coeff)
+	{
+		m_dampingCoefficient = damping_coeff;
+	}
 
 	///@todo: avoid internal softbody shape hack and move collision code to collision library
 	virtual void setCollisionShape(btCollisionShape* collisionShape)
@@ -921,11 +942,12 @@ public:
 					 Material* mat = 0);
 
 	/* Append anchor														*/
-    void appendDeformableAnchor(int node, btRigidBody* body);
-    void appendDeformableAnchor(int node, btMultiBodyLinkCollider* link);
-    void appendAnchor(int node,
+	void appendDeformableAnchor(int node, btRigidBody* body);
+	void appendDeformableAnchor(int node, btMultiBodyLinkCollider* link);
+	void appendAnchor(int node,
 					  btRigidBody* body, bool disableCollisionBetweenLinkedBodies = false, btScalar influence = 1);
 	void appendAnchor(int node, btRigidBody* body, const btVector3& localPivot, bool disableCollisionBetweenLinkedBodies = false, btScalar influence = 1);
+	void removeAnchor(int node);
 	/* Append linear joint													*/
 	void appendLinearJoint(const LJoint::Specs& specs, Cluster* body0, Body body1);
 	void appendLinearJoint(const LJoint::Specs& specs, Body body = Body());
@@ -976,10 +998,10 @@ public:
 	void setLinearVelocity(const btVector3& linVel);
 	/* Set the angular velocity of the center of mass                       */
 	void setAngularVelocity(const btVector3& angVel);
-    /* Get best fit rigid transform                                         */
-    btTransform getRigidTransform();
-    /* Transform to given pose                                              */
-    void transformTo(const btTransform& trs);
+	/* Get best fit rigid transform                                         */
+	btTransform getRigidTransform();
+	/* Transform to given pose                                              */
+	void transformTo(const btTransform& trs);
 	/* Transform															*/
 	void transform(const btTransform& trs);
 	/* Translate															*/
@@ -1068,11 +1090,11 @@ public:
 	/* defaultCollisionHandlers												*/
 	void defaultCollisionHandler(const btCollisionObjectWrapper* pcoWrap);
 	void defaultCollisionHandler(btSoftBody* psb);
-    void setSelfCollision(bool useSelfCollision);
-    bool useSelfCollision();
-    void updateDeactivation(btScalar timeStep);
-    void setZeroVelocity();
-    bool wantsSleeping();
+	void setSelfCollision(bool useSelfCollision);
+	bool useSelfCollision();
+	void updateDeactivation(btScalar timeStep);
+	void setZeroVelocity();
+	bool wantsSleeping();
 
 	//
 	// Functionality to deal with new accelerated solvers.
@@ -1151,8 +1173,8 @@ public:
 	void rebuildNodeTree();
 	btVector3 evaluateCom() const;
 	bool checkDeformableContact(const btCollisionObjectWrapper* colObjWrap, const btVector3& x, btScalar margin, btSoftBody::sCti& cti, bool predict = false) const;
-    bool checkDeformableFaceContact(const btCollisionObjectWrapper* colObjWrap, Face& f, btVector3& contact_point, btVector3& bary, btScalar margin, btSoftBody::sCti& cti, bool predict = false) const;
-    bool checkContact(const btCollisionObjectWrapper* colObjWrap, const btVector3& x, btScalar margin, btSoftBody::sCti& cti) const;
+	bool checkDeformableFaceContact(const btCollisionObjectWrapper* colObjWrap, Face& f, btVector3& contact_point, btVector3& bary, btScalar margin, btSoftBody::sCti& cti, bool predict = false) const;
+	bool checkContact(const btCollisionObjectWrapper* colObjWrap, const btVector3& x, btScalar margin, btSoftBody::sCti& cti) const;
 	void updateNormals();
 	void updateBounds();
 	void updatePose();
@@ -1166,14 +1188,16 @@ public:
 	void solveClusters(btScalar sor);
 	void applyClusters(bool drift);
 	void dampClusters();
-    void setSpringStiffness(btScalar k);
-    void initializeDmInverse();
-    void updateDeformation();
-    void advanceDeformation();
+	void setSpringStiffness(btScalar k);
+	void setGravityFactor(btScalar gravFactor);
+	void setCacheBarycenter(bool cacheBarycenter);
+	void initializeDmInverse();
+	void updateDeformation();
+	void advanceDeformation();
 	void applyForces();
-    void setMaxStress(btScalar maxStress);
-    void interpolateRenderMesh();
-    void setCollisionQuadrature(int N);
+	void setMaxStress(btScalar maxStress);
+	void interpolateRenderMesh();
+	void setCollisionQuadrature(int N);
 	static void PSolve_Anchors(btSoftBody* psb, btScalar kst, btScalar ti);
 	static void PSolve_RContacts(btSoftBody* psb, btScalar kst, btScalar ti);
 	static void PSolve_SContacts(btSoftBody* psb, btScalar, btScalar ti);
@@ -1182,14 +1206,15 @@ public:
 	static psolver_t getSolver(ePSolver::_ solver);
 	static vsolver_t getSolver(eVSolver::_ solver);
 	void geometricCollisionHandler(btSoftBody* psb);
-#define SAFE_EPSILON SIMD_EPSILON*100.0
+#define SAFE_EPSILON SIMD_EPSILON * 100.0
 	void updateNode(btDbvtNode* node, bool use_velocity, bool margin)
 	{
 		if (node->isleaf())
 		{
 			btSoftBody::Node* n = (btSoftBody::Node*)(node->data);
-			ATTRIBUTE_ALIGNED16(btDbvtVolume) vol;
-			btScalar pad = margin ? m_sst.radmrg : SAFE_EPSILON; // use user defined margin or margin for floating point precision
+			ATTRIBUTE_ALIGNED16(btDbvtVolume)
+			vol;
+			btScalar pad = margin ? m_sst.radmrg : SAFE_EPSILON;  // use user defined margin or margin for floating point precision
 			if (use_velocity)
 			{
 				btVector3 points[2] = {n->m_x, n->m_x + m_sst.sdt * n->m_v};
@@ -1207,38 +1232,40 @@ public:
 		{
 			updateNode(node->childs[0], use_velocity, margin);
 			updateNode(node->childs[1], use_velocity, margin);
-			ATTRIBUTE_ALIGNED16(btDbvtVolume) vol;
+			ATTRIBUTE_ALIGNED16(btDbvtVolume)
+			vol;
 			Merge(node->childs[0]->volume, node->childs[1]->volume, vol);
 			node->volume = vol;
 		}
 	}
-	
-    void updateNodeTree(bool use_velocity, bool margin)
+
+	void updateNodeTree(bool use_velocity, bool margin)
 	{
 		if (m_ndbvt.m_root)
 			updateNode(m_ndbvt.m_root, use_velocity, margin);
 	}
 
-	template <class DBVTNODE> // btDbvtNode or btDbvntNode
+	template <class DBVTNODE>  // btDbvtNode or btDbvntNode
 	void updateFace(DBVTNODE* node, bool use_velocity, bool margin)
 	{
 		if (node->isleaf())
 		{
 			btSoftBody::Face* f = (btSoftBody::Face*)(node->data);
-			btScalar pad = margin ? m_sst.radmrg : SAFE_EPSILON; // use user defined margin or margin for floating point precision
-			ATTRIBUTE_ALIGNED16(btDbvtVolume) vol;
+			btScalar pad = margin ? m_sst.radmrg : SAFE_EPSILON;  // use user defined margin or margin for floating point precision
+			ATTRIBUTE_ALIGNED16(btDbvtVolume)
+			vol;
 			if (use_velocity)
 			{
 				btVector3 points[6] = {f->m_n[0]->m_x, f->m_n[0]->m_x + m_sst.sdt * f->m_n[0]->m_v,
-					f->m_n[1]->m_x, f->m_n[1]->m_x + m_sst.sdt * f->m_n[1]->m_v,
-					f->m_n[2]->m_x, f->m_n[2]->m_x + m_sst.sdt * f->m_n[2]->m_v};
+									   f->m_n[1]->m_x, f->m_n[1]->m_x + m_sst.sdt * f->m_n[1]->m_v,
+									   f->m_n[2]->m_x, f->m_n[2]->m_x + m_sst.sdt * f->m_n[2]->m_v};
 				vol = btDbvtVolume::FromPoints(points, 6);
 			}
 			else
 			{
 				btVector3 points[3] = {f->m_n[0]->m_x,
-					f->m_n[1]->m_x,
-					f->m_n[2]->m_x};
+									   f->m_n[1]->m_x,
+									   f->m_n[2]->m_x};
 				vol = btDbvtVolume::FromPoints(points, 3);
 			}
 			vol.Expand(btVector3(pad, pad, pad));
@@ -1249,7 +1276,8 @@ public:
 		{
 			updateFace(node->childs[0], use_velocity, margin);
 			updateFace(node->childs[1], use_velocity, margin);
-			ATTRIBUTE_ALIGNED16(btDbvtVolume) vol;
+			ATTRIBUTE_ALIGNED16(btDbvtVolume)
+			vol;
 			Merge(node->childs[0]->volume, node->childs[1]->volume, vol);
 			node->volume = vol;
 		}
@@ -1271,7 +1299,7 @@ public:
 		return (a * coord.x() + b * coord.y() + c * coord.z());
 	}
 
-    void applyRepulsionForce(btScalar timeStep, bool applySpringForce)
+	void applyRepulsionForce(btScalar timeStep, bool applySpringForce)
 	{
 		btAlignedObjectArray<int> indices;
 		{
@@ -1297,58 +1325,60 @@ public:
 			const btVector3& n = c.m_normal;
 			btVector3 l = node->m_x - BaryEval(face->m_n[0]->m_x, face->m_n[1]->m_x, face->m_n[2]->m_x, w);
 			btScalar d = c.m_margin - n.dot(l);
-			d = btMax(btScalar(0),d);
-			
+			d = btMax(btScalar(0), d);
+
 			const btVector3& va = node->m_v;
 			btVector3 vb = BaryEval(face->m_n[0]->m_v, face->m_n[1]->m_v, face->m_n[2]->m_v, w);
 			btVector3 vr = va - vb;
-			const btScalar vn = btDot(vr, n); // dn < 0 <==> opposing
+			const btScalar vn = btDot(vr, n);  // dn < 0 <==> opposing
 			if (vn > OVERLAP_REDUCTION_FACTOR * d / timeStep)
 				continue;
-			btVector3 vt = vr - vn*n;
+			btVector3 vt = vr - vn * n;
 			btScalar I = 0;
-			btScalar mass = node->m_im == 0 ? 0 : btScalar(1)/node->m_im;
+			btScalar mass = node->m_im == 0 ? 0 : btScalar(1) / node->m_im;
 			if (applySpringForce)
 				I = -btMin(m_repulsionStiffness * timeStep * d, mass * (OVERLAP_REDUCTION_FACTOR * d / timeStep - vn));
 			if (vn < 0)
 				I += 0.5 * mass * vn;
-			btScalar face_penetration = 0, node_penetration = node->m_penetration;
+			int face_penetration = 0, node_penetration = node->m_constrained;
 			for (int i = 0; i < 3; ++i)
-				face_penetration =  btMax(face_penetration, face->m_n[i]->m_penetration);
-			btScalar I_tilde = .5 *I /(1.0+w.length2());
-			
-//             double the impulse if node or face is constrained.
-            if (face_penetration > 0 || node_penetration > 0)
-                I_tilde *= 2.0;
-            if (face_penetration <= node_penetration)
+				face_penetration |= face->m_n[i]->m_constrained;
+			btScalar I_tilde = 2.0 * I / (1.0 + w.length2());
+
+			//             double the impulse if node or face is constrained.
+			if (face_penetration > 0 || node_penetration > 0)
+			{
+				I_tilde *= 2.0;
+			}
+			if (face_penetration <= 0)
 			{
 				for (int j = 0; j < 3; ++j)
-					face->m_n[j]->m_v += w[j]*n*I_tilde*node->m_im;
+					face->m_n[j]->m_v += w[j] * n * I_tilde * node->m_im;
 			}
-            if (face_penetration >= node_penetration)
+			if (node_penetration <= 0)
 			{
-				node->m_v -= I_tilde*node->m_im*n;
+				node->m_v -= I_tilde * node->m_im * n;
 			}
-			
+
 			// apply frictional impulse
 			btScalar vt_norm = vt.safeNorm();
 			if (vt_norm > SIMD_EPSILON)
 			{
 				btScalar delta_vn = -2 * I * node->m_im;
 				btScalar mu = c.m_friction;
-				btScalar vt_new = btMax(btScalar(1) - mu * delta_vn / (vt_norm + SIMD_EPSILON), btScalar(0))*vt_norm;
-				I = 0.5 * mass * (vt_norm-vt_new);
+				btScalar vt_new = btMax(btScalar(1) - mu * delta_vn / (vt_norm + SIMD_EPSILON), btScalar(0)) * vt_norm;
+				I = 0.5 * mass * (vt_norm - vt_new);
 				vt.safeNormalize();
-				I_tilde = .5 *I /(1.0+w.length2());
-//                 double the impulse if node or face is constrained.
-//                if (face_penetration > 0 || node_penetration > 0)
-//                    I_tilde *= 2.0;
-                if (face_penetration <= node_penetration)
+				I_tilde = 2.0 * I / (1.0 + w.length2());
+				//                 double the impulse if node or face is constrained.
+				if (face_penetration > 0 || node_penetration > 0)
+					I_tilde *= 2.0;
+				if (face_penetration <= 0)
 				{
 					for (int j = 0; j < 3; ++j)
 						face->m_n[j]->m_v += w[j] * vt * I_tilde * (face->m_n[j])->m_im;
 				}
-                if (face_penetration >= node_penetration)
+				if (node_penetration <= 0)
 				{
 					node->m_v -= I_tilde * node->m_im * vt;
 				}
@@ -1356,7 +1386,7 @@ public:
 		}
 	}
 	virtual int calculateSerializeBufferSize() const;
-  
+
 	///fills the dataBuffer and returns the struct name (and 0 on failure)
 	virtual const char* serialize(void* dataBuffer, class btSerializer* serializer) const;
 };
diff --git a/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.cpp b/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.cpp
index c1a87c7d57..f63e48f9a5 100644
--- a/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.cpp
@@ -727,7 +727,7 @@ btSoftBody* btSoftBodyHelpers::CreatePatch(btSoftBodyWorldInfo& worldInfo, const
 										   int resy,
 										   int fixeds,
 										   bool gendiags,
-                                           btScalar perturbation)
+										   btScalar perturbation)
 {
 #define IDX(_x_, _y_) ((_y_)*rx + (_x_))
 	/* Create nodes	*/
@@ -747,12 +747,12 @@ btSoftBody* btSoftBodyHelpers::CreatePatch(btSoftBodyWorldInfo& worldInfo, const
 		for (int ix = 0; ix < rx; ++ix)
 		{
 			const btScalar tx = ix / (btScalar)(rx - 1);
-            btScalar pert = perturbation * btScalar(rand())/RAND_MAX;
-            btVector3 temp1 = py1;
-            temp1.setY(py1.getY() + pert);
-            btVector3 temp = py0;
-            pert = perturbation * btScalar(rand())/RAND_MAX;
-            temp.setY(py0.getY() + pert);
+			btScalar pert = perturbation * btScalar(rand()) / RAND_MAX;
+			btVector3 temp1 = py1;
+			temp1.setY(py1.getY() + pert);
+			btVector3 temp = py0;
+			pert = perturbation * btScalar(rand()) / RAND_MAX;
+			temp.setY(py0.getY() + pert);
 			x[IDX(ix, iy)] = lerp(temp, temp1, tx);
 			m[IDX(ix, iy)] = 1;
 		}
@@ -1233,9 +1233,9 @@ if(face&&face[0])
 			}
 		}
 	}
-    psb->initializeDmInverse();
-    psb->m_tetraScratches.resize(psb->m_tetras.size());
-    psb->m_tetraScratchesTn.resize(psb->m_tetras.size());
+	psb->initializeDmInverse();
+	psb->m_tetraScratches.resize(psb->m_tetras.size());
+	psb->m_tetraScratchesTn.resize(psb->m_tetras.size());
 	printf("Nodes:  %u\r\n", psb->m_nodes.size());
 	printf("Links:  %u\r\n", psb->m_links.size());
 	printf("Faces:  %u\r\n", psb->m_faces.size());
@@ -1245,61 +1245,62 @@ if(face&&face[0])
 
 btSoftBody* btSoftBodyHelpers::CreateFromVtkFile(btSoftBodyWorldInfo& worldInfo, const char* vtk_file)
 {
-    std::ifstream fs;
-    fs.open(vtk_file);
-    btAssert(fs);
-    
-    typedef btAlignedObjectArray<int> Index;
-    std::string line;
-    btAlignedObjectArray<btVector3> X;
-    btVector3 position;
-    btAlignedObjectArray<Index> indices;
-    bool reading_points = false;
-    bool reading_tets = false;
-    size_t n_points = 0;
-    size_t n_tets = 0;
-    size_t x_count = 0;
-    size_t indices_count = 0;
-    while (std::getline(fs, line))
-    {
-        std::stringstream ss(line);
-        if (line.size() == (size_t)(0))
-        {
-        }
-        else if (line.substr(0, 6) == "POINTS")
-        {
-            reading_points = true;
-            reading_tets = false;
-            ss.ignore(128, ' '); // ignore "POINTS"
-            ss >> n_points;
-            X.resize(n_points);
-        }
-        else if (line.substr(0, 5) == "CELLS")
-        {
-            reading_points = false;
-            reading_tets = true;
-            ss.ignore(128, ' '); // ignore "CELLS"
-            ss >> n_tets;
-            indices.resize(n_tets);
-        }
-        else if (line.substr(0, 10) == "CELL_TYPES")
-        {
-            reading_points = false;
-            reading_tets = false;
-        }
-        else if (reading_points)
-        {
-            btScalar p;
-            ss >> p;
-            position.setX(p);
-            ss >> p;
-            position.setY(p);
-            ss >> p;
-            position.setZ(p);
-            X[x_count++] = position;
-        }
-        else if (reading_tets)
-        {
+	std::ifstream fs;
+	fs.open(vtk_file);
+	btAssert(fs);
+
+	typedef btAlignedObjectArray<int> Index;
+	std::string line;
+	btAlignedObjectArray<btVector3> X;
+	btVector3 position;
+	btAlignedObjectArray<Index> indices;
+	bool reading_points = false;
+	bool reading_tets = false;
+	size_t n_points = 0;
+	size_t n_tets = 0;
+	size_t x_count = 0;
+	size_t indices_count = 0;
+	while (std::getline(fs, line))
+	{
+		std::stringstream ss(line);
+		if (line.size() == (size_t)(0))
+		{
+		}
+		else if (line.substr(0, 6) == "POINTS")
+		{
+			reading_points = true;
+			reading_tets = false;
+			ss.ignore(128, ' ');  // ignore "POINTS"
+			ss >> n_points;
+			X.resize(n_points);
+		}
+		else if (line.substr(0, 5) == "CELLS")
+		{
+			reading_points = false;
+			reading_tets = true;
+			ss.ignore(128, ' ');  // ignore "CELLS"
+			ss >> n_tets;
+			indices.resize(n_tets);
+		}
+		else if (line.substr(0, 10) == "CELL_TYPES")
+		{
+			reading_points = false;
+			reading_tets = false;
+		}
+		else if (reading_points)
+		{
+			btScalar p;
+			ss >> p;
+			position.setX(p);
+			ss >> p;
+			position.setY(p);
+			ss >> p;
+			position.setZ(p);
+			//printf("v %f %f %f\n", position.getX(), position.getY(), position.getZ());
+			X[x_count++] = position;
+		}
+		else if (reading_tets)
+		{
 			int d;
 			ss >> d;
 			if (d != 4)
@@ -1308,317 +1309,355 @@ btSoftBody* btSoftBodyHelpers::CreateFromVtkFile(btSoftBodyWorldInfo& worldInfo,
 				fs.close();
 				return 0;
 			}
-            ss.ignore(128, ' '); // ignore "4"
-            Index tet;
-            tet.resize(4);
-            for (size_t i = 0; i < 4; i++)
-            {
-                ss >> tet[i];
-                printf("%d ", tet[i]);
-            }
-            printf("\n");
-            indices[indices_count++] = tet;
-        }
-    }
-    btSoftBody* psb = new btSoftBody(&worldInfo, n_points, &X[0], 0);
-    
-    for (int i = 0; i < n_tets; ++i)
-    {
-        const Index& ni = indices[i];
-        psb->appendTetra(ni[0], ni[1], ni[2], ni[3]);
-        {
-            psb->appendLink(ni[0], ni[1], 0, true);
-            psb->appendLink(ni[1], ni[2], 0, true);
-            psb->appendLink(ni[2], ni[0], 0, true);
-            psb->appendLink(ni[0], ni[3], 0, true);
-            psb->appendLink(ni[1], ni[3], 0, true);
-            psb->appendLink(ni[2], ni[3], 0, true);
-        }
-    }
-    
-    
-    generateBoundaryFaces(psb);
-    psb->initializeDmInverse();
-    psb->m_tetraScratches.resize(psb->m_tetras.size());
-    psb->m_tetraScratchesTn.resize(psb->m_tetras.size());
-    printf("Nodes:  %u\r\n", psb->m_nodes.size());
-    printf("Links:  %u\r\n", psb->m_links.size());
-    printf("Faces:  %u\r\n", psb->m_faces.size());
-    printf("Tetras: %u\r\n", psb->m_tetras.size());
-
-    fs.close();
-    return psb;
+			ss.ignore(128, ' ');  // ignore "4"
+			Index tet;
+			tet.resize(4);
+			for (size_t i = 0; i < 4; i++)
+			{
+				ss >> tet[i];
+				//printf("%d ", tet[i]);
+			}
+			//printf("\n");
+			indices[indices_count++] = tet;
+		}
+	}
+	btSoftBody* psb = new btSoftBody(&worldInfo, n_points, &X[0], 0);
+
+	for (int i = 0; i < n_tets; ++i)
+	{
+		const Index& ni = indices[i];
+		psb->appendTetra(ni[0], ni[1], ni[2], ni[3]);
+		{
+			psb->appendLink(ni[0], ni[1], 0, true);
+			psb->appendLink(ni[1], ni[2], 0, true);
+			psb->appendLink(ni[2], ni[0], 0, true);
+			psb->appendLink(ni[0], ni[3], 0, true);
+			psb->appendLink(ni[1], ni[3], 0, true);
+			psb->appendLink(ni[2], ni[3], 0, true);
+		}
+	}
+
+	generateBoundaryFaces(psb);
+	psb->initializeDmInverse();
+	psb->m_tetraScratches.resize(psb->m_tetras.size());
+	psb->m_tetraScratchesTn.resize(psb->m_tetras.size());
+	printf("Nodes:  %u\r\n", psb->m_nodes.size());
+	printf("Links:  %u\r\n", psb->m_links.size());
+	printf("Faces:  %u\r\n", psb->m_faces.size());
+	printf("Tetras: %u\r\n", psb->m_tetras.size());
+
+	fs.close();
+	return psb;
 }
 
 void btSoftBodyHelpers::generateBoundaryFaces(btSoftBody* psb)
 {
-    int counter = 0;
-    for (int i = 0; i < psb->m_nodes.size(); ++i)
-    {
-        psb->m_nodes[i].index = counter++;
-    }
-    typedef btAlignedObjectArray<int> Index;
-    btAlignedObjectArray<Index> indices;
-    indices.resize(psb->m_tetras.size());
-    for (int i = 0; i < indices.size(); ++i)
-    {
-        Index index;
-        index.push_back(psb->m_tetras[i].m_n[0]->index);
-        index.push_back(psb->m_tetras[i].m_n[1]->index);
-        index.push_back(psb->m_tetras[i].m_n[2]->index);
-        index.push_back(psb->m_tetras[i].m_n[3]->index);
-        indices[i] = index;
-    }
-    
-    std::map<std::vector<int>, std::vector<int> > dict;
-    for (int i = 0; i < indices.size(); ++i)
-    {
-        for (int j = 0; j < 4; ++j)
-        {
-            std::vector<int> f;
-            if (j == 0)
-            {
-                f.push_back(indices[i][1]);
-                f.push_back(indices[i][0]);
-                f.push_back(indices[i][2]);
-            }
-            if (j == 1)
-            {
-                f.push_back(indices[i][3]);
-                f.push_back(indices[i][0]);
-                f.push_back(indices[i][1]);
-            }
-            if (j == 2)
-            {
-                f.push_back(indices[i][3]);
-                f.push_back(indices[i][1]);
-                f.push_back(indices[i][2]);
-            }
-            if (j == 3)
-            {
-                f.push_back(indices[i][2]);
-                f.push_back(indices[i][0]);
-                f.push_back(indices[i][3]);
-            }
-            std::vector<int> f_sorted = f;
-            std::sort(f_sorted.begin(), f_sorted.end());
-            if (dict.find(f_sorted) != dict.end())
-            {
-                dict.erase(f_sorted);
-            }
-            else
-            {
-                dict.insert(std::make_pair(f_sorted, f));
-            }
-        }
-    }
-    
-    for (std::map<std::vector<int>, std::vector<int> >::iterator it = dict.begin(); it != dict.end(); ++it)
-    {
-        std::vector<int> f = it->second;
-        psb->appendFace(f[0], f[1], f[2]);
-    }
+	int counter = 0;
+	for (int i = 0; i < psb->m_nodes.size(); ++i)
+	{
+		psb->m_nodes[i].index = counter++;
+	}
+	typedef btAlignedObjectArray<int> Index;
+	btAlignedObjectArray<Index> indices;
+	indices.resize(psb->m_tetras.size());
+	for (int i = 0; i < indices.size(); ++i)
+	{
+		Index index;
+		index.push_back(psb->m_tetras[i].m_n[0]->index);
+		index.push_back(psb->m_tetras[i].m_n[1]->index);
+		index.push_back(psb->m_tetras[i].m_n[2]->index);
+		index.push_back(psb->m_tetras[i].m_n[3]->index);
+		indices[i] = index;
+	}
+
+	std::map<std::vector<int>, std::vector<int> > dict;
+	for (int i = 0; i < indices.size(); ++i)
+	{
+		for (int j = 0; j < 4; ++j)
+		{
+			std::vector<int> f;
+			if (j == 0)
+			{
+				f.push_back(indices[i][1]);
+				f.push_back(indices[i][0]);
+				f.push_back(indices[i][2]);
+			}
+			if (j == 1)
+			{
+				f.push_back(indices[i][3]);
+				f.push_back(indices[i][0]);
+				f.push_back(indices[i][1]);
+			}
+			if (j == 2)
+			{
+				f.push_back(indices[i][3]);
+				f.push_back(indices[i][1]);
+				f.push_back(indices[i][2]);
+			}
+			if (j == 3)
+			{
+				f.push_back(indices[i][2]);
+				f.push_back(indices[i][0]);
+				f.push_back(indices[i][3]);
+			}
+			std::vector<int> f_sorted = f;
+			std::sort(f_sorted.begin(), f_sorted.end());
+			if (dict.find(f_sorted) != dict.end())
+			{
+				dict.erase(f_sorted);
+			}
+			else
+			{
+				dict.insert(std::make_pair(f_sorted, f));
+			}
+		}
+	}
+
+	for (std::map<std::vector<int>, std::vector<int> >::iterator it = dict.begin(); it != dict.end(); ++it)
+	{
+		std::vector<int> f = it->second;
+		psb->appendFace(f[0], f[1], f[2]);
+		//printf("f %d %d %d\n", f[0] + 1, f[1] + 1, f[2] + 1);
+	}
 }
 
+//Write the surface mesh to an obj file.
 void btSoftBodyHelpers::writeObj(const char* filename, const btSoftBody* psb)
 {
-    std::ofstream fs;
-    fs.open(filename);
-    btAssert(fs);
-    for (int i = 0; i < psb->m_nodes.size(); ++i)
-    {
-        fs << "v";
-        for (int d = 0; d < 3; d++)
-        {
-             fs << " " << psb->m_nodes[i].m_x[d];
-        }
-        fs << "\n";
-    }
-    
-    for (int i = 0; i < psb->m_faces.size(); ++i)
-    {
-        fs << "f";
-        for (int n = 0; n < 3; n++)
-        {
-            fs << " " << psb->m_faces[i].m_n[n]->index + 1;
-        }
-        fs << "\n";
-    }
-    fs.close();
+	std::ofstream fs;
+	fs.open(filename);
+	btAssert(fs);
+
+	if (psb->m_tetras.size() > 0)
+	{
+		// For a tetrahedron mesh, we need to re-index the surface mesh so that it can be written to an obj file.
+		std::map<int, int> dict;
+		for (int i = 0; i < psb->m_faces.size(); i++)
+		{
+			for (int d = 0; d < 3; d++)
+			{
+				int index = psb->m_faces[i].m_n[d]->index;
+				if (dict.find(index) == dict.end())
+				{
+					int dict_size = dict.size();
+					dict[index] = dict_size;
+					fs << "v";
+					for (int k = 0; k < 3; k++)
+					{
+						fs << " " << psb->m_nodes[index].m_x[k];
+					}
+					fs << "\n";
+				}
+			}
+		}
+		// Write surface mesh.
+		for (int i = 0; i < psb->m_faces.size(); ++i)
+		{
+			fs << "f";
+			for (int n = 0; n < 3; n++)
+			{
+				fs << " " << dict[psb->m_faces[i].m_n[n]->index] + 1;
+			}
+			fs << "\n";
+		}
+	}
+	else
+	{
+		// For trimesh, directly write out all the nodes and faces.
+		for (int i = 0; i < psb->m_nodes.size(); ++i)
+		{
+			fs << "v";
+			for (int d = 0; d < 3; d++)
+			{
+				fs << " " << psb->m_nodes[i].m_x[d];
+			}
+			fs << "\n";
+		}
+
+		for (int i = 0; i < psb->m_faces.size(); ++i)
+		{
+			fs << "f";
+			for (int n = 0; n < 3; n++)
+			{
+				fs << " " << psb->m_faces[i].m_n[n]->index + 1;
+			}
+			fs << "\n";
+		}
+	}
+	fs.close();
 }
 
 void btSoftBodyHelpers::duplicateFaces(const char* filename, const btSoftBody* psb)
 {
-    std::ifstream fs_read;
-    fs_read.open(filename);
-    std::string line;
-    btVector3 pos;
-    btAlignedObjectArray<btAlignedObjectArray<int> > additional_faces;
-    while (std::getline(fs_read, line))
-    {
-        std::stringstream ss(line);
-        if (line[0] == 'v')
-        {
-        }
-        else if (line[0] == 'f')
-        {
-            ss.ignore();
-            int id0, id1, id2;
-            ss >> id0;
-            ss >> id1;
-            ss >> id2;
-            btAlignedObjectArray<int> new_face;
-            new_face.push_back(id1);
-            new_face.push_back(id0);
-            new_face.push_back(id2);
-            additional_faces.push_back(new_face);
-        }
-    }
-    fs_read.close();
-
-    std::ofstream fs_write;
-    fs_write.open(filename, std::ios_base::app);
-    for (int i = 0; i < additional_faces.size(); ++i)
-    {
-        fs_write << "f";
-        for (int n = 0; n < 3; n++)
-        {
-            fs_write << " " << additional_faces[i][n];
-        }
-        fs_write << "\n";
-    }
-    fs_write.close();
+	std::ifstream fs_read;
+	fs_read.open(filename);
+	std::string line;
+	btVector3 pos;
+	btAlignedObjectArray<btAlignedObjectArray<int> > additional_faces;
+	while (std::getline(fs_read, line))
+	{
+		std::stringstream ss(line);
+		if (line[0] == 'v')
+		{
+		}
+		else if (line[0] == 'f')
+		{
+			ss.ignore();
+			int id0, id1, id2;
+			ss >> id0;
+			ss >> id1;
+			ss >> id2;
+			btAlignedObjectArray<int> new_face;
+			new_face.push_back(id1);
+			new_face.push_back(id0);
+			new_face.push_back(id2);
+			additional_faces.push_back(new_face);
+		}
+	}
+	fs_read.close();
+
+	std::ofstream fs_write;
+	fs_write.open(filename, std::ios_base::app);
+	for (int i = 0; i < additional_faces.size(); ++i)
+	{
+		fs_write << "f";
+		for (int n = 0; n < 3; n++)
+		{
+			fs_write << " " << additional_faces[i][n];
+		}
+		fs_write << "\n";
+	}
+	fs_write.close();
 }
 
 // Given a simplex with vertices a,b,c,d, find the barycentric weights of p in this simplex
 void btSoftBodyHelpers::getBarycentricWeights(const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& d, const btVector3& p, btVector4& bary)
 {
-    btVector3 vap = p - a;
-    btVector3 vbp = p - b;
-    
-    btVector3 vab = b - a;
-    btVector3 vac = c - a;
-    btVector3 vad = d - a;
-    
-    btVector3 vbc = c - b;
-    btVector3 vbd = d - b;
-    btScalar va6 = (vbp.cross(vbd)).dot(vbc);
-    btScalar vb6 = (vap.cross(vac)).dot(vad);
-    btScalar vc6 = (vap.cross(vad)).dot(vab);
-    btScalar vd6 = (vap.cross(vab)).dot(vac);
-    btScalar v6 = btScalar(1) / (vab.cross(vac).dot(vad));
-    bary = btVector4(va6*v6, vb6*v6, vc6*v6, vd6*v6);
+	btVector3 vap = p - a;
+	btVector3 vbp = p - b;
+
+	btVector3 vab = b - a;
+	btVector3 vac = c - a;
+	btVector3 vad = d - a;
+
+	btVector3 vbc = c - b;
+	btVector3 vbd = d - b;
+	btScalar va6 = (vbp.cross(vbd)).dot(vbc);
+	btScalar vb6 = (vap.cross(vac)).dot(vad);
+	btScalar vc6 = (vap.cross(vad)).dot(vab);
+	btScalar vd6 = (vap.cross(vab)).dot(vac);
+	btScalar v6 = btScalar(1) / (vab.cross(vac).dot(vad));
+	bary = btVector4(va6 * v6, vb6 * v6, vc6 * v6, vd6 * v6);
 }
 
 // Given a simplex with vertices a,b,c, find the barycentric weights of p in this simplex. bary[3] = 0.
 void btSoftBodyHelpers::getBarycentricWeights(const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& p, btVector4& bary)
 {
-    btVector3 v0 = b - a, v1 = c - a, v2 = p - a;
-    btScalar d00 = btDot(v0, v0);
-    btScalar d01 = btDot(v0, v1);
-    btScalar d11 = btDot(v1, v1);
-    btScalar d20 = btDot(v2, v0);
-    btScalar d21 = btDot(v2, v1);
-    btScalar invDenom = 1.0 / (d00 * d11 - d01 * d01);
-    bary[1] = (d11 * d20 - d01 * d21) * invDenom;
-    bary[2] = (d00 * d21 - d01 * d20) * invDenom;
-    bary[0] = 1.0 - bary[1] - bary[2];
-    bary[3] = 0;
+	btVector3 v0 = b - a, v1 = c - a, v2 = p - a;
+	btScalar d00 = btDot(v0, v0);
+	btScalar d01 = btDot(v0, v1);
+	btScalar d11 = btDot(v1, v1);
+	btScalar d20 = btDot(v2, v0);
+	btScalar d21 = btDot(v2, v1);
+	btScalar invDenom = 1.0 / (d00 * d11 - d01 * d01);
+	bary[1] = (d11 * d20 - d01 * d21) * invDenom;
+	bary[2] = (d00 * d21 - d01 * d20) * invDenom;
+	bary[0] = 1.0 - bary[1] - bary[2];
+	bary[3] = 0;
 }
 
 // Iterate through all render nodes to find the simulation tetrahedron that contains the render node and record the barycentric weights
 // If the node is not inside any tetrahedron, assign it to the tetrahedron in which the node has the least negative barycentric weight
 void btSoftBodyHelpers::interpolateBarycentricWeights(btSoftBody* psb)
 {
-    psb->m_z.resize(0);
-    psb->m_renderNodesInterpolationWeights.resize(psb->m_renderNodes.size());
-    psb->m_renderNodesParents.resize(psb->m_renderNodes.size());
-    for (int i = 0; i < psb->m_renderNodes.size(); ++i)
-    {
-        const btVector3& p = psb->m_renderNodes[i].m_x;
-        btVector4 bary;
-        btVector4 optimal_bary;
-        btScalar min_bary_weight = -1e3;
-        btAlignedObjectArray<const btSoftBody::Node*> optimal_parents;
-        for (int j = 0; j < psb->m_tetras.size(); ++j)
-        {
-            const btSoftBody::Tetra& t = psb->m_tetras[j];
-            getBarycentricWeights(t.m_n[0]->m_x, t.m_n[1]->m_x, t.m_n[2]->m_x, t.m_n[3]->m_x, p, bary);
-            btScalar new_min_bary_weight = bary[0];
-            for (int k = 1; k < 4; ++k)
-            {
-                new_min_bary_weight = btMin(new_min_bary_weight, bary[k]);
-            }
-            if (new_min_bary_weight > min_bary_weight)
-            {
-                btAlignedObjectArray<const btSoftBody::Node*> parents;
-                parents.push_back(t.m_n[0]);
-                parents.push_back(t.m_n[1]);
-                parents.push_back(t.m_n[2]);
-                parents.push_back(t.m_n[3]);
-                optimal_parents = parents;
-                optimal_bary = bary;
-                min_bary_weight = new_min_bary_weight;
-                // stop searching if p is inside the tetrahedron at hand
-                if (bary[0]>=0. && bary[1]>=0. && bary[2]>=0. && bary[3]>=0.)
-                {
-                    break;
-                }
-            }
-        }
-        psb->m_renderNodesInterpolationWeights[i] = optimal_bary;
-        psb->m_renderNodesParents[i] = optimal_parents;
-    }
+	psb->m_z.resize(0);
+	psb->m_renderNodesInterpolationWeights.resize(psb->m_renderNodes.size());
+	psb->m_renderNodesParents.resize(psb->m_renderNodes.size());
+	for (int i = 0; i < psb->m_renderNodes.size(); ++i)
+	{
+		const btVector3& p = psb->m_renderNodes[i].m_x;
+		btVector4 bary;
+		btVector4 optimal_bary;
+		btScalar min_bary_weight = -1e3;
+		btAlignedObjectArray<const btSoftBody::Node*> optimal_parents;
+		for (int j = 0; j < psb->m_tetras.size(); ++j)
+		{
+			const btSoftBody::Tetra& t = psb->m_tetras[j];
+			getBarycentricWeights(t.m_n[0]->m_x, t.m_n[1]->m_x, t.m_n[2]->m_x, t.m_n[3]->m_x, p, bary);
+			btScalar new_min_bary_weight = bary[0];
+			for (int k = 1; k < 4; ++k)
+			{
+				new_min_bary_weight = btMin(new_min_bary_weight, bary[k]);
+			}
+			if (new_min_bary_weight > min_bary_weight)
+			{
+				btAlignedObjectArray<const btSoftBody::Node*> parents;
+				parents.push_back(t.m_n[0]);
+				parents.push_back(t.m_n[1]);
+				parents.push_back(t.m_n[2]);
+				parents.push_back(t.m_n[3]);
+				optimal_parents = parents;
+				optimal_bary = bary;
+				min_bary_weight = new_min_bary_weight;
+				// stop searching if p is inside the tetrahedron at hand
+				if (bary[0] >= 0. && bary[1] >= 0. && bary[2] >= 0. && bary[3] >= 0.)
+				{
+					break;
+				}
+			}
+		}
+		psb->m_renderNodesInterpolationWeights[i] = optimal_bary;
+		psb->m_renderNodesParents[i] = optimal_parents;
+	}
 }
 
-
 // Iterate through all render nodes to find the simulation triangle that's closest to the node in the barycentric sense.
 void btSoftBodyHelpers::extrapolateBarycentricWeights(btSoftBody* psb)
 {
-    psb->m_renderNodesInterpolationWeights.resize(psb->m_renderNodes.size());
-    psb->m_renderNodesParents.resize(psb->m_renderNodes.size());
-    psb->m_z.resize(psb->m_renderNodes.size());
-    for (int i = 0; i < psb->m_renderNodes.size(); ++i)
-    {
-        const btVector3& p = psb->m_renderNodes[i].m_x;
-        btVector4 bary;
-        btVector4 optimal_bary;
-        btScalar min_bary_weight = -SIMD_INFINITY;
-        btAlignedObjectArray<const btSoftBody::Node*> optimal_parents;
-        btScalar dist = 0, optimal_dist = 0;
-        for (int j = 0; j < psb->m_faces.size(); ++j)
-        {
-            const btSoftBody::Face& f = psb->m_faces[j];
-            btVector3 n = btCross(f.m_n[1]->m_x - f.m_n[0]->m_x,  f.m_n[2]->m_x - f.m_n[0]->m_x);
-            btVector3 unit_n = n.normalized();
-            dist = (p-f.m_n[0]->m_x).dot(unit_n);
-            btVector3 proj_p = p - dist*unit_n;
-            getBarycentricWeights(f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, proj_p, bary);
-            btScalar new_min_bary_weight = bary[0];
-            for (int k = 1; k < 3; ++k)
-            {
-                new_min_bary_weight = btMin(new_min_bary_weight, bary[k]);
-            }
-
-            // p is out of the current best triangle, we found a traingle that's better
-            bool better_than_closest_outisde = (new_min_bary_weight > min_bary_weight && min_bary_weight<0.);
-            // p is inside of the current best triangle, we found a triangle that's better
-            bool better_than_best_inside = (new_min_bary_weight>=0 &&  min_bary_weight>=0 && btFabs(dist)<btFabs(optimal_dist));
-
-            if (better_than_closest_outisde || better_than_best_inside)
-            {
-                btAlignedObjectArray<const btSoftBody::Node*> parents;
-                parents.push_back(f.m_n[0]);
-                parents.push_back(f.m_n[1]);
-                parents.push_back(f.m_n[2]);
-                optimal_parents = parents;
-                optimal_bary = bary;
-                optimal_dist = dist;
-                min_bary_weight = new_min_bary_weight;
-            }
-        }
-        psb->m_renderNodesInterpolationWeights[i] = optimal_bary;
-        psb->m_renderNodesParents[i] = optimal_parents;
-        psb->m_z[i] = optimal_dist;
-    }
+	psb->m_renderNodesInterpolationWeights.resize(psb->m_renderNodes.size());
+	psb->m_renderNodesParents.resize(psb->m_renderNodes.size());
+	psb->m_z.resize(psb->m_renderNodes.size());
+	for (int i = 0; i < psb->m_renderNodes.size(); ++i)
+	{
+		const btVector3& p = psb->m_renderNodes[i].m_x;
+		btVector4 bary;
+		btVector4 optimal_bary;
+		btScalar min_bary_weight = -SIMD_INFINITY;
+		btAlignedObjectArray<const btSoftBody::Node*> optimal_parents;
+		btScalar dist = 0, optimal_dist = 0;
+		for (int j = 0; j < psb->m_faces.size(); ++j)
+		{
+			const btSoftBody::Face& f = psb->m_faces[j];
+			btVector3 n = btCross(f.m_n[1]->m_x - f.m_n[0]->m_x, f.m_n[2]->m_x - f.m_n[0]->m_x);
+			btVector3 unit_n = n.normalized();
+			dist = (p - f.m_n[0]->m_x).dot(unit_n);
+			btVector3 proj_p = p - dist * unit_n;
+			getBarycentricWeights(f.m_n[0]->m_x, f.m_n[1]->m_x, f.m_n[2]->m_x, proj_p, bary);
+			btScalar new_min_bary_weight = bary[0];
+			for (int k = 1; k < 3; ++k)
+			{
+				new_min_bary_weight = btMin(new_min_bary_weight, bary[k]);
+			}
+
+			// p is out of the current best triangle, we found a triangle that's better
+			bool better_than_closest_outisde = (new_min_bary_weight > min_bary_weight && min_bary_weight < 0.);
+			// p is inside of the current best triangle, we found a triangle that's better
+			bool better_than_best_inside = (new_min_bary_weight >= 0 && min_bary_weight >= 0 && btFabs(dist) < btFabs(optimal_dist));
+
+			if (better_than_closest_outisde || better_than_best_inside)
+			{
+				btAlignedObjectArray<const btSoftBody::Node*> parents;
+				parents.push_back(f.m_n[0]);
+				parents.push_back(f.m_n[1]);
+				parents.push_back(f.m_n[2]);
+				optimal_parents = parents;
+				optimal_bary = bary;
+				optimal_dist = dist;
+				min_bary_weight = new_min_bary_weight;
+			}
+		}
+		psb->m_renderNodesInterpolationWeights[i] = optimal_bary;
+		psb->m_renderNodesParents[i] = optimal_parents;
+		psb->m_z[i] = optimal_dist;
+	}
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.h b/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.h
index abe1870890..237d29761d 100644
--- a/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.h
+++ b/thirdparty/bullet/BulletSoftBody/btSoftBodyHelpers.h
@@ -93,7 +93,7 @@ struct btSoftBodyHelpers
 								   int resy,
 								   int fixeds,
 								   bool gendiags,
-                                   btScalar perturbation = 0.);
+								   btScalar perturbation = 0.);
 	/* Create a patch with UV Texture Coordinates	*/
 	static btSoftBody* CreatePatchUV(btSoftBodyWorldInfo& worldInfo,
 									 const btVector3& corner00,
@@ -142,21 +142,21 @@ struct btSoftBodyHelpers
 											bool bfacelinks,
 											bool btetralinks,
 											bool bfacesfromtetras);
-    static btSoftBody* CreateFromVtkFile(btSoftBodyWorldInfo& worldInfo, const char* vtk_file);
+	static btSoftBody* CreateFromVtkFile(btSoftBodyWorldInfo& worldInfo, const char* vtk_file);
 
-    static void writeObj(const char* file, const btSoftBody* psb);
-    
-    static void getBarycentricWeights(const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& d, const btVector3& p, btVector4& bary);
-    
-    static void getBarycentricWeights(const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& p, btVector4& bary);
-    
-    static void interpolateBarycentricWeights(btSoftBody* psb);
-    
-    static void extrapolateBarycentricWeights(btSoftBody* psb);
-    
-    static void generateBoundaryFaces(btSoftBody* psb);
-    
-    static void duplicateFaces(const char* filename, const btSoftBody* psb);
+	static void writeObj(const char* file, const btSoftBody* psb);
+
+	static void getBarycentricWeights(const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& d, const btVector3& p, btVector4& bary);
+
+	static void getBarycentricWeights(const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& p, btVector4& bary);
+
+	static void interpolateBarycentricWeights(btSoftBody* psb);
+
+	static void extrapolateBarycentricWeights(btSoftBody* psb);
+
+	static void generateBoundaryFaces(btSoftBody* psb);
+
+	static void duplicateFaces(const char* filename, const btSoftBody* psb);
 	/// Sort the list of links to move link calculations that are dependent upon earlier
 	/// ones as far as possible away from the calculation of those values
 	/// This tends to make adjacent loop iterations not dependent upon one another,
diff --git a/thirdparty/bullet/BulletSoftBody/btSoftBodyInternals.h b/thirdparty/bullet/BulletSoftBody/btSoftBodyInternals.h
index b9ebc95b6b..c17bbb5cd4 100644
--- a/thirdparty/bullet/BulletSoftBody/btSoftBodyInternals.h
+++ b/thirdparty/bullet/BulletSoftBody/btSoftBodyInternals.h
@@ -32,86 +32,85 @@ subject to the following restrictions:
 
 // Given a multibody link, a contact point and a contact direction, fill in the jacobian data needed to calculate the velocity change given an impulse in the contact direction
 static SIMD_FORCE_INLINE void findJacobian(const btMultiBodyLinkCollider* multibodyLinkCol,
-                         btMultiBodyJacobianData& jacobianData,
-                         const btVector3& contact_point,
-                         const btVector3& dir)
-{
-    const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
-    jacobianData.m_jacobians.resize(ndof);
-    jacobianData.m_deltaVelocitiesUnitImpulse.resize(ndof);
-    btScalar* jac = &jacobianData.m_jacobians[0];
-    
-    multibodyLinkCol->m_multiBody->fillContactJacobianMultiDof(multibodyLinkCol->m_link, contact_point, dir, jac, jacobianData.scratch_r, jacobianData.scratch_v, jacobianData.scratch_m);
-    multibodyLinkCol->m_multiBody->calcAccelerationDeltasMultiDof(&jacobianData.m_jacobians[0], &jacobianData.m_deltaVelocitiesUnitImpulse[0], jacobianData.scratch_r, jacobianData.scratch_v);
+										   btMultiBodyJacobianData& jacobianData,
+										   const btVector3& contact_point,
+										   const btVector3& dir)
+{
+	const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
+	jacobianData.m_jacobians.resize(ndof);
+	jacobianData.m_deltaVelocitiesUnitImpulse.resize(ndof);
+	btScalar* jac = &jacobianData.m_jacobians[0];
+
+	multibodyLinkCol->m_multiBody->fillContactJacobianMultiDof(multibodyLinkCol->m_link, contact_point, dir, jac, jacobianData.scratch_r, jacobianData.scratch_v, jacobianData.scratch_m);
+	multibodyLinkCol->m_multiBody->calcAccelerationDeltasMultiDof(&jacobianData.m_jacobians[0], &jacobianData.m_deltaVelocitiesUnitImpulse[0], jacobianData.scratch_r, jacobianData.scratch_v);
 }
 static SIMD_FORCE_INLINE btVector3 generateUnitOrthogonalVector(const btVector3& u)
 {
-    btScalar ux = u.getX();
-    btScalar uy = u.getY();
-    btScalar uz = u.getZ();
-    btScalar ax = std::abs(ux);
-    btScalar ay = std::abs(uy);
-    btScalar az = std::abs(uz);
-    btVector3 v;
-    if (ax <= ay && ax <= az)
-        v = btVector3(0, -uz, uy);
-    else if (ay <= ax && ay <= az)
-        v = btVector3(-uz, 0, ux);
-    else
-        v = btVector3(-uy, ux, 0);
-    v.normalize();
-    return v;
+	btScalar ux = u.getX();
+	btScalar uy = u.getY();
+	btScalar uz = u.getZ();
+	btScalar ax = std::abs(ux);
+	btScalar ay = std::abs(uy);
+	btScalar az = std::abs(uz);
+	btVector3 v;
+	if (ax <= ay && ax <= az)
+		v = btVector3(0, -uz, uy);
+	else if (ay <= ax && ay <= az)
+		v = btVector3(-uz, 0, ux);
+	else
+		v = btVector3(-uy, ux, 0);
+	v.normalize();
+	return v;
 }
 
 static SIMD_FORCE_INLINE bool proximityTest(const btVector3& x1, const btVector3& x2, const btVector3& x3, const btVector3& x4, const btVector3& normal, const btScalar& mrg, btVector3& bary)
 {
-    btVector3 x43 = x4-x3;
-    if (std::abs(x43.dot(normal)) > mrg)
-        return false;
-    btVector3 x13 = x1-x3;
-    btVector3 x23 = x2-x3;
-    btScalar a11 = x13.length2();
-    btScalar a22 = x23.length2();
-    btScalar a12 = x13.dot(x23);
-    btScalar b1 = x13.dot(x43);
-    btScalar b2 = x23.dot(x43);
-    btScalar det = a11*a22 - a12*a12;
-    if (det < SIMD_EPSILON)
-        return false;
-    btScalar w1 = (b1*a22-b2*a12)/det;
-    btScalar w2 = (b2*a11-b1*a12)/det;
-    btScalar w3 = 1-w1-w2;
-    btScalar delta = mrg / std::sqrt(0.5*std::abs(x13.cross(x23).safeNorm()));
-    bary = btVector3(w1,w2,w3);
-    for (int i = 0; i < 3; ++i)
-    {
-        if (bary[i] < -delta || bary[i] > 1+delta)
-            return false;
-    }
-    return true;
+	btVector3 x43 = x4 - x3;
+	if (std::abs(x43.dot(normal)) > mrg)
+		return false;
+	btVector3 x13 = x1 - x3;
+	btVector3 x23 = x2 - x3;
+	btScalar a11 = x13.length2();
+	btScalar a22 = x23.length2();
+	btScalar a12 = x13.dot(x23);
+	btScalar b1 = x13.dot(x43);
+	btScalar b2 = x23.dot(x43);
+	btScalar det = a11 * a22 - a12 * a12;
+	if (det < SIMD_EPSILON)
+		return false;
+	btScalar w1 = (b1 * a22 - b2 * a12) / det;
+	btScalar w2 = (b2 * a11 - b1 * a12) / det;
+	btScalar w3 = 1 - w1 - w2;
+	btScalar delta = mrg / std::sqrt(0.5 * std::abs(x13.cross(x23).safeNorm()));
+	bary = btVector3(w1, w2, w3);
+	for (int i = 0; i < 3; ++i)
+	{
+		if (bary[i] < -delta || bary[i] > 1 + delta)
+			return false;
+	}
+	return true;
 }
 static const int KDOP_COUNT = 13;
-static btVector3 dop[KDOP_COUNT]={btVector3(1,0,0),
-	btVector3(0,1,0),
-	btVector3(0,0,1),
-	btVector3(1,1,0),
-	btVector3(1,0,1),
-	btVector3(0,1,1),
-	btVector3(1,-1,0),
-	btVector3(1,0,-1),
-	btVector3(0,1,-1),
-	btVector3(1,1,1),
-	btVector3(1,-1,1),
-	btVector3(1,1,-1),
-	btVector3(1,-1,-1)
-};
+static btVector3 dop[KDOP_COUNT] = {btVector3(1, 0, 0),
+									btVector3(0, 1, 0),
+									btVector3(0, 0, 1),
+									btVector3(1, 1, 0),
+									btVector3(1, 0, 1),
+									btVector3(0, 1, 1),
+									btVector3(1, -1, 0),
+									btVector3(1, 0, -1),
+									btVector3(0, 1, -1),
+									btVector3(1, 1, 1),
+									btVector3(1, -1, 1),
+									btVector3(1, 1, -1),
+									btVector3(1, -1, -1)};
 
 static inline int getSign(const btVector3& n, const btVector3& x)
 {
 	btScalar d = n.dot(x);
-	if (d>SIMD_EPSILON)
+	if (d > SIMD_EPSILON)
 		return 1;
-	if (d<-SIMD_EPSILON)
+	if (d < -SIMD_EPSILON)
 		return -1;
 	return 0;
 }
@@ -119,13 +118,12 @@ static inline int getSign(const btVector3& n, const btVector3& x)
 static SIMD_FORCE_INLINE bool hasSeparatingPlane(const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt)
 {
 	btVector3 hex[6] = {face->m_n[0]->m_x - node->m_x,
-		face->m_n[1]->m_x - node->m_x,
-		face->m_n[2]->m_x - node->m_x,
-		face->m_n[0]->m_x + dt*face->m_n[0]->m_v - node->m_x,
-		face->m_n[1]->m_x + dt*face->m_n[1]->m_v - node->m_x,
-		face->m_n[2]->m_x + dt*face->m_n[2]->m_v - node->m_x
-	};
-	btVector3 segment = dt*node->m_v;
+						face->m_n[1]->m_x - node->m_x,
+						face->m_n[2]->m_x - node->m_x,
+						face->m_n[0]->m_x + dt * face->m_n[0]->m_v - node->m_x,
+						face->m_n[1]->m_x + dt * face->m_n[1]->m_v - node->m_x,
+						face->m_n[2]->m_x + dt * face->m_n[2]->m_v - node->m_x};
+	btVector3 segment = dt * node->m_v;
 	for (int i = 0; i < KDOP_COUNT; ++i)
 	{
 		int s = getSign(dop[i], segment);
@@ -143,488 +141,494 @@ static SIMD_FORCE_INLINE bool hasSeparatingPlane(const btSoftBody::Face* face, c
 
 static SIMD_FORCE_INLINE bool nearZero(const btScalar& a)
 {
-    return (a>-SAFE_EPSILON && a<SAFE_EPSILON);
+	return (a > -SAFE_EPSILON && a < SAFE_EPSILON);
 }
 static SIMD_FORCE_INLINE bool sameSign(const btScalar& a, const btScalar& b)
 {
-    return (nearZero(a) || nearZero(b) || (a>SAFE_EPSILON && b>SAFE_EPSILON) || (a<-SAFE_EPSILON && b<-SAFE_EPSILON));
+	return (nearZero(a) || nearZero(b) || (a > SAFE_EPSILON && b > SAFE_EPSILON) || (a < -SAFE_EPSILON && b < -SAFE_EPSILON));
 }
 static SIMD_FORCE_INLINE bool diffSign(const btScalar& a, const btScalar& b)
 {
-    return !sameSign(a, b);
-}
-inline btScalar evaluateBezier2(const btScalar &p0, const btScalar &p1, const btScalar &p2, const btScalar &t, const btScalar &s)
-{
-    btScalar s2 = s*s;
-    btScalar t2 = t*t;
-    
-    return p0*s2+p1*btScalar(2.0)*s*t+p2*t2;
-}
-inline btScalar evaluateBezier(const btScalar &p0, const btScalar &p1, const btScalar &p2, const btScalar &p3, const btScalar &t, const btScalar &s)
-{
-    btScalar s2 = s*s;
-    btScalar s3 = s2*s;
-    btScalar t2 = t*t;
-    btScalar t3 = t2*t;
-    
-    return p0*s3+p1*btScalar(3.0)*s2*t+p2*btScalar(3.0)*s*t2+p3*t3;
-}
-static SIMD_FORCE_INLINE bool getSigns(bool type_c, const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& t0, const btScalar& t1, btScalar &lt0, btScalar &lt1)
-{
-    if (sameSign(t0, t1)) {
-        lt0 = t0;
-        lt1 = t0;
-        return true;
-    }
-    
-    if (type_c || diffSign(k0, k3)) {
-        btScalar ft = evaluateBezier(k0, k1, k2, k3, t0, -t1);
-        if (t0<-0)
-            ft = -ft;
-        
-        if (sameSign(ft, k0)) {
-            lt0 = t1;
-            lt1 = t1;
-        }
-        else {
-            lt0 = t0;
-            lt1 = t0;
-        }
-        return true;
-    }
-    
-    if (!type_c) {
-        btScalar ft = evaluateBezier(k0, k1, k2, k3, t0, -t1);
-        if (t0<-0)
-            ft = -ft;
-        
-        if (diffSign(ft, k0)) {
-            lt0 = t0;
-            lt1 = t1;
-            return true;
-        }
-        
-        btScalar fk = evaluateBezier2(k1-k0, k2-k1, k3-k2, t0, -t1);
-        
-        if (sameSign(fk, k1-k0))
-            lt0 = lt1 = t1;
-        else
-            lt0 = lt1 = t0;
-        
-        return true;
-    }
-    return false;
+	return !sameSign(a, b);
+}
+inline btScalar evaluateBezier2(const btScalar& p0, const btScalar& p1, const btScalar& p2, const btScalar& t, const btScalar& s)
+{
+	btScalar s2 = s * s;
+	btScalar t2 = t * t;
+
+	return p0 * s2 + p1 * btScalar(2.0) * s * t + p2 * t2;
+}
+inline btScalar evaluateBezier(const btScalar& p0, const btScalar& p1, const btScalar& p2, const btScalar& p3, const btScalar& t, const btScalar& s)
+{
+	btScalar s2 = s * s;
+	btScalar s3 = s2 * s;
+	btScalar t2 = t * t;
+	btScalar t3 = t2 * t;
+
+	return p0 * s3 + p1 * btScalar(3.0) * s2 * t + p2 * btScalar(3.0) * s * t2 + p3 * t3;
+}
+static SIMD_FORCE_INLINE bool getSigns(bool type_c, const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& t0, const btScalar& t1, btScalar& lt0, btScalar& lt1)
+{
+	if (sameSign(t0, t1))
+	{
+		lt0 = t0;
+		lt1 = t0;
+		return true;
+	}
+
+	if (type_c || diffSign(k0, k3))
+	{
+		btScalar ft = evaluateBezier(k0, k1, k2, k3, t0, -t1);
+		if (t0 < -0)
+			ft = -ft;
+
+		if (sameSign(ft, k0))
+		{
+			lt0 = t1;
+			lt1 = t1;
+		}
+		else
+		{
+			lt0 = t0;
+			lt1 = t0;
+		}
+		return true;
+	}
+
+	if (!type_c)
+	{
+		btScalar ft = evaluateBezier(k0, k1, k2, k3, t0, -t1);
+		if (t0 < -0)
+			ft = -ft;
+
+		if (diffSign(ft, k0))
+		{
+			lt0 = t0;
+			lt1 = t1;
+			return true;
+		}
+
+		btScalar fk = evaluateBezier2(k1 - k0, k2 - k1, k3 - k2, t0, -t1);
+
+		if (sameSign(fk, k1 - k0))
+			lt0 = lt1 = t1;
+		else
+			lt0 = lt1 = t0;
+
+		return true;
+	}
+	return false;
 }
 
 static SIMD_FORCE_INLINE void getBernsteinCoeff(const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt, btScalar& k0, btScalar& k1, btScalar& k2, btScalar& k3)
 {
-    const btVector3& n0 = face->m_n0;
-    const btVector3& n1 = face->m_n1;
-    btVector3 n_hat = n0 + n1 - face->m_vn;
-    btVector3 p0ma0 = node->m_x - face->m_n[0]->m_x;
-    btVector3 p1ma1 = node->m_q - face->m_n[0]->m_q;
-    k0 = (p0ma0).dot(n0) * 3.0;
-    k1 = (p0ma0).dot(n_hat) + (p1ma1).dot(n0);
-    k2 = (p1ma1).dot(n_hat) + (p0ma0).dot(n1);
-    k3 = (p1ma1).dot(n1) * 3.0;
+	const btVector3& n0 = face->m_n0;
+	const btVector3& n1 = face->m_n1;
+	btVector3 n_hat = n0 + n1 - face->m_vn;
+	btVector3 p0ma0 = node->m_x - face->m_n[0]->m_x;
+	btVector3 p1ma1 = node->m_q - face->m_n[0]->m_q;
+	k0 = (p0ma0).dot(n0) * 3.0;
+	k1 = (p0ma0).dot(n_hat) + (p1ma1).dot(n0);
+	k2 = (p1ma1).dot(n_hat) + (p0ma0).dot(n1);
+	k3 = (p1ma1).dot(n1) * 3.0;
 }
 
 static SIMD_FORCE_INLINE void polyDecomposition(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& j0, const btScalar& j1, const btScalar& j2, btScalar& u0, btScalar& u1, btScalar& v0, btScalar& v1)
 {
-    btScalar denom = 4.0 * (j1-j2) * (j1-j0) + (j2-j0) * (j2-j0);
-    u0 = (2.0*(j1-j2)*(3.0*k1-2.0*k0-k3) - (j0-j2)*(3.0*k2-2.0*k3-k0)) / denom;
-    u1 = (2.0*(j1-j0)*(3.0*k2-2.0*k3-k0) - (j2-j0)*(3.0*k1-2.0*k0-k3)) / denom;
-    v0 = k0-u0*j0;
-    v1 = k3-u1*j2;
+	btScalar denom = 4.0 * (j1 - j2) * (j1 - j0) + (j2 - j0) * (j2 - j0);
+	u0 = (2.0 * (j1 - j2) * (3.0 * k1 - 2.0 * k0 - k3) - (j0 - j2) * (3.0 * k2 - 2.0 * k3 - k0)) / denom;
+	u1 = (2.0 * (j1 - j0) * (3.0 * k2 - 2.0 * k3 - k0) - (j2 - j0) * (3.0 * k1 - 2.0 * k0 - k3)) / denom;
+	v0 = k0 - u0 * j0;
+	v1 = k3 - u1 * j2;
 }
 
 static SIMD_FORCE_INLINE bool rootFindingLemma(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3)
 {
-    btScalar u0, u1, v0, v1;
-    btScalar j0 = 3.0*(k1-k0);
-    btScalar j1 = 3.0*(k2-k1);
-    btScalar j2 = 3.0*(k3-k2);
-    polyDecomposition(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1);
-    if (sameSign(v0, v1))
-    {
-        btScalar Ypa = j0*(1.0-v0)*(1.0-v0) + 2.0*j1*v0*(1.0-v0) + j2*v0*v0; // Y'(v0)
-        if (sameSign(Ypa, j0))
-        {
-            return (diffSign(k0,v1));
-        }
-    }
-    return diffSign(k0,v0);
-}
-
-static SIMD_FORCE_INLINE void getJs(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btSoftBody::Node* a, const btSoftBody::Node* b, const btSoftBody::Node* c, const btSoftBody::Node* p, const btScalar& dt, btScalar& j0,  btScalar& j1, btScalar& j2)
-{
-    const btVector3& a0 = a->m_x;
-    const btVector3& b0 = b->m_x;
-    const btVector3& c0 = c->m_x;
-    const btVector3& va = a->m_v;
-    const btVector3& vb = b->m_v;
-    const btVector3& vc = c->m_v;
-    const btVector3 a1 = a0 + dt*va;
-    const btVector3 b1 = b0 + dt*vb;
-    const btVector3 c1 = c0 + dt*vc;
-    btVector3 n0 = (b0-a0).cross(c0-a0);
-    btVector3 n1 = (b1-a1).cross(c1-a1);
-    btVector3 n_hat = n0+n1 - dt*dt*(vb-va).cross(vc-va);
-    const btVector3& p0 = p->m_x;
-    const btVector3& vp = p->m_v;
-    btVector3 p1 = p0 + dt*vp;
-    btVector3 m0 = (b0-p0).cross(c0-p0);
-    btVector3 m1 = (b1-p1).cross(c1-p1);
-    btVector3 m_hat = m0+m1 - dt*dt*(vb-vp).cross(vc-vp);
-    btScalar l0 = m0.dot(n0);
-    btScalar l1 = 0.25 * (m0.dot(n_hat) + m_hat.dot(n0));
-    btScalar l2 = btScalar(1)/btScalar(6)*(m0.dot(n1) + m_hat.dot(n_hat) + m1.dot(n0));
-    btScalar l3 = 0.25 * (m_hat.dot(n1) + m1.dot(n_hat));
-    btScalar l4 = m1.dot(n1);
-    
-    btScalar k1p = 0.25 * k0 + 0.75 * k1;
-    btScalar k2p = 0.5 * k1 + 0.5 * k2;
-    btScalar k3p = 0.75 * k2 + 0.25 * k3;
-    
-    btScalar s0 = (l1 * k0 - l0 * k1p)*4.0;
-    btScalar s1 = (l2 * k0 - l0 * k2p)*2.0;
-    btScalar s2 = (l3 * k0 - l0 * k3p)*btScalar(4)/btScalar(3);
-    btScalar s3 = l4 * k0 - l0 * k3;
-    
-    j0 = (s1*k0 - s0*k1) * 3.0;
-    j1 = (s2*k0 - s0*k2) * 1.5;
-    j2 = (s3*k0 - s0*k3);
+	btScalar u0, u1, v0, v1;
+	btScalar j0 = 3.0 * (k1 - k0);
+	btScalar j1 = 3.0 * (k2 - k1);
+	btScalar j2 = 3.0 * (k3 - k2);
+	polyDecomposition(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1);
+	if (sameSign(v0, v1))
+	{
+		btScalar Ypa = j0 * (1.0 - v0) * (1.0 - v0) + 2.0 * j1 * v0 * (1.0 - v0) + j2 * v0 * v0;  // Y'(v0)
+		if (sameSign(Ypa, j0))
+		{
+			return (diffSign(k0, v1));
+		}
+	}
+	return diffSign(k0, v0);
+}
+
+static SIMD_FORCE_INLINE void getJs(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btSoftBody::Node* a, const btSoftBody::Node* b, const btSoftBody::Node* c, const btSoftBody::Node* p, const btScalar& dt, btScalar& j0, btScalar& j1, btScalar& j2)
+{
+	const btVector3& a0 = a->m_x;
+	const btVector3& b0 = b->m_x;
+	const btVector3& c0 = c->m_x;
+	const btVector3& va = a->m_v;
+	const btVector3& vb = b->m_v;
+	const btVector3& vc = c->m_v;
+	const btVector3 a1 = a0 + dt * va;
+	const btVector3 b1 = b0 + dt * vb;
+	const btVector3 c1 = c0 + dt * vc;
+	btVector3 n0 = (b0 - a0).cross(c0 - a0);
+	btVector3 n1 = (b1 - a1).cross(c1 - a1);
+	btVector3 n_hat = n0 + n1 - dt * dt * (vb - va).cross(vc - va);
+	const btVector3& p0 = p->m_x;
+	const btVector3& vp = p->m_v;
+	btVector3 p1 = p0 + dt * vp;
+	btVector3 m0 = (b0 - p0).cross(c0 - p0);
+	btVector3 m1 = (b1 - p1).cross(c1 - p1);
+	btVector3 m_hat = m0 + m1 - dt * dt * (vb - vp).cross(vc - vp);
+	btScalar l0 = m0.dot(n0);
+	btScalar l1 = 0.25 * (m0.dot(n_hat) + m_hat.dot(n0));
+	btScalar l2 = btScalar(1) / btScalar(6) * (m0.dot(n1) + m_hat.dot(n_hat) + m1.dot(n0));
+	btScalar l3 = 0.25 * (m_hat.dot(n1) + m1.dot(n_hat));
+	btScalar l4 = m1.dot(n1);
+
+	btScalar k1p = 0.25 * k0 + 0.75 * k1;
+	btScalar k2p = 0.5 * k1 + 0.5 * k2;
+	btScalar k3p = 0.75 * k2 + 0.25 * k3;
+
+	btScalar s0 = (l1 * k0 - l0 * k1p) * 4.0;
+	btScalar s1 = (l2 * k0 - l0 * k2p) * 2.0;
+	btScalar s2 = (l3 * k0 - l0 * k3p) * btScalar(4) / btScalar(3);
+	btScalar s3 = l4 * k0 - l0 * k3;
+
+	j0 = (s1 * k0 - s0 * k1) * 3.0;
+	j1 = (s2 * k0 - s0 * k2) * 1.5;
+	j2 = (s3 * k0 - s0 * k3);
 }
 
 static SIMD_FORCE_INLINE bool signDetermination1Internal(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& u0, const btScalar& u1, const btScalar& v0, const btScalar& v1)
 {
-    btScalar Yu0 = k0*(1.0-u0)*(1.0-u0)*(1.0-u0) + 3.0*k1*u0*(1.0-u0)*(1.0-u0) + 3.0*k2*u0*u0*(1.0-u0) + k3*u0*u0*u0; // Y(u0)
-    btScalar Yv0 = k0*(1.0-v0)*(1.0-v0)*(1.0-v0) + 3.0*k1*v0*(1.0-v0)*(1.0-v0) + 3.0*k2*v0*v0*(1.0-v0) + k3*v0*v0*v0; // Y(v0)
+	btScalar Yu0 = k0 * (1.0 - u0) * (1.0 - u0) * (1.0 - u0) + 3.0 * k1 * u0 * (1.0 - u0) * (1.0 - u0) + 3.0 * k2 * u0 * u0 * (1.0 - u0) + k3 * u0 * u0 * u0;  // Y(u0)
+	btScalar Yv0 = k0 * (1.0 - v0) * (1.0 - v0) * (1.0 - v0) + 3.0 * k1 * v0 * (1.0 - v0) * (1.0 - v0) + 3.0 * k2 * v0 * v0 * (1.0 - v0) + k3 * v0 * v0 * v0;  // Y(v0)
 
-    btScalar sign_Ytp = (u0 > u1) ? Yu0 : -Yu0;
-    btScalar L = sameSign(sign_Ytp, k0) ? u1 : u0;
-    sign_Ytp = (v0 > v1) ? Yv0 : -Yv0;
-    btScalar K = (sameSign(sign_Ytp,k0)) ? v1 : v0;
-    return diffSign(L,K);
+	btScalar sign_Ytp = (u0 > u1) ? Yu0 : -Yu0;
+	btScalar L = sameSign(sign_Ytp, k0) ? u1 : u0;
+	sign_Ytp = (v0 > v1) ? Yv0 : -Yv0;
+	btScalar K = (sameSign(sign_Ytp, k0)) ? v1 : v0;
+	return diffSign(L, K);
 }
 
 static SIMD_FORCE_INLINE bool signDetermination2Internal(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& j0, const btScalar& j1, const btScalar& j2, const btScalar& u0, const btScalar& u1, const btScalar& v0, const btScalar& v1)
 {
-    btScalar Yu0 = k0*(1.0-u0)*(1.0-u0)*(1.0-u0) + 3.0*k1*u0*(1.0-u0)*(1.0-u0) + 3.0*k2*u0*u0*(1.0-u0) + k3*u0*u0*u0; // Y(u0)
-    btScalar sign_Ytp = (u0 > u1) ? Yu0 : -Yu0, L1, L2;
-    if (diffSign(sign_Ytp,k0))
-    {
-        L1 = u0;
-        L2 = u1;
-    }
-    else
-    {
-        btScalar Yp_u0 = j0*(1.0-u0)*(1.0-u0) + 2.0*j1*(1.0-u0)*u0 + j2*u0*u0;
-        if (sameSign(Yp_u0,j0))
-        {
-            L1 = u1;
-            L2 = u1;
-        }
-        else
-        {
-            L1 = u0;
-            L2 = u0;
-        }
-    }
-    btScalar Yv0 = k0*(1.0-v0)*(1.0-v0)*(1.0-v0) + 3.0*k1*v0*(1.0-v0)*(1.0-v0) + 3.0*k2*v0*v0*(1.0-v0) + k3*v0*v0*v0; // Y(uv0)
-    sign_Ytp = (v0 > v1) ? Yv0 : -Yv0;
-    btScalar K1, K2;
-    if (diffSign(sign_Ytp,k0))
-    {
-        K1 = v0;
-        K2 = v1;
-    }
-    else
-    {
-        btScalar Yp_v0 = j0*(1.0-v0)*(1.0-v0) + 2.0*j1*(1.0-v0)*v0 + j2*v0*v0;
-        if (sameSign(Yp_v0,j0))
-        {
-            K1 = v1;
-            K2 = v1;
-        }
-        else
-        {
-            K1 = v0;
-            K2 = v0;
-        }
-    }
-    return (diffSign(K1, L1) || diffSign(L2, K2));
+	btScalar Yu0 = k0 * (1.0 - u0) * (1.0 - u0) * (1.0 - u0) + 3.0 * k1 * u0 * (1.0 - u0) * (1.0 - u0) + 3.0 * k2 * u0 * u0 * (1.0 - u0) + k3 * u0 * u0 * u0;  // Y(u0)
+	btScalar sign_Ytp = (u0 > u1) ? Yu0 : -Yu0, L1, L2;
+	if (diffSign(sign_Ytp, k0))
+	{
+		L1 = u0;
+		L2 = u1;
+	}
+	else
+	{
+		btScalar Yp_u0 = j0 * (1.0 - u0) * (1.0 - u0) + 2.0 * j1 * (1.0 - u0) * u0 + j2 * u0 * u0;
+		if (sameSign(Yp_u0, j0))
+		{
+			L1 = u1;
+			L2 = u1;
+		}
+		else
+		{
+			L1 = u0;
+			L2 = u0;
+		}
+	}
+	btScalar Yv0 = k0 * (1.0 - v0) * (1.0 - v0) * (1.0 - v0) + 3.0 * k1 * v0 * (1.0 - v0) * (1.0 - v0) + 3.0 * k2 * v0 * v0 * (1.0 - v0) + k3 * v0 * v0 * v0;  // Y(v0)
+	sign_Ytp = (v0 > v1) ? Yv0 : -Yv0;
+	btScalar K1, K2;
+	if (diffSign(sign_Ytp, k0))
+	{
+		K1 = v0;
+		K2 = v1;
+	}
+	else
+	{
+		btScalar Yp_v0 = j0 * (1.0 - v0) * (1.0 - v0) + 2.0 * j1 * (1.0 - v0) * v0 + j2 * v0 * v0;
+		if (sameSign(Yp_v0, j0))
+		{
+			K1 = v1;
+			K2 = v1;
+		}
+		else
+		{
+			K1 = v0;
+			K2 = v0;
+		}
+	}
+	return (diffSign(K1, L1) || diffSign(L2, K2));
 }
 
 static SIMD_FORCE_INLINE bool signDetermination1(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt)
 {
-    btScalar j0, j1, j2, u0, u1, v0, v1;
-    // p1
-    getJs(k0,k1,k2,k3,face->m_n[0], face->m_n[1], face->m_n[2], node, dt, j0, j1, j2);
-    if (nearZero(j0+j2-j1*2.0))
-    {
-        btScalar lt0, lt1;
-        getSigns(true, k0, k1, k2, k3, j0, j2, lt0, lt1);
-        if (lt0 < -SAFE_EPSILON)
-            return false;
-    }
-    else
-    {
-        polyDecomposition(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1);
-        if (!signDetermination1Internal(k0,k1,k2,k3,u0,u1,v0,v1))
-            return false;
-    }
-    // p2
-    getJs(k0,k1,k2,k3,face->m_n[1], face->m_n[2], face->m_n[0], node, dt, j0, j1, j2);
-    if (nearZero(j0+j2-j1*2.0))
-    {
-        btScalar lt0, lt1;
-        getSigns(true, k0, k1, k2, k3, j0, j2, lt0, lt1);
-        if (lt0 < -SAFE_EPSILON)
-            return false;
-    }
-    else
-    {
-        polyDecomposition(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1);
-        if (!signDetermination1Internal(k0,k1,k2,k3,u0,u1,v0,v1))
-            return false;
-    }
-    // p3
-    getJs(k0,k1,k2,k3,face->m_n[2], face->m_n[0], face->m_n[1], node, dt, j0, j1, j2);
-    if (nearZero(j0+j2-j1*2.0))
-    {
-        btScalar lt0, lt1;
-        getSigns(true, k0, k1, k2, k3, j0, j2, lt0, lt1);
-        if (lt0 < -SAFE_EPSILON)
-            return false;
-    }
-    else
-    {
-        polyDecomposition(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1);
-        if (!signDetermination1Internal(k0,k1,k2,k3,u0,u1,v0,v1))
-            return false;
-    }
-    return true;
+	btScalar j0, j1, j2, u0, u1, v0, v1;
+	// p1
+	getJs(k0, k1, k2, k3, face->m_n[0], face->m_n[1], face->m_n[2], node, dt, j0, j1, j2);
+	if (nearZero(j0 + j2 - j1 * 2.0))
+	{
+		btScalar lt0, lt1;
+		getSigns(true, k0, k1, k2, k3, j0, j2, lt0, lt1);
+		if (lt0 < -SAFE_EPSILON)
+			return false;
+	}
+	else
+	{
+		polyDecomposition(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1);
+		if (!signDetermination1Internal(k0, k1, k2, k3, u0, u1, v0, v1))
+			return false;
+	}
+	// p2
+	getJs(k0, k1, k2, k3, face->m_n[1], face->m_n[2], face->m_n[0], node, dt, j0, j1, j2);
+	if (nearZero(j0 + j2 - j1 * 2.0))
+	{
+		btScalar lt0, lt1;
+		getSigns(true, k0, k1, k2, k3, j0, j2, lt0, lt1);
+		if (lt0 < -SAFE_EPSILON)
+			return false;
+	}
+	else
+	{
+		polyDecomposition(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1);
+		if (!signDetermination1Internal(k0, k1, k2, k3, u0, u1, v0, v1))
+			return false;
+	}
+	// p3
+	getJs(k0, k1, k2, k3, face->m_n[2], face->m_n[0], face->m_n[1], node, dt, j0, j1, j2);
+	if (nearZero(j0 + j2 - j1 * 2.0))
+	{
+		btScalar lt0, lt1;
+		getSigns(true, k0, k1, k2, k3, j0, j2, lt0, lt1);
+		if (lt0 < -SAFE_EPSILON)
+			return false;
+	}
+	else
+	{
+		polyDecomposition(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1);
+		if (!signDetermination1Internal(k0, k1, k2, k3, u0, u1, v0, v1))
+			return false;
+	}
+	return true;
 }
 
 static SIMD_FORCE_INLINE bool signDetermination2(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt)
 {
-    btScalar j0, j1, j2, u0, u1, v0, v1;
-    // p1
-    getJs(k0,k1,k2,k3,face->m_n[0], face->m_n[1], face->m_n[2], node, dt, j0, j1, j2);
-    if (nearZero(j0+j2-j1*2.0))
-    {
-        btScalar lt0, lt1;
-        bool bt0 = true, bt1=true;
-        getSigns(false, k0, k1, k2, k3, j0, j2, lt0, lt1);
-        if (lt0 < -SAFE_EPSILON)
-            bt0 = false;
-        if (lt1 < -SAFE_EPSILON)
-            bt1 = false;
-        if (!bt0 && !bt1)
-            return false;
-    }
-    else
-    {
-        polyDecomposition(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1);
-        if (!signDetermination2Internal(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1))
-            return false;
-    }
-    // p2
-    getJs(k0,k1,k2,k3,face->m_n[1], face->m_n[2], face->m_n[0], node, dt, j0, j1, j2);
-    if (nearZero(j0+j2-j1*2.0))
-    {
-        btScalar lt0, lt1;
-        bool bt0=true, bt1=true;
-        getSigns(false, k0, k1, k2, k3, j0, j2, lt0, lt1);
-        if (lt0 < -SAFE_EPSILON)
-            bt0 = false;
-        if (lt1 < -SAFE_EPSILON)
-            bt1 = false;
-        if (!bt0 && !bt1)
-            return false;
-    }
-    else
-    {
-        polyDecomposition(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1);
-        if (!signDetermination2Internal(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1))
-            return false;
-    }
-    // p3
-    getJs(k0,k1,k2,k3,face->m_n[2], face->m_n[0], face->m_n[1], node, dt, j0, j1, j2);
-    if (nearZero(j0+j2-j1*2.0))
-    {
-        btScalar lt0, lt1;
-        bool bt0=true, bt1=true;
-        getSigns(false, k0, k1, k2, k3, j0, j2, lt0, lt1);
-        if (lt0 < -SAFE_EPSILON)
-            bt0 = false;
-        if (lt1 < -SAFE_EPSILON)
-            bt1 = false;
-        if (!bt0 && !bt1)
-            return false;
-    }
-    else
-    {
-        polyDecomposition(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1);
-        if (!signDetermination2Internal(k0,k1,k2,k3,j0,j1,j2,u0,u1,v0,v1))
-            return false;
-    }
-    return true;
+	btScalar j0, j1, j2, u0, u1, v0, v1;
+	// p1
+	getJs(k0, k1, k2, k3, face->m_n[0], face->m_n[1], face->m_n[2], node, dt, j0, j1, j2);
+	if (nearZero(j0 + j2 - j1 * 2.0))
+	{
+		btScalar lt0, lt1;
+		bool bt0 = true, bt1 = true;
+		getSigns(false, k0, k1, k2, k3, j0, j2, lt0, lt1);
+		if (lt0 < -SAFE_EPSILON)
+			bt0 = false;
+		if (lt1 < -SAFE_EPSILON)
+			bt1 = false;
+		if (!bt0 && !bt1)
+			return false;
+	}
+	else
+	{
+		polyDecomposition(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1);
+		if (!signDetermination2Internal(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1))
+			return false;
+	}
+	// p2
+	getJs(k0, k1, k2, k3, face->m_n[1], face->m_n[2], face->m_n[0], node, dt, j0, j1, j2);
+	if (nearZero(j0 + j2 - j1 * 2.0))
+	{
+		btScalar lt0, lt1;
+		bool bt0 = true, bt1 = true;
+		getSigns(false, k0, k1, k2, k3, j0, j2, lt0, lt1);
+		if (lt0 < -SAFE_EPSILON)
+			bt0 = false;
+		if (lt1 < -SAFE_EPSILON)
+			bt1 = false;
+		if (!bt0 && !bt1)
+			return false;
+	}
+	else
+	{
+		polyDecomposition(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1);
+		if (!signDetermination2Internal(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1))
+			return false;
+	}
+	// p3
+	getJs(k0, k1, k2, k3, face->m_n[2], face->m_n[0], face->m_n[1], node, dt, j0, j1, j2);
+	if (nearZero(j0 + j2 - j1 * 2.0))
+	{
+		btScalar lt0, lt1;
+		bool bt0 = true, bt1 = true;
+		getSigns(false, k0, k1, k2, k3, j0, j2, lt0, lt1);
+		if (lt0 < -SAFE_EPSILON)
+			bt0 = false;
+		if (lt1 < -SAFE_EPSILON)
+			bt1 = false;
+		if (!bt0 && !bt1)
+			return false;
+	}
+	else
+	{
+		polyDecomposition(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1);
+		if (!signDetermination2Internal(k0, k1, k2, k3, j0, j1, j2, u0, u1, v0, v1))
+			return false;
+	}
+	return true;
 }
 
 static SIMD_FORCE_INLINE bool coplanarAndInsideTest(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt)
 {
-    // Coplanar test
-    if (diffSign(k1-k0, k3-k2))
-    {
-        // Case b:
-        if (sameSign(k0, k3) && !rootFindingLemma(k0,k1,k2,k3))
-            return false;
-        // inside test
-        return signDetermination2(k0, k1, k2, k3, face, node, dt);
-    }
-    else
-    {
-        // Case c:
-        if (sameSign(k0, k3))
-            return false;
-        // inside test
-        return signDetermination1(k0, k1, k2, k3, face, node, dt);
-    }
-    return false;
+	// Coplanar test
+	if (diffSign(k1 - k0, k3 - k2))
+	{
+		// Case b:
+		if (sameSign(k0, k3) && !rootFindingLemma(k0, k1, k2, k3))
+			return false;
+		// inside test
+		return signDetermination2(k0, k1, k2, k3, face, node, dt);
+	}
+	else
+	{
+		// Case c:
+		if (sameSign(k0, k3))
+			return false;
+		// inside test
+		return signDetermination1(k0, k1, k2, k3, face, node, dt);
+	}
+	return false;
 }
 static SIMD_FORCE_INLINE bool conservativeCulling(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& mrg)
 {
-    if (k0 > mrg && k1 > mrg && k2 > mrg && k3 > mrg)
-        return true;
-    if (k0 < -mrg && k1 < -mrg && k2 < -mrg && k3 < -mrg)
-        return true;
-    return false;
+	if (k0 > mrg && k1 > mrg && k2 > mrg && k3 > mrg)
+		return true;
+	if (k0 < -mrg && k1 < -mrg && k2 < -mrg && k3 < -mrg)
+		return true;
+	return false;
 }
 
 static SIMD_FORCE_INLINE bool bernsteinVFTest(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& mrg, const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt)
 {
-    if (conservativeCulling(k0, k1, k2, k3, mrg))
-        return false;
-    return coplanarAndInsideTest(k0, k1, k2, k3, face, node, dt);
+	if (conservativeCulling(k0, k1, k2, k3, mrg))
+		return false;
+	return coplanarAndInsideTest(k0, k1, k2, k3, face, node, dt);
 }
 
 static SIMD_FORCE_INLINE void deCasteljau(const btScalar& k0, const btScalar& k1, const btScalar& k2, const btScalar& k3, const btScalar& t0, btScalar& k10, btScalar& k20, btScalar& k30, btScalar& k21, btScalar& k12)
 {
-    k10 = k0*(1.0-t0) + k1*t0;
-    btScalar k11 = k1*(1.0-t0) + k2*t0;
-    k12 = k2*(1.0-t0) + k3*t0;
-    k20 = k10*(1.0-t0) + k11*t0;
-    k21 = k11*(1.0-t0) + k12*t0;
-    k30 = k20*(1.0-t0) + k21*t0;
+	k10 = k0 * (1.0 - t0) + k1 * t0;  // level 1: interpolate each adjacent input pair at t0
+	btScalar k11 = k1 * (1.0 - t0) + k2 * t0;  // middle level-1 value; local only, not returned
+	k12 = k2 * (1.0 - t0) + k3 * t0;
+	k20 = k10 * (1.0 - t0) + k11 * t0;  // level 2: interpolate the level-1 values at t0
+	k21 = k11 * (1.0 - t0) + k12 * t0;
+	k30 = k20 * (1.0 - t0) + k21 * t0;  // level 3: single value shared by both halves (k0,k10,k20,k30) and (k30,k21,k12,k3)
 }
 static SIMD_FORCE_INLINE bool bernsteinVFTest(const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt, const btScalar& mrg)
 {
-    btScalar k0, k1, k2, k3;
-    getBernsteinCoeff(face, node, dt, k0, k1, k2, k3);
-    if (conservativeCulling(k0, k1, k2, k3, mrg))
-        return false;
-    return true;
-    if (diffSign(k2-2.0*k1+k0, k3-2.0*k2+k1))
-    {
-        btScalar k10, k20, k30, k21, k12;
-        btScalar t0 = (k2-2.0*k1+k0)/(k0-3.0*k1+3.0*k2-k3);
-        deCasteljau(k0, k1, k2, k3, t0, k10, k20, k30, k21, k12);
-        return bernsteinVFTest(k0, k10, k20, k30, mrg, face, node, dt) || bernsteinVFTest(k30, k21, k12, k3, mrg, face, node, dt);
-    }
-    return coplanarAndInsideTest(k0, k1, k2, k3, face, node, dt);
+	btScalar k0, k1, k2, k3;  // left uninitialised here; filled by getBernsteinCoeff below
+	getBernsteinCoeff(face, node, dt, k0, k1, k2, k3);  // helper not shown in this hunk
+	if (conservativeCulling(k0, k1, k2, k3, mrg))  // all coefficients beyond the margin on one side: no candidate
+		return false;
+	return true;  // NOTE(review): unconditional return; every statement below is unreachable, so only the culling test is applied
+	if (diffSign(k2 - 2.0 * k1 + k0, k3 - 2.0 * k2 + k1))  // (unreachable) second differences of the coefficients differ in sign
+	{
+		btScalar k10, k20, k30, k21, k12;
+		btScalar t0 = (k2 - 2.0 * k1 + k0) / (k0 - 3.0 * k1 + 3.0 * k2 - k3);  // (unreachable) split parameter for the subdivision
+		deCasteljau(k0, k1, k2, k3, t0, k10, k20, k30, k21, k12);
+		return bernsteinVFTest(k0, k10, k20, k30, mrg, face, node, dt) || bernsteinVFTest(k30, k21, k12, k3, mrg, face, node, dt);  // (unreachable) test each half separately
+	}
+	return coplanarAndInsideTest(k0, k1, k2, k3, face, node, dt);  // (unreachable)
 }
 
 static SIMD_FORCE_INLINE bool continuousCollisionDetection(const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt, const btScalar& mrg, btVector3& bary)
 {
-    if (hasSeparatingPlane(face, node, dt))
-        return false;
-    btVector3 x21 = face->m_n[1]->m_x - face->m_n[0]->m_x;
-    btVector3 x31 = face->m_n[2]->m_x - face->m_n[0]->m_x;
-    btVector3 x41 = node->m_x - face->m_n[0]->m_x;
-    btVector3 v21 = face->m_n[1]->m_v - face->m_n[0]->m_v;
-    btVector3 v31 = face->m_n[2]->m_v - face->m_n[0]->m_v;
-    btVector3 v41 = node->m_v - face->m_n[0]->m_v;
-    btVector3 a = x21.cross(x31);
-    btVector3 b = x21.cross(v31) + v21.cross(x31);
-    btVector3 c = v21.cross(v31);
-    btVector3 d = x41;
-    btVector3 e = v41;
-    btScalar a0 = a.dot(d);
-    btScalar a1 = a.dot(e) + b.dot(d);
-    btScalar a2 = c.dot(d) + b.dot(e);
-    btScalar a3 = c.dot(e);
-    btScalar eps = SAFE_EPSILON;
-    int num_roots = 0;
-    btScalar roots[3];
-    if (std::abs(a3) < eps)
-    {
-        // cubic term is zero
-        if (std::abs(a2) < eps)
-        {
-            if (std::abs(a1) < eps)
-            {
-                if (std::abs(a0) < eps)
-                {
-                    num_roots = 2;
-                    roots[0] = 0;
-                    roots[1] = dt;
-                }
-            }
-            else
-            {
-                num_roots = 1;
-                roots[0] = -a0/a1;
-            }
-        }
-        else
-        {
-            num_roots = SolveP2(roots, a1/a2, a0/a2);
-        }
-    }
-    else
-    {
-        num_roots = SolveP3(roots, a2/a3, a1/a3, a0/a3);
-    }
-//    std::sort(roots, roots+num_roots);
-    if (num_roots > 1)
-    {
-        if (roots[0] > roots[1])
-            btSwap(roots[0], roots[1]);
-    }
-    if (num_roots > 2)
-    {
-        if (roots[0] > roots[2])
-            btSwap(roots[0], roots[2]);
-        if (roots[1] > roots[2])
-            btSwap(roots[1], roots[2]);
-    }
-    for (int r = 0; r < num_roots; ++r)
-    {
-        double root = roots[r];
-        if (root <= 0)
-            continue;
-        if (root > dt + SIMD_EPSILON)
-            return false;
-        btVector3 x1 = face->m_n[0]->m_x + root * face->m_n[0]->m_v;
-        btVector3 x2 = face->m_n[1]->m_x + root * face->m_n[1]->m_v;
-        btVector3 x3 = face->m_n[2]->m_x + root * face->m_n[2]->m_v;
-        btVector3 x4 = node->m_x + root * node->m_v;
-        btVector3 normal = (x2-x1).cross(x3-x1);
-        normal.safeNormalize();
-        if (proximityTest(x1, x2, x3, x4, normal, mrg, bary))
-            return true;
-    }
-    return false;
+	if (hasSeparatingPlane(face, node, dt))  // early out; helper is not shown in this hunk
+		return false;
+	btVector3 x21 = face->m_n[1]->m_x - face->m_n[0]->m_x;  // positions relative to face vertex 0
+	btVector3 x31 = face->m_n[2]->m_x - face->m_n[0]->m_x;
+	btVector3 x41 = node->m_x - face->m_n[0]->m_x;
+	btVector3 v21 = face->m_n[1]->m_v - face->m_n[0]->m_v;  // velocities relative to face vertex 0
+	btVector3 v31 = face->m_n[2]->m_v - face->m_n[0]->m_v;
+	btVector3 v41 = node->m_v - face->m_n[0]->m_v;
+	btVector3 a = x21.cross(x31);  // (x21 + t*v21) x (x31 + t*v31) expands to a + b*t + c*t^2
+	btVector3 b = x21.cross(v31) + v21.cross(x31);
+	btVector3 c = v21.cross(v31);
+	btVector3 d = x41;  // x41 + t*v41 is d + e*t
+	btVector3 e = v41;
+	btScalar a0 = a.dot(d);  // a0..a3: coefficients of (a + b*t + c*t^2) . (d + e*t) = a0 + a1*t + a2*t^2 + a3*t^3
+	btScalar a1 = a.dot(e) + b.dot(d);
+	btScalar a2 = c.dot(d) + b.dot(e);
+	btScalar a3 = c.dot(e);
+	btScalar eps = SAFE_EPSILON;  // magnitude below which a coefficient is treated as zero
+	int num_roots = 0;
+	btScalar roots[3];  // only the first num_roots entries are read below
+	if (std::abs(a3) < eps)
+	{
+		// cubic term is zero
+		if (std::abs(a2) < eps)  // quadratic term is zero as well
+		{
+			if (std::abs(a1) < eps)  // linear term is zero as well
+			{
+				if (std::abs(a0) < eps)  // every coefficient is ~0; otherwise num_roots stays 0
+				{
+					num_roots = 2;
+					roots[0] = 0;  // use both ends of the step as candidate times
+					roots[1] = dt;
+				}
+			}
+			else
+			{
+				num_roots = 1;
+				roots[0] = -a0 / a1;  // root of a0 + a1*t = 0
+			}
+		}
+		else
+		{
+			num_roots = SolveP2(roots, a1 / a2, a0 / a2);  // presumably solves the monic quadratic and returns the root count — TODO confirm against SolveP2
+		}
+	}
+	else
+	{
+		num_roots = SolveP3(roots, a2 / a3, a1 / a3, a0 / a3);  // presumably solves the monic cubic and returns the root count — TODO confirm against SolveP3
+	}
+	// Sort the (at most three) roots ascending with explicit compare-and-swaps instead of std::sort(roots, roots + num_roots).
+	if (num_roots > 1)
+	{
+		if (roots[0] > roots[1])
+			btSwap(roots[0], roots[1]);
+	}
+	if (num_roots > 2)
+	{
+		if (roots[0] > roots[2])
+			btSwap(roots[0], roots[2]);
+		if (roots[1] > roots[2])
+			btSwap(roots[1], roots[2]);
+	}
+	for (int r = 0; r < num_roots; ++r)  // visit candidate times from smallest to largest
+	{
+		double root = roots[r];
+		if (root <= 0)  // skip non-positive times
+			continue;
+		if (root > dt + SIMD_EPSILON)  // past the end of the step; roots are sorted, so the remaining ones are too
+			return false;
+		btVector3 x1 = face->m_n[0]->m_x + root * face->m_n[0]->m_v;  // advance face vertices and node linearly to time root
+		btVector3 x2 = face->m_n[1]->m_x + root * face->m_n[1]->m_v;
+		btVector3 x3 = face->m_n[2]->m_x + root * face->m_n[2]->m_v;
+		btVector3 x4 = node->m_x + root * node->m_v;
+		btVector3 normal = (x2 - x1).cross(x3 - x1);  // face normal at time root
+		normal.safeNormalize();
+		if (proximityTest(x1, x2, x3, x4, normal, mrg, bary))  // bary is passed through by reference; helper not shown in this hunk
+			return true;
+	}
+	return false;  // no candidate time in (0, dt] passed the proximity test
 }
 static SIMD_FORCE_INLINE bool bernsteinCCD(const btSoftBody::Face* face, const btSoftBody::Node* node, const btScalar& dt, const btScalar& mrg, btVector3& bary)
 {
-    if (!bernsteinVFTest(face, node, dt, mrg))
-        return false;
-    if (!continuousCollisionDetection(face, node, dt, 1e-6, bary))
-        return false;
-    return true;
+	if (!bernsteinVFTest(face, node, dt, mrg))  // first filter, using the caller's margin
+		return false;
+	if (!continuousCollisionDetection(face, node, dt, 1e-6, bary))  // NOTE(review): margin passed here is the literal 1e-6, not mrg — confirm intended
+		return false;
+	return true;  // both tests passed; bary was handed to continuousCollisionDetection by reference
 }
 
 //
@@ -902,62 +906,61 @@ static inline btMatrix3x3 Diagonal(btScalar x)
 
 static inline btMatrix3x3 Diagonal(const btVector3& v)
 {
-    btMatrix3x3 m;
-    m[0] = btVector3(v.getX(), 0, 0);
-    m[1] = btVector3(0, v.getY(), 0);
-    m[2] = btVector3(0, 0, v.getZ());
-    return (m);
-}
-
-static inline btScalar Dot(const btScalar* a,const btScalar* b, int ndof)
-{
-    btScalar result = 0;
-    for (int i = 0; i < ndof; ++i)
-        result += a[i] * b[i];
-    return result;
-}
-
-static inline btMatrix3x3 OuterProduct(const btScalar* v1,const btScalar* v2,const btScalar* v3,
-                                       const btScalar* u1, const btScalar* u2, const btScalar* u3, int ndof)
-{
-    btMatrix3x3 m;
-    btScalar a11 = Dot(v1,u1,ndof);
-    btScalar a12 = Dot(v1,u2,ndof);
-    btScalar a13 = Dot(v1,u3,ndof);
-    
-    btScalar a21 = Dot(v2,u1,ndof);
-    btScalar a22 = Dot(v2,u2,ndof);
-    btScalar a23 = Dot(v2,u3,ndof);
-    
-    btScalar a31 = Dot(v3,u1,ndof);
-    btScalar a32 = Dot(v3,u2,ndof);
-    btScalar a33 = Dot(v3,u3,ndof);
-    m[0] = btVector3(a11, a12, a13);
-    m[1] = btVector3(a21, a22, a23);
-    m[2] = btVector3(a31, a32, a33);
-    return (m);
-}
-
-static inline btMatrix3x3 OuterProduct(const btVector3& v1,const btVector3& v2)
-{
-    btMatrix3x3 m;
-    btScalar a11 = v1[0] * v2[0];
-    btScalar a12 = v1[0] * v2[1];
-    btScalar a13 = v1[0] * v2[2];
-    
-    btScalar a21 = v1[1] * v2[0];
-    btScalar a22 = v1[1] * v2[1];
-    btScalar a23 = v1[1] * v2[2];
-    
-    btScalar a31 = v1[2] * v2[0];
-    btScalar a32 = v1[2] * v2[1];
-    btScalar a33 = v1[2] * v2[2];
-    m[0] = btVector3(a11, a12, a13);
-    m[1] = btVector3(a21, a22, a23);
-    m[2] = btVector3(a31, a32, a33);
-    return (m);
+	btMatrix3x3 m;
+	m[0] = btVector3(v.getX(), 0, 0);  // each row carries one component of v on the diagonal, zeros elsewhere
+	m[1] = btVector3(0, v.getY(), 0);
+	m[2] = btVector3(0, 0, v.getZ());
+	return (m);
+}
+
+static inline btScalar Dot(const btScalar* a, const btScalar* b, int ndof)
+{
+	btScalar result = 0;
+	for (int i = 0; i < ndof; ++i)  // accumulate a[i] * b[i] over the first ndof entries of both arrays
+		result += a[i] * b[i];
+	return result;  // 0 when ndof <= 0 (loop body never runs)
 }
 
+static inline btMatrix3x3 OuterProduct(const btScalar* v1, const btScalar* v2, const btScalar* v3,
+									   const btScalar* u1, const btScalar* u2, const btScalar* u3, int ndof)
+{
+	btMatrix3x3 m;
+	btScalar a11 = Dot(v1, u1, ndof);  // entry (i, j) is the ndof-length dot product of v_i with u_j
+	btScalar a12 = Dot(v1, u2, ndof);
+	btScalar a13 = Dot(v1, u3, ndof);
+
+	btScalar a21 = Dot(v2, u1, ndof);
+	btScalar a22 = Dot(v2, u2, ndof);
+	btScalar a23 = Dot(v2, u3, ndof);
+
+	btScalar a31 = Dot(v3, u1, ndof);
+	btScalar a32 = Dot(v3, u2, ndof);
+	btScalar a33 = Dot(v3, u3, ndof);
+	m[0] = btVector3(a11, a12, a13);  // row i collects the products that use v_i
+	m[1] = btVector3(a21, a22, a23);
+	m[2] = btVector3(a31, a32, a33);
+	return (m);
+}
+
+static inline btMatrix3x3 OuterProduct(const btVector3& v1, const btVector3& v2)
+{
+	btMatrix3x3 m;
+	btScalar a11 = v1[0] * v2[0];  // entry (i, j) is v1[i] * v2[j]
+	btScalar a12 = v1[0] * v2[1];
+	btScalar a13 = v1[0] * v2[2];
+
+	btScalar a21 = v1[1] * v2[0];
+	btScalar a22 = v1[1] * v2[1];
+	btScalar a23 = v1[1] * v2[2];
+
+	btScalar a31 = v1[2] * v2[0];
+	btScalar a32 = v1[2] * v2[1];
+	btScalar a33 = v1[2] * v2[2];
+	m[0] = btVector3(a11, a12, a13);  // row i is v1[i] times the components of v2
+	m[1] = btVector3(a21, a22, a23);
+	m[2] = btVector3(a31, a32, a33);
+	return (m);
+}
 
 //
 static inline btMatrix3x3 Add(const btMatrix3x3& a,
@@ -1008,6 +1011,20 @@ static inline btMatrix3x3 ImpulseMatrix(btScalar dt,
 }
 
 //
+static inline btMatrix3x3 ImpulseMatrix(btScalar dt,
+										const btMatrix3x3& effective_mass_inv,
+										btScalar imb,
+										const btMatrix3x3& iwi,
+										const btVector3& r)
+{
+	return (Diagonal(1 / dt) * Add(effective_mass_inv, MassMatrix(imb, iwi, r)).inverse());  // (1/dt) * (effective_mass_inv + MassMatrix(imb, iwi, r))^-1; the lines below are a disabled alternative
+	//    btMatrix3x3 iimb = MassMatrix(imb, iwi, r);
+	//    if (iimb.determinant() == 0)
+	//        return effective_mass_inv.inverse();
+	//    return effective_mass_inv.inverse() *  Add(effective_mass_inv.inverse(), iimb.inverse()).inverse() * iimb.inverse();
+}
+
+//
 static inline btMatrix3x3 ImpulseMatrix(btScalar ima, const btMatrix3x3& iia, const btVector3& ra,
 										btScalar imb, const btMatrix3x3& iib, const btVector3& rb)
 {
@@ -1091,73 +1108,70 @@ static inline void ProjectOrigin(const btVector3& a,
 //
 static inline bool rayIntersectsTriangle(const btVector3& origin, const btVector3& dir, const btVector3& v0, const btVector3& v1, const btVector3& v2, btScalar& t)
 {
-    btScalar a, f, u, v;
-    
-    btVector3 e1 = v1 - v0;
-    btVector3 e2 = v2 - v0;
-    btVector3 h = dir.cross(e2);
-    a = e1.dot(h);
-    
-    if (a > -0.00001 && a < 0.00001)
-        return (false);
-    
-    f = btScalar(1) / a;
-    btVector3 s = origin - v0;
-    u = f * s.dot(h);
-    
-    if (u < 0.0 || u > 1.0)
-        return (false);
-    
-    btVector3 q = s.cross(e1);
-    v = f * dir.dot(q);
-    if (v < 0.0 || u + v > 1.0)
-        return (false);
-    // at this stage we can compute t to find out where
-    // the intersection point is on the line
-    t = f * e2.dot(q);
-    if (t > 0)  // ray intersection
-        return (true);
-    else  // this means that there is a line intersection
-        // but not a ray intersection
-        return (false);
+	btScalar a, f, u, v;
+
+	btVector3 e1 = v1 - v0;  // triangle edges sharing vertex v0
+	btVector3 e2 = v2 - v0;
+	btVector3 h = dir.cross(e2);
+	a = e1.dot(h);  // triple product of e1, dir and e2
+
+	if (a > -0.00001 && a < 0.00001)  // |a| below 1e-5: treated as no intersection; t is left unwritten
+		return (false);
+
+	f = btScalar(1) / a;
+	btVector3 s = origin - v0;
+	u = f * s.dot(h);
+
+	if (u < 0.0 || u > 1.0)  // u outside [0, 1]: reject, t is left unwritten
+		return (false);
+
+	btVector3 q = s.cross(e1);
+	v = f * dir.dot(q);
+	if (v < 0.0 || u + v > 1.0)  // v negative or u + v above 1: reject, t is left unwritten
+		return (false);
+	// at this stage we can compute t to find out where
+	// the intersection point is on the line
+	t = f * e2.dot(q);  // t is written even when the function then returns false
+	if (t > 0)  // ray intersection
+		return (true);
+	else  // this means that there is a line intersection
+		// but not a ray intersection
+		return (false);
 }
 
 static inline bool lineIntersectsTriangle(const btVector3& rayStart, const btVector3& rayEnd, const btVector3& p1, const btVector3& p2, const btVector3& p3, btVector3& sect, btVector3& normal)
 {
-    btVector3 dir = rayEnd - rayStart;
-    btScalar dir_norm = dir.norm();
-    if (dir_norm < SIMD_EPSILON)
-        return false;
-    dir.normalize();
-
-    btScalar t;
-    
-    bool ret = rayIntersectsTriangle(rayStart, dir, p1, p2, p3, t);
-    
-    if (ret)
-    {
-        if (t <= dir_norm)
-        {
-            sect = rayStart + dir * t;
-        }
-        else
-        {
-            ret = false;
-        }
-    }
-    
-    if (ret)
-    {
-        btVector3 n = (p3-p1).cross(p2-p1);
-        n.safeNormalize();
-        if (n.dot(dir) < 0)
-            normal = n;
-        else
-            normal = -n;
-    }
-    return ret;
-}
+	btVector3 dir = rayEnd - rayStart;
+	btScalar dir_norm = dir.norm();  // segment length, kept to bound t below
+	if (dir_norm < SIMD_EPSILON)  // start and end (nearly) coincide: no usable direction
+		return false;
+	dir.normalize();
+	btScalar t;  // only read when rayIntersectsTriangle returns true
+	bool ret = rayIntersectsTriangle(rayStart, dir, p1, p2, p3, t);
+
+	if (ret)
+	{
+		if (t <= dir_norm)  // hit lies no farther than rayEnd
+		{
+			sect = rayStart + dir * t;
+		}
+		else
+		{
+			ret = false;  // the ray hits the triangle, but beyond the end of the segment
+		}
+	}
 
+	if (ret)  // sect and normal are written only when the result is true
+	{
+		btVector3 n = (p3 - p1).cross(p2 - p1);
+		n.safeNormalize();
+		if (n.dot(dir) < 0)  // pick the sign so that normal.dot(dir) <= 0
+			normal = n;
+		else
+			normal = -n;
+	}
+	return ret;
+}
 
 //
 template <typename T>
@@ -1586,57 +1600,57 @@ struct btSoftColliders
 			psa->m_cdbvt.collideTT(psa->m_cdbvt.m_root, psb->m_cdbvt.m_root, *this);
 		}
 	};
-    //
-    // CollideSDF_RS
-    //
-    struct CollideSDF_RS : btDbvt::ICollide
-    {
-        void Process(const btDbvtNode* leaf)
-        {
-            btSoftBody::Node* node = (btSoftBody::Node*)leaf->data;
-            DoNode(*node);
-        }
-        void DoNode(btSoftBody::Node& n) const
-        {
-            const btScalar m = n.m_im > 0 ? dynmargin : stamargin;
-            btSoftBody::RContact c;
-            
-            if ((!n.m_battach) &&
-                psb->checkContact(m_colObj1Wrap, n.m_x, m, c.m_cti))
-            {
-                const btScalar ima = n.m_im;
-                const btScalar imb = m_rigidBody ? m_rigidBody->getInvMass() : 0.f;
-                const btScalar ms = ima + imb;
-                if (ms > 0)
-                {
-                    const btTransform& wtr = m_rigidBody ? m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
-                    static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);
-                    const btMatrix3x3& iwi = m_rigidBody ? m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
-                    const btVector3 ra = n.m_x - wtr.getOrigin();
-                    const btVector3 va = m_rigidBody ? m_rigidBody->getVelocityInLocalPoint(ra) * psb->m_sst.sdt : btVector3(0, 0, 0);
-                    const btVector3 vb = n.m_x - n.m_q;
-                    const btVector3 vr = vb - va;
-                    const btScalar dn = btDot(vr, c.m_cti.m_normal);
-                    const btVector3 fv = vr - c.m_cti.m_normal * dn;
-                    const btScalar fc = psb->m_cfg.kDF * m_colObj1Wrap->getCollisionObject()->getFriction();
-                    c.m_node = &n;
-                    c.m_c0 = ImpulseMatrix(psb->m_sst.sdt, ima, imb, iwi, ra);
-                    c.m_c1 = ra;
-                    c.m_c2 = ima * psb->m_sst.sdt;
-                    c.m_c3 = fv.length2() < (dn * fc * dn * fc) ? 0 : 1 - fc;
-                    c.m_c4 = m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject() ? psb->m_cfg.kKHR : psb->m_cfg.kCHR;
-                    psb->m_rcontacts.push_back(c);
-                    if (m_rigidBody)
-                        m_rigidBody->activate();
-                }
-            }
-        }
-        btSoftBody* psb;
-        const btCollisionObjectWrapper* m_colObj1Wrap;
-        btRigidBody* m_rigidBody;
-        btScalar dynmargin;
-        btScalar stamargin;
-    };
+	//
+	// CollideSDF_RS: btDbvt::ICollide callback; each leaf's node is tested with psb->checkContact and a hit is appended to psb->m_rcontacts
+	//
+	struct CollideSDF_RS : btDbvt::ICollide
+	{
+		void Process(const btDbvtNode* leaf)
+		{
+			btSoftBody::Node* node = (btSoftBody::Node*)leaf->data;  // leaf payload is used as a soft-body node
+			DoNode(*node);
+		}
+		void DoNode(btSoftBody::Node& n) const
+		{
+			const btScalar m = n.m_im > 0 ? dynmargin : stamargin;  // margin: dynmargin for positive inverse mass, stamargin otherwise
+			btSoftBody::RContact c;
+
+			if ((!n.m_battach) &&  // nodes with m_battach set are skipped
+				psb->checkContact(m_colObj1Wrap, n.m_x, m, c.m_cti))  // c.m_cti is handed to checkContact; m_normal is read from it below
+			{
+				const btScalar ima = n.m_im;
+				const btScalar imb = m_rigidBody ? m_rigidBody->getInvMass() : 0.f;  // 0 when there is no rigid body
+				const btScalar ms = ima + imb;
+				if (ms > 0)  // no contact is recorded unless the summed inverse mass is positive
+				{
+					const btTransform& wtr = m_rigidBody ? m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
+					static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);  // all-zero matrix used when there is no rigid body
+					const btMatrix3x3& iwi = m_rigidBody ? m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
+					const btVector3 ra = n.m_x - wtr.getOrigin();  // node position relative to the transform origin
+					const btVector3 va = m_rigidBody ? m_rigidBody->getVelocityInLocalPoint(ra) * psb->m_sst.sdt : btVector3(0, 0, 0);  // rigid-body velocity at ra scaled by sdt, or zero
+					const btVector3 vb = n.m_x - n.m_q;
+					const btVector3 vr = vb - va;
+					const btScalar dn = btDot(vr, c.m_cti.m_normal);  // component of vr along the contact normal
+					const btVector3 fv = vr - c.m_cti.m_normal * dn;  // remainder of vr after removing the normal component
+					const btScalar fc = psb->m_cfg.kDF * m_colObj1Wrap->getCollisionObject()->getFriction();
+					c.m_node = &n;
+					c.m_c0 = ImpulseMatrix(psb->m_sst.sdt, ima, imb, iwi, ra);
+					c.m_c1 = ra;
+					c.m_c2 = ima * psb->m_sst.sdt;
+					c.m_c3 = fv.length2() < (dn * fc * dn * fc) ? 0 : 1 - fc;  // 0 when |fv|^2 < (dn * fc)^2, otherwise 1 - fc
+					c.m_c4 = m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject() ? psb->m_cfg.kKHR : psb->m_cfg.kCHR;  // kKHR for static/kinematic objects, kCHR otherwise
+					psb->m_rcontacts.push_back(c);
+					if (m_rigidBody)
+						m_rigidBody->activate();
+				}
+			}
+		}
+		btSoftBody* psb;
+		const btCollisionObjectWrapper* m_colObj1Wrap;
+		btRigidBody* m_rigidBody;  // may be null; every use above is guarded
+		btScalar dynmargin;  // margin used for nodes with m_im > 0
+		btScalar stamargin;  // margin used for nodes with m_im <= 0
+	};
 
 	//
 	// CollideSDF_RD
@@ -1654,72 +1668,74 @@ struct btSoftColliders
 			btSoftBody::DeformableNodeRigidContact c;
 
 			if (!n.m_battach)
-            {
+			{
 				// check for collision at x_{n+1}^*
 				if (psb->checkDeformableContact(m_colObj1Wrap, n.m_q, m, c.m_cti, /*predict = */ true))
-                {
-                    const btScalar ima = n.m_im;
-                    // todo: collision between multibody and fixed deformable node will be missed.
-                    const btScalar imb = m_rigidBody ? m_rigidBody->getInvMass() : 0.f;
-                    const btScalar ms = ima + imb;
-                    if (ms > 0)
-                    {
-                        // resolve contact at x_n
-                        psb->checkDeformableContact(m_colObj1Wrap, n.m_x, m, c.m_cti, /*predict = */ false);
-                        btSoftBody::sCti& cti = c.m_cti;
-                        c.m_node = &n;
-                        const btScalar fc = psb->m_cfg.kDF * m_colObj1Wrap->getCollisionObject()->getFriction();
-                        c.m_c2 = ima;
-                        c.m_c3 = fc;
-                        c.m_c4 = m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject() ? psb->m_cfg.kKHR : psb->m_cfg.kCHR;
-                        
-                        if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
-                        {
-                            const btTransform& wtr = m_rigidBody ? m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
-                            static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);
-                            const btMatrix3x3& iwi = m_rigidBody ? m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
-                            const btVector3 ra = n.m_x - wtr.getOrigin();
-                            
-                            c.m_c0 = ImpulseMatrix(1, ima, imb, iwi, ra);
-                            c.m_c1 = ra;
-                        }
-                        else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
-                        {
-                            btMultiBodyLinkCollider* multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
-                            if (multibodyLinkCol)
-                            {
-                                btVector3 normal = cti.m_normal;
-                                btVector3 t1 = generateUnitOrthogonalVector(normal);
-                                btVector3 t2 = btCross(normal, t1);
-                                btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;
-                                findJacobian(multibodyLinkCol, jacobianData_normal, c.m_node->m_x, normal);
-                                findJacobian(multibodyLinkCol, jacobianData_t1, c.m_node->m_x, t1);
-                                findJacobian(multibodyLinkCol, jacobianData_t2, c.m_node->m_x, t2);
-                                
-                                btScalar* J_n = &jacobianData_normal.m_jacobians[0];
-                                btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
-                                btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
-                                
-                                btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
-                                btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
-                                btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
-                                
-                                btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
-                                                t1.getX(), t1.getY(), t1.getZ(),
-                                                t2.getX(), t2.getY(), t2.getZ()); // world frame to local frame
-                                const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
-                                btMatrix3x3 local_impulse_matrix = (Diagonal(n.m_im) + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
-                                c.m_c0 =  rot.transpose() * local_impulse_matrix * rot;
-                                c.jacobianData_normal = jacobianData_normal;
-                                c.jacobianData_t1 = jacobianData_t1;
-                                c.jacobianData_t2 = jacobianData_t2;
-                                c.t1 = t1;
-                                c.t2 = t2;
-                            }
-                        }
-                        psb->m_nodeRigidContacts.push_back(c);
-                    }
-                }
+				{
+					const btScalar ima = n.m_im;
+					// todo: collision between multibody and fixed deformable node will be missed.
+					const btScalar imb = m_rigidBody ? m_rigidBody->getInvMass() : 0.f;
+					const btScalar ms = ima + imb;
+					if (ms > 0)
+					{
+						// resolve contact at x_n
+						psb->checkDeformableContact(m_colObj1Wrap, n.m_x, m, c.m_cti, /*predict = */ false);
+						btSoftBody::sCti& cti = c.m_cti;
+						c.m_node = &n;
+						const btScalar fc = psb->m_cfg.kDF * m_colObj1Wrap->getCollisionObject()->getFriction();
+						c.m_c2 = ima;
+						c.m_c3 = fc;
+						c.m_c4 = m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject() ? psb->m_cfg.kKHR : psb->m_cfg.kCHR;
+						c.m_c5 = n.m_effectiveMass_inv;
+
+						if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+						{
+							const btTransform& wtr = m_rigidBody ? m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
+							static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);
+							const btMatrix3x3& iwi = m_rigidBody ? m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
+							const btVector3 ra = n.m_x - wtr.getOrigin();
+
+							c.m_c0 = ImpulseMatrix(1, n.m_effectiveMass_inv, imb, iwi, ra);
+							//                            c.m_c0 = ImpulseMatrix(1, ima, imb, iwi, ra);
+							c.m_c1 = ra;
+						}
+						else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+						{
+							btMultiBodyLinkCollider* multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+							if (multibodyLinkCol)
+							{
+								btVector3 normal = cti.m_normal;
+								btVector3 t1 = generateUnitOrthogonalVector(normal);
+								btVector3 t2 = btCross(normal, t1);
+								btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;
+								findJacobian(multibodyLinkCol, jacobianData_normal, c.m_node->m_x, normal);
+								findJacobian(multibodyLinkCol, jacobianData_t1, c.m_node->m_x, t1);
+								findJacobian(multibodyLinkCol, jacobianData_t2, c.m_node->m_x, t2);
+
+								btScalar* J_n = &jacobianData_normal.m_jacobians[0];
+								btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
+								btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
+
+								btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
+								btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
+								btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
+
+								btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
+												t1.getX(), t1.getY(), t1.getZ(),
+												t2.getX(), t2.getY(), t2.getZ());  // world frame to local frame
+								const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
+								btMatrix3x3 local_impulse_matrix = (n.m_effectiveMass_inv + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
+								c.m_c0 = rot.transpose() * local_impulse_matrix * rot;
+								c.jacobianData_normal = jacobianData_normal;
+								c.jacobianData_t1 = jacobianData_t1;
+								c.jacobianData_t2 = jacobianData_t2;
+								c.t1 = t1;
+								c.t2 = t2;
+							}
+						}
+						psb->m_nodeRigidContacts.push_back(c);
+					}
+				}
 			}
 		}
 		btSoftBody* psb;
@@ -1728,112 +1744,111 @@ struct btSoftColliders
 		btScalar dynmargin;
 		btScalar stamargin;
 	};
-    
-    //
-    // CollideSDF_RDF
-    //
-    struct CollideSDF_RDF : btDbvt::ICollide
-    {
-        void Process(const btDbvtNode* leaf)
-        {
-            btSoftBody::Face* face = (btSoftBody::Face*)leaf->data;
-            DoNode(*face);
-        }
-        void DoNode(btSoftBody::Face& f) const
-        {
-            btSoftBody::Node* n0 = f.m_n[0];
-            btSoftBody::Node* n1 = f.m_n[1];
-            btSoftBody::Node* n2 = f.m_n[2];
-            const btScalar m = (n0->m_im > 0 && n1->m_im > 0 && n2->m_im > 0 )? dynmargin : stamargin;
-            btSoftBody::DeformableFaceRigidContact c;
-            btVector3 contact_point;
-            btVector3 bary;
-            if (psb->checkDeformableFaceContact(m_colObj1Wrap, f, contact_point, bary, m, c.m_cti, true))
-            {
-                f.m_pcontact[3] = 1;
-                btScalar ima = n0->m_im + n1->m_im + n2->m_im;
-                const btScalar imb = m_rigidBody ? m_rigidBody->getInvMass() : 0.f;
-                // todo: collision between multibody and fixed deformable face will be missed.
-                const btScalar ms = ima + imb;
-                if (ms > 0)
-                {
-                    // resolve contact at x_n
-//                    psb->checkDeformableFaceContact(m_colObj1Wrap, f, contact_point, bary, m, c.m_cti, /*predict = */ false);
-                    btSoftBody::sCti& cti = c.m_cti;
-                    c.m_contactPoint = contact_point;
-                    c.m_bary = bary;
-                    // todo xuchenhan@: this is assuming mass of all vertices are the same. Need to modify if mass are different for distinct vertices
-                    c.m_weights = btScalar(2)/(btScalar(1) + bary.length2()) * bary;
-                    c.m_face = &f;
+
+	//
+	// CollideSDF_RDF
+	//
+	struct CollideSDF_RDF : btDbvt::ICollide
+	{
+		void Process(const btDbvtNode* leaf)
+		{
+			btSoftBody::Face* face = (btSoftBody::Face*)leaf->data;
+			DoNode(*face);
+		}
+		void DoNode(btSoftBody::Face& f) const
+		{
+			btSoftBody::Node* n0 = f.m_n[0];
+			btSoftBody::Node* n1 = f.m_n[1];
+			btSoftBody::Node* n2 = f.m_n[2];
+			const btScalar m = (n0->m_im > 0 && n1->m_im > 0 && n2->m_im > 0) ? dynmargin : stamargin;
+			btSoftBody::DeformableFaceRigidContact c;
+			btVector3 contact_point;
+			btVector3 bary;
+			if (psb->checkDeformableFaceContact(m_colObj1Wrap, f, contact_point, bary, m, c.m_cti, true))
+			{
+				btScalar ima = n0->m_im + n1->m_im + n2->m_im;
+				const btScalar imb = m_rigidBody ? m_rigidBody->getInvMass() : 0.f;
+				// todo: collision between multibody and fixed deformable face will be missed.
+				const btScalar ms = ima + imb;
+				if (ms > 0)
+				{
+					// resolve contact at x_n
+					//                    psb->checkDeformableFaceContact(m_colObj1Wrap, f, contact_point, bary, m, c.m_cti, /*predict = */ false);
+					btSoftBody::sCti& cti = c.m_cti;
+					c.m_contactPoint = contact_point;
+					c.m_bary = bary;
+					// todo xuchenhan@: this is assuming mass of all vertices are the same. Need to modify if mass are different for distinct vertices
+					c.m_weights = btScalar(2) / (btScalar(1) + bary.length2()) * bary;
+					c.m_face = &f;
 					// friction is handled by the nodes to prevent sticking
-//                    const btScalar fc = 0;
-                    const btScalar fc = psb->m_cfg.kDF * m_colObj1Wrap->getCollisionObject()->getFriction();
-                    
-                    // the effective inverse mass of the face as in https://graphics.stanford.edu/papers/cloth-sig02/cloth.pdf
-                    ima = bary.getX()*c.m_weights.getX() * n0->m_im + bary.getY()*c.m_weights.getY() * n1->m_im + bary.getZ()*c.m_weights.getZ() * n2->m_im;
-                    c.m_c2 = ima;
-                    c.m_c3 = fc;
-                    c.m_c4 = m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject() ? psb->m_cfg.kKHR : psb->m_cfg.kCHR;
-                    if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
-                    {
-                        const btTransform& wtr = m_rigidBody ? m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
-                        static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);
-                        const btMatrix3x3& iwi = m_rigidBody ? m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
-                        const btVector3 ra = contact_point - wtr.getOrigin();
-                        
-                        // we do not scale the impulse matrix by dt
-                        c.m_c0 = ImpulseMatrix(1, ima, imb, iwi, ra);
-                        c.m_c1 = ra;
-                    }
-                    else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
-                    {
-                        btMultiBodyLinkCollider* multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
-                        if (multibodyLinkCol)
-                        {
-                            btVector3 normal = cti.m_normal;
-                            btVector3 t1 = generateUnitOrthogonalVector(normal);
-                            btVector3 t2 = btCross(normal, t1);
-                            btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;
-                            findJacobian(multibodyLinkCol, jacobianData_normal, contact_point, normal);
-                            findJacobian(multibodyLinkCol, jacobianData_t1, contact_point, t1);
-                            findJacobian(multibodyLinkCol, jacobianData_t2, contact_point, t2);
-                            
-                            btScalar* J_n = &jacobianData_normal.m_jacobians[0];
-                            btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
-                            btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
-                            
-                            btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
-                            btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
-                            btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
-                            
-                            btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
-                                            t1.getX(), t1.getY(), t1.getZ(),
-                                            t2.getX(), t2.getY(), t2.getZ()); // world frame to local frame
-                            const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
-                            btMatrix3x3 local_impulse_matrix = (Diagonal(ima) + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
-                            c.m_c0 =  rot.transpose() * local_impulse_matrix * rot;
-                            c.jacobianData_normal = jacobianData_normal;
-                            c.jacobianData_t1 = jacobianData_t1;
-                            c.jacobianData_t2 = jacobianData_t2;
-                            c.t1 = t1;
-                            c.t2 = t2;
-                        }
-                    }
-                    psb->m_faceRigidContacts.push_back(c);
-                }
-            }
-            else
-            {
-                f.m_pcontact[3] = 0;
-            }
-        }
-        btSoftBody* psb;
-        const btCollisionObjectWrapper* m_colObj1Wrap;
-        btRigidBody* m_rigidBody;
-        btScalar dynmargin;
-        btScalar stamargin;
-    };
-    
+					//                    const btScalar fc = 0;
+					const btScalar fc = psb->m_cfg.kDF * m_colObj1Wrap->getCollisionObject()->getFriction();
+
+					// the effective inverse mass of the face as in https://graphics.stanford.edu/papers/cloth-sig02/cloth.pdf
+					ima = bary.getX() * c.m_weights.getX() * n0->m_im + bary.getY() * c.m_weights.getY() * n1->m_im + bary.getZ() * c.m_weights.getZ() * n2->m_im;
+					c.m_c2 = ima;
+					c.m_c3 = fc;
+					c.m_c4 = m_colObj1Wrap->getCollisionObject()->isStaticOrKinematicObject() ? psb->m_cfg.kKHR : psb->m_cfg.kCHR;
+					c.m_c5 = Diagonal(ima);
+					if (cti.m_colObj->getInternalType() == btCollisionObject::CO_RIGID_BODY)
+					{
+						const btTransform& wtr = m_rigidBody ? m_rigidBody->getWorldTransform() : m_colObj1Wrap->getCollisionObject()->getWorldTransform();
+						static const btMatrix3x3 iwiStatic(0, 0, 0, 0, 0, 0, 0, 0, 0);
+						const btMatrix3x3& iwi = m_rigidBody ? m_rigidBody->getInvInertiaTensorWorld() : iwiStatic;
+						const btVector3 ra = contact_point - wtr.getOrigin();
+
+						// we do not scale the impulse matrix by dt
+						c.m_c0 = ImpulseMatrix(1, ima, imb, iwi, ra);
+						c.m_c1 = ra;
+					}
+					else if (cti.m_colObj->getInternalType() == btCollisionObject::CO_FEATHERSTONE_LINK)
+					{
+						btMultiBodyLinkCollider* multibodyLinkCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(cti.m_colObj);
+						if (multibodyLinkCol)
+						{
+							btVector3 normal = cti.m_normal;
+							btVector3 t1 = generateUnitOrthogonalVector(normal);
+							btVector3 t2 = btCross(normal, t1);
+							btMultiBodyJacobianData jacobianData_normal, jacobianData_t1, jacobianData_t2;
+							findJacobian(multibodyLinkCol, jacobianData_normal, contact_point, normal);
+							findJacobian(multibodyLinkCol, jacobianData_t1, contact_point, t1);
+							findJacobian(multibodyLinkCol, jacobianData_t2, contact_point, t2);
+
+							btScalar* J_n = &jacobianData_normal.m_jacobians[0];
+							btScalar* J_t1 = &jacobianData_t1.m_jacobians[0];
+							btScalar* J_t2 = &jacobianData_t2.m_jacobians[0];
+
+							btScalar* u_n = &jacobianData_normal.m_deltaVelocitiesUnitImpulse[0];
+							btScalar* u_t1 = &jacobianData_t1.m_deltaVelocitiesUnitImpulse[0];
+							btScalar* u_t2 = &jacobianData_t2.m_deltaVelocitiesUnitImpulse[0];
+
+							btMatrix3x3 rot(normal.getX(), normal.getY(), normal.getZ(),
+											t1.getX(), t1.getY(), t1.getZ(),
+											t2.getX(), t2.getY(), t2.getZ());  // world frame to local frame
+							const int ndof = multibodyLinkCol->m_multiBody->getNumDofs() + 6;
+							btMatrix3x3 local_impulse_matrix = (Diagonal(ima) + OuterProduct(J_n, J_t1, J_t2, u_n, u_t1, u_t2, ndof)).inverse();
+							c.m_c0 = rot.transpose() * local_impulse_matrix * rot;
+							c.jacobianData_normal = jacobianData_normal;
+							c.jacobianData_t1 = jacobianData_t1;
+							c.jacobianData_t2 = jacobianData_t2;
+							c.t1 = t1;
+							c.t2 = t2;
+						}
+					}
+					psb->m_faceRigidContacts.push_back(c);
+				}
+			}
+			// Set caching barycenters to be false after collision detection.
+			// Only turn on when contact is static.
+			f.m_pcontact[3] = 0;
+		}
+		btSoftBody* psb;
+		const btCollisionObjectWrapper* m_colObj1Wrap;
+		btRigidBody* m_rigidBody;
+		btScalar dynmargin;
+		btScalar stamargin;
+	};
+
 	//
 	// CollideVF_SS
 	//
@@ -1844,12 +1859,12 @@ struct btSoftColliders
 		{
 			btSoftBody::Node* node = (btSoftBody::Node*)lnode->data;
 			btSoftBody::Face* face = (btSoftBody::Face*)lface->data;
-            for (int i = 0; i < 3; ++i)
-            {
-                if (face->m_n[i] == node)
-                    continue;
-            }
-            
+			for (int i = 0; i < 3; ++i)
+			{
+				if (face->m_n[i] == node)
+					continue;
+			}
+
 			btVector3 o = node->m_x;
 			btVector3 p;
 			btScalar d = SIMD_INFINITY;
@@ -1879,7 +1894,7 @@ struct btSoftColliders
 					c.m_node = node;
 					c.m_face = face;
 					c.m_weights = w;
-					c.m_friction = btMax (psb[0]->m_cfg.kDF, psb[1]->m_cfg.kDF);
+					c.m_friction = btMax(psb[0]->m_cfg.kDF, psb[1]->m_cfg.kDF);
 					c.m_cfm[0] = ma / ms * psb[0]->m_cfg.kSHR;
 					c.m_cfm[1] = mb / ms * psb[1]->m_cfg.kSHR;
 					psb[0]->m_scontacts.push_back(c);
@@ -1889,206 +1904,205 @@ struct btSoftColliders
 		btSoftBody* psb[2];
 		btScalar mrg;
 	};
-    
-    
-    //
-    // CollideVF_DD
-    //
-    struct CollideVF_DD : btDbvt::ICollide
-    {
-        void Process(const btDbvtNode* lnode,
-                     const btDbvtNode* lface)
-        {
-            btSoftBody::Node* node = (btSoftBody::Node*)lnode->data;
-            btSoftBody::Face* face = (btSoftBody::Face*)lface->data;
-            btVector3 bary;
-            if (proximityTest(face->m_n[0]->m_x, face->m_n[1]->m_x, face->m_n[2]->m_x, node->m_x, face->m_normal, mrg, bary))
-            {
-                const btSoftBody::Node* n[] = {face->m_n[0], face->m_n[1], face->m_n[2]};
-                const btVector3 w = bary;
-                const btScalar ma = node->m_im;
-                btScalar mb = BaryEval(n[0]->m_im, n[1]->m_im, n[2]->m_im, w);
-                if ((n[0]->m_im <= 0) ||
-                    (n[1]->m_im <= 0) ||
-                    (n[2]->m_im <= 0))
-                {
-                    mb = 0;
-                }
-                const btScalar ms = ma + mb;
-                if (ms > 0)
-                {
-                    btSoftBody::DeformableFaceNodeContact c;
-                    c.m_normal = face->m_normal;
-                    if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
-                        c.m_normal  = -face->m_normal;
-                    c.m_margin = mrg;
-                    c.m_node = node;
-                    c.m_face = face;
-                    c.m_bary = w;
-                    c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
-                    psb[0]->m_faceNodeContacts.push_back(c);
-                }
-            }
-        }
-        btSoftBody* psb[2];
-        btScalar mrg;
-        bool useFaceNormal;
-    };
-    
-    //
-    // CollideFF_DD
-    //
-    struct CollideFF_DD : btDbvt::ICollide
-    {
-        void Process(const btDbvntNode* lface1,
-                     const btDbvntNode* lface2)
-        {
-            btSoftBody::Face* f1 = (btSoftBody::Face*)lface1->data;
-            btSoftBody::Face* f2 = (btSoftBody::Face*)lface2->data;
-            if (f1 != f2)
-            {
-                Repel(f1, f2);
-                Repel(f2, f1);
-            }
-        }
-        void Repel(btSoftBody::Face* f1, btSoftBody::Face* f2)
-        {
-            //#define REPEL_NEIGHBOR 1
+
+	//
+	// CollideVF_DD
+	//
+	struct CollideVF_DD : btDbvt::ICollide
+	{
+		void Process(const btDbvtNode* lnode,
+					 const btDbvtNode* lface)
+		{
+			btSoftBody::Node* node = (btSoftBody::Node*)lnode->data;
+			btSoftBody::Face* face = (btSoftBody::Face*)lface->data;
+			btVector3 bary;
+			if (proximityTest(face->m_n[0]->m_x, face->m_n[1]->m_x, face->m_n[2]->m_x, node->m_x, face->m_normal, mrg, bary))
+			{
+				const btSoftBody::Node* n[] = {face->m_n[0], face->m_n[1], face->m_n[2]};
+				const btVector3 w = bary;
+				const btScalar ma = node->m_im;
+				btScalar mb = BaryEval(n[0]->m_im, n[1]->m_im, n[2]->m_im, w);
+				if ((n[0]->m_im <= 0) ||
+					(n[1]->m_im <= 0) ||
+					(n[2]->m_im <= 0))
+				{
+					mb = 0;
+				}
+				const btScalar ms = ma + mb;
+				if (ms > 0)
+				{
+					btSoftBody::DeformableFaceNodeContact c;
+					c.m_normal = face->m_normal;
+					if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
+						c.m_normal = -face->m_normal;
+					c.m_margin = mrg;
+					c.m_node = node;
+					c.m_face = face;
+					c.m_bary = w;
+					c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
+					psb[0]->m_faceNodeContacts.push_back(c);
+				}
+			}
+		}
+		btSoftBody* psb[2];
+		btScalar mrg;
+		bool useFaceNormal;
+	};
+
+	//
+	// CollideFF_DD
+	//
+	struct CollideFF_DD : btDbvt::ICollide
+	{
+		void Process(const btDbvntNode* lface1,
+					 const btDbvntNode* lface2)
+		{
+			btSoftBody::Face* f1 = (btSoftBody::Face*)lface1->data;
+			btSoftBody::Face* f2 = (btSoftBody::Face*)lface2->data;
+			if (f1 != f2)
+			{
+				Repel(f1, f2);
+				Repel(f2, f1);
+			}
+		}
+		void Repel(btSoftBody::Face* f1, btSoftBody::Face* f2)
+		{
+			//#define REPEL_NEIGHBOR 1
 #ifndef REPEL_NEIGHBOR
-            for (int node_id = 0; node_id < 3; ++node_id)
-            {
-                btSoftBody::Node* node = f1->m_n[node_id];
-                for (int i = 0; i < 3; ++i)
-                {
-                    if (f2->m_n[i] == node)
-                        return;
-                }
-            }
+			for (int node_id = 0; node_id < 3; ++node_id)
+			{
+				btSoftBody::Node* node = f1->m_n[node_id];
+				for (int i = 0; i < 3; ++i)
+				{
+					if (f2->m_n[i] == node)
+						return;
+				}
+			}
 #endif
-            bool skip = false;
-            for (int node_id = 0; node_id < 3; ++node_id)
-            {
-                btSoftBody::Node* node = f1->m_n[node_id];
+			bool skip = false;
+			for (int node_id = 0; node_id < 3; ++node_id)
+			{
+				btSoftBody::Node* node = f1->m_n[node_id];
 #ifdef REPEL_NEIGHBOR
-                for (int i = 0; i < 3; ++i)
-                {
-                    if (f2->m_n[i] == node)
-                    {
-                        skip = true;
-                        break;
-                    }
-                }
-                if (skip)
-                {
-                    skip = false;
-                    continue;
-                }
+				for (int i = 0; i < 3; ++i)
+				{
+					if (f2->m_n[i] == node)
+					{
+						skip = true;
+						break;
+					}
+				}
+				if (skip)
+				{
+					skip = false;
+					continue;
+				}
 #endif
-                btSoftBody::Face* face = f2;
-                btVector3 bary;
-                if (!proximityTest(face->m_n[0]->m_x, face->m_n[1]->m_x, face->m_n[2]->m_x, node->m_x, face->m_normal, mrg, bary))
-                    continue;
-                btSoftBody::DeformableFaceNodeContact c;
-                c.m_normal = face->m_normal;
-                if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
-                    c.m_normal  = -face->m_normal;
-                c.m_margin = mrg;
-                c.m_node = node;
-                c.m_face = face;
-                c.m_bary = bary;
-                c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
-                psb[0]->m_faceNodeContacts.push_back(c);
-            }
-        }
-        btSoftBody* psb[2];
-        btScalar mrg;
-        bool useFaceNormal;
-    };
-
-    struct CollideCCD : btDbvt::ICollide
-    {
-        void Process(const btDbvtNode* lnode,
-                     const btDbvtNode* lface)
-        {
-            btSoftBody::Node* node = (btSoftBody::Node*)lnode->data;
-            btSoftBody::Face* face = (btSoftBody::Face*)lface->data;
-            btVector3 bary;
-            if (bernsteinCCD(face, node, dt, SAFE_EPSILON, bary))
-            {
-                btSoftBody::DeformableFaceNodeContact c;
-                c.m_normal = face->m_normal;
-                if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
-                    c.m_normal  = -face->m_normal;
-                c.m_node = node;
-                c.m_face = face;
-                c.m_bary = bary;
-                c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
-                psb[0]->m_faceNodeContacts.push_back(c);
-            }
-        }
-        void Process(const btDbvntNode* lface1,
-                     const btDbvntNode* lface2)
-        {
-            btSoftBody::Face* f1 = (btSoftBody::Face*)lface1->data;
-            btSoftBody::Face* f2 = (btSoftBody::Face*)lface2->data;
-            if (f1 != f2)
-            {
-                Repel(f1, f2);
-                Repel(f2, f1);
-            }
-        }
-        void Repel(btSoftBody::Face* f1, btSoftBody::Face* f2)
-        {
-            //#define REPEL_NEIGHBOR 1
+				btSoftBody::Face* face = f2;
+				btVector3 bary;
+				if (!proximityTest(face->m_n[0]->m_x, face->m_n[1]->m_x, face->m_n[2]->m_x, node->m_x, face->m_normal, mrg, bary))
+					continue;
+				btSoftBody::DeformableFaceNodeContact c;
+				c.m_normal = face->m_normal;
+				if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
+					c.m_normal = -face->m_normal;
+				c.m_margin = mrg;
+				c.m_node = node;
+				c.m_face = face;
+				c.m_bary = bary;
+				c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
+				psb[0]->m_faceNodeContacts.push_back(c);
+			}
+		}
+		btSoftBody* psb[2];
+		btScalar mrg;
+		bool useFaceNormal;
+	};
+
+	struct CollideCCD : btDbvt::ICollide
+	{
+		void Process(const btDbvtNode* lnode,
+					 const btDbvtNode* lface)
+		{
+			btSoftBody::Node* node = (btSoftBody::Node*)lnode->data;
+			btSoftBody::Face* face = (btSoftBody::Face*)lface->data;
+			btVector3 bary;
+			if (bernsteinCCD(face, node, dt, SAFE_EPSILON, bary))
+			{
+				btSoftBody::DeformableFaceNodeContact c;
+				c.m_normal = face->m_normal;
+				if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
+					c.m_normal = -face->m_normal;
+				c.m_node = node;
+				c.m_face = face;
+				c.m_bary = bary;
+				c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
+				psb[0]->m_faceNodeContacts.push_back(c);
+			}
+		}
+		void Process(const btDbvntNode* lface1,
+					 const btDbvntNode* lface2)
+		{
+			btSoftBody::Face* f1 = (btSoftBody::Face*)lface1->data;
+			btSoftBody::Face* f2 = (btSoftBody::Face*)lface2->data;
+			if (f1 != f2)
+			{
+				Repel(f1, f2);
+				Repel(f2, f1);
+			}
+		}
+		void Repel(btSoftBody::Face* f1, btSoftBody::Face* f2)
+		{
+			//#define REPEL_NEIGHBOR 1
 #ifndef REPEL_NEIGHBOR
-            for (int node_id = 0; node_id < 3; ++node_id)
-            {
-                btSoftBody::Node* node = f1->m_n[node_id];
-                for (int i = 0; i < 3; ++i)
-                {
-                    if (f2->m_n[i] == node)
-                        return;
-                }
-            }
+			for (int node_id = 0; node_id < 3; ++node_id)
+			{
+				btSoftBody::Node* node = f1->m_n[node_id];
+				for (int i = 0; i < 3; ++i)
+				{
+					if (f2->m_n[i] == node)
+						return;
+				}
+			}
 #endif
-            bool skip = false;
-            for (int node_id = 0; node_id < 3; ++node_id)
-            {
-                btSoftBody::Node* node = f1->m_n[node_id];
+			bool skip = false;
+			for (int node_id = 0; node_id < 3; ++node_id)
+			{
+				btSoftBody::Node* node = f1->m_n[node_id];
 #ifdef REPEL_NEIGHBOR
-                for (int i = 0; i < 3; ++i)
-                {
-                    if (f2->m_n[i] == node)
-                    {
-                        skip = true;
-                        break;
-                    }
-                }
-                if (skip)
-                {
-                    skip = false;
-                    continue;
-                }
+				for (int i = 0; i < 3; ++i)
+				{
+					if (f2->m_n[i] == node)
+					{
+						skip = true;
+						break;
+					}
+				}
+				if (skip)
+				{
+					skip = false;
+					continue;
+				}
 #endif
-                btSoftBody::Face* face = f2;
-                btVector3 bary;
+				btSoftBody::Face* face = f2;
+				btVector3 bary;
 				if (bernsteinCCD(face, node, dt, SAFE_EPSILON, bary))
-                {
-                    btSoftBody::DeformableFaceNodeContact c;
-                    c.m_normal = face->m_normal;
-                    if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
-                        c.m_normal  = -face->m_normal;
-                    c.m_node = node;
-                    c.m_face = face;
-                    c.m_bary = bary;
-                    c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
-                    psb[0]->m_faceNodeContacts.push_back(c);
-                }
-            }
-        }
-        btSoftBody* psb[2];
-        btScalar dt, mrg;
-        bool useFaceNormal;
-    };
+				{
+					btSoftBody::DeformableFaceNodeContact c;
+					c.m_normal = face->m_normal;
+					if (!useFaceNormal && c.m_normal.dot(node->m_x - face->m_n[2]->m_x) < 0)
+						c.m_normal = -face->m_normal;
+					c.m_node = node;
+					c.m_face = face;
+					c.m_bary = bary;
+					c.m_friction = psb[0]->m_cfg.kDF * psb[1]->m_cfg.kDF;
+					psb[0]->m_faceNodeContacts.push_back(c);
+				}
+			}
+		}
+		btSoftBody* psb[2];
+		btScalar dt, mrg;
+		bool useFaceNormal;
+	};
 };
 #endif  //_BT_SOFT_BODY_INTERNALS_H
diff --git a/thirdparty/bullet/BulletSoftBody/btSoftBodySolvers.h b/thirdparty/bullet/BulletSoftBody/btSoftBodySolvers.h
index c4ac4141aa..dbb2624eee 100644
--- a/thirdparty/bullet/BulletSoftBody/btSoftBodySolvers.h
+++ b/thirdparty/bullet/BulletSoftBody/btSoftBodySolvers.h
@@ -36,7 +36,7 @@ public:
 		CL_SIMD_SOLVER,
 		DX_SOLVER,
 		DX_SIMD_SOLVER,
-        DEFORMABLE_SOLVER
+		DEFORMABLE_SOLVER
 	};
 
 protected:
diff --git a/thirdparty/bullet/BulletSoftBody/btSoftMultiBodyDynamicsWorld.cpp b/thirdparty/bullet/BulletSoftBody/btSoftMultiBodyDynamicsWorld.cpp
index 282dbf75f0..329bd19d71 100644
--- a/thirdparty/bullet/BulletSoftBody/btSoftMultiBodyDynamicsWorld.cpp
+++ b/thirdparty/bullet/BulletSoftBody/btSoftMultiBodyDynamicsWorld.cpp
@@ -100,6 +100,11 @@ void btSoftMultiBodyDynamicsWorld::internalSingleStepSimulation(btScalar timeSte
 	///update soft bodies
 	m_softBodySolver->updateSoftBodies();
 
+	for (int i = 0; i < m_softBodies.size(); i++)
+	{
+		btSoftBody* psb = (btSoftBody*)m_softBodies[i];
+		psb->interpolateRenderMesh();
+	}
 	// End solver-wise simulation step
 	// ///////////////////////////////
 }
diff --git a/thirdparty/bullet/BulletSoftBody/btSparseSDF.h b/thirdparty/bullet/BulletSoftBody/btSparseSDF.h
index eb290a1dbd..d611726bcd 100644
--- a/thirdparty/bullet/BulletSoftBody/btSparseSDF.h
+++ b/thirdparty/bullet/BulletSoftBody/btSparseSDF.h
@@ -22,36 +22,36 @@ subject to the following restrictions:
 
 // Fast Hash
 
-#if !defined (get16bits)
-#define get16bits(d) ((((unsigned int)(((const unsigned char *)(d))[1])) << 8)\
-+(unsigned int)(((const unsigned char *)(d))[0]) )
+#if !defined(get16bits)
+#define get16bits(d) ((((unsigned int)(((const unsigned char*)(d))[1])) << 8) + (unsigned int)(((const unsigned char*)(d))[0]))
 #endif
 //
 // super hash function by Paul Hsieh
 //
-inline unsigned int HsiehHash (const char * data, int len) {
-  unsigned int hash = len, tmp;
-  len>>=2;
-
-    /* Main loop */
-    for (;len > 0; len--) {
-        hash  += get16bits (data);
-        tmp    = (get16bits (data+2) << 11) ^ hash;
-        hash   = (hash << 16) ^ tmp;
-        data  += 2*sizeof (unsigned short);
-        hash  += hash >> 11;
-    }
+inline unsigned int HsiehHash(const char* data, int len)
+{
+	unsigned int hash = len, tmp;
+	len >>= 2;
 
+	/* Main loop */
+	for (; len > 0; len--)
+	{
+		hash += get16bits(data);
+		tmp = (get16bits(data + 2) << 11) ^ hash;
+		hash = (hash << 16) ^ tmp;
+		data += 2 * sizeof(unsigned short);
+		hash += hash >> 11;
+	}
 
-    /* Force "avalanching" of final 127 bits */
-    hash ^= hash << 3;
-    hash += hash >> 5;
-    hash ^= hash << 4;
-    hash += hash >> 17;
-    hash ^= hash << 25;
-    hash += hash >> 6;
+	/* Force "avalanching" of final 127 bits */
+	hash ^= hash << 3;
+	hash += hash >> 5;
+	hash ^= hash << 4;
+	hash += hash >> 17;
+	hash ^= hash << 25;
+	hash += hash >> 6;
 
-    return hash;
+	return hash;
 }
 
 template <const int CELLSIZE>
@@ -81,7 +81,7 @@ struct btSparseSdf
 
 	btAlignedObjectArray<Cell*> cells;
 	btScalar voxelsz;
-    btScalar m_defaultVoxelsz;
+	btScalar m_defaultVoxelsz;
 	int puid;
 	int ncells;
 	int m_clampCells;
@@ -103,16 +103,16 @@ struct btSparseSdf
 		//if this limit is reached, the SDF is reset (at the cost of some performance during the reset)
 		m_clampCells = clampCells;
 		cells.resize(hashsize, 0);
-        m_defaultVoxelsz = 0.25;
+		m_defaultVoxelsz = 0.25;
 		Reset();
 	}
 	//
-    
-    void setDefaultVoxelsz(btScalar sz)
-    {
-        m_defaultVoxelsz = sz;
-    }
-    
+
+	void setDefaultVoxelsz(btScalar sz)
+	{
+		m_defaultVoxelsz = sz;
+	}
+
 	void Reset()
 	{
 		for (int i = 0, ni = cells.size(); i < ni; ++i)
@@ -162,7 +162,7 @@ struct btSparseSdf
 		nqueries = 1;
 		nprobes = 1;
 		++puid;  ///@todo: Reset puid's when int range limit is reached	*/
-				 /* else setup a priority list...						*/
+		/* else setup a priority list...						*/
 	}
 	//
 	int RemoveReferences(btCollisionShape* pcs)
@@ -221,7 +221,7 @@ struct btSparseSdf
 			else
 			{
 				// printf("c->hash/c[0][1][2]=%d,%d,%d,%d\n", c->hash, c->c[0], c->c[1],c->c[2]);
-                        //printf("h,ixb,iyb,izb=%d,%d,%d,%d\n", h,ix.b, iy.b, iz.b);
+				//printf("h,ixb,iyb,izb=%d,%d,%d,%d\n", h,ix.b, iy.b, iz.b);
 
 				c = c->next;
 			}
@@ -363,7 +363,7 @@ struct btSparseSdf
 		myset.p = (void*)shape;
 		const char* ptr = (const char*)&myset;
 
-		unsigned int result = HsiehHash(ptr, sizeof(btS) );
+		unsigned int result = HsiehHash(ptr, sizeof(btS));
 
 		return result;
 	}
diff --git a/thirdparty/bullet/BulletSoftBody/poly34.cpp b/thirdparty/bullet/BulletSoftBody/poly34.cpp
index 819d0c79f7..ec7549c8e8 100644
--- a/thirdparty/bullet/BulletSoftBody/poly34.cpp
+++ b/thirdparty/bullet/BulletSoftBody/poly34.cpp
@@ -6,7 +6,7 @@
 //
 #include <math.h>
 
-#include "poly34.h" // solution of cubic and quartic equation
+#include "poly34.h"  // solution of cubic and quartic equation
 #define TwoPi 6.28318530717958648
 const btScalar eps = SIMD_EPSILON;
 
@@ -15,50 +15,53 @@ const btScalar eps = SIMD_EPSILON;
 //=============================================================================
 static SIMD_FORCE_INLINE btScalar _root3(btScalar x)
 {
-    btScalar s = 1.;
-    while (x < 1.) {
-        x *= 8.;
-        s *= 0.5;
-    }
-    while (x > 8.) {
-        x *= 0.125;
-        s *= 2.;
-    }
-    btScalar r = 1.5;
-    r -= 1. / 3. * (r - x / (r * r));
-    r -= 1. / 3. * (r - x / (r * r));
-    r -= 1. / 3. * (r - x / (r * r));
-    r -= 1. / 3. * (r - x / (r * r));
-    r -= 1. / 3. * (r - x / (r * r));
-    r -= 1. / 3. * (r - x / (r * r));
-    return r * s;
+	btScalar s = 1.;
+	while (x < 1.)
+	{
+		x *= 8.;
+		s *= 0.5;
+	}
+	while (x > 8.)
+	{
+		x *= 0.125;
+		s *= 2.;
+	}
+	btScalar r = 1.5;
+	r -= 1. / 3. * (r - x / (r * r));
+	r -= 1. / 3. * (r - x / (r * r));
+	r -= 1. / 3. * (r - x / (r * r));
+	r -= 1. / 3. * (r - x / (r * r));
+	r -= 1. / 3. * (r - x / (r * r));
+	r -= 1. / 3. * (r - x / (r * r));
+	return r * s;
 }
 
 btScalar SIMD_FORCE_INLINE root3(btScalar x)
 {
-    if (x > 0)
-        return _root3(x);
-    else if (x < 0)
-        return -_root3(-x);
-    else
-        return 0.;
+	if (x > 0)
+		return _root3(x);
+	else if (x < 0)
+		return -_root3(-x);
+	else
+		return 0.;
 }
 
 // x - array of size 2
 // return 2: 2 real roots x[0], x[1]
 // return 0: pair of complex roots: x[0]i*x[1]
 int SolveP2(btScalar* x, btScalar a, btScalar b)
-{ // solve equation x^2 + a*x + b = 0
-    btScalar D = 0.25 * a * a - b;
-    if (D >= 0) {
-        D = sqrt(D);
-        x[0] = -0.5 * a + D;
-        x[1] = -0.5 * a - D;
-        return 2;
-    }
-    x[0] = -0.5 * a;
-    x[1] = sqrt(-D);
-    return 0;
+{  // solve equation x^2 + a*x + b = 0
+	btScalar D = 0.25 * a * a - b;
+	if (D >= 0)
+	{
+		D = sqrt(D);
+		x[0] = -0.5 * a + D;
+		x[1] = -0.5 * a - D;
+		return 2;
+	}
+	x[0] = -0.5 * a;
+	x[1] = sqrt(-D);
+	return 0;
 }
 //---------------------------------------------------------------------------
 // x - array of size 3
@@ -66,217 +69,228 @@ int SolveP2(btScalar* x, btScalar a, btScalar b)
 //         2 real roots: x[0], x[1],          return 2
 //         1 real root : x[0], x[1]  i*x[2], return 1
 int SolveP3(btScalar* x, btScalar a, btScalar b, btScalar c)
-{ // solve cubic equation x^3 + a*x^2 + b*x + c = 0
-    btScalar a2 = a * a;
-    btScalar q = (a2 - 3 * b) / 9;
-    if (q < 0)
-        q = eps;
-    btScalar r = (a * (2 * a2 - 9 * b) + 27 * c) / 54;
-    // equation x^3 + q*x + r = 0
-    btScalar r2 = r * r;
-    btScalar q3 = q * q * q;
-    btScalar A, B;
-    if (r2 <= (q3 + eps)) { //<<-- FIXED!
-        btScalar t = r / sqrt(q3);
-        if (t < -1)
-            t = -1;
-        if (t > 1)
-            t = 1;
-        t = acos(t);
-        a /= 3;
-        q = -2 * sqrt(q);
-        x[0] = q * cos(t / 3) - a;
-        x[1] = q * cos((t + TwoPi) / 3) - a;
-        x[2] = q * cos((t - TwoPi) / 3) - a;
-        return (3);
-    }
-    else {
-        //A =-pow(fabs(r)+sqrt(r2-q3),1./3);
-        A = -root3(fabs(r) + sqrt(r2 - q3));
-        if (r < 0)
-            A = -A;
-        B = (A == 0 ? 0 : q / A);
-        
-        a /= 3;
-        x[0] = (A + B) - a;
-        x[1] = -0.5 * (A + B) - a;
-        x[2] = 0.5 * sqrt(3.) * (A - B);
-        if (fabs(x[2]) < eps) {
-            x[2] = x[1];
-            return (2);
-        }
-        return (1);
-    }
-} // SolveP3(btScalar *x,btScalar a,btScalar b,btScalar c) {
+{  // solve cubic equation x^3 + a*x^2 + b*x + c = 0
+	btScalar a2 = a * a;
+	btScalar q = (a2 - 3 * b) / 9;
+	if (q < 0)
+		q = eps;
+	btScalar r = (a * (2 * a2 - 9 * b) + 27 * c) / 54;
+	// equation x^3 + q*x + r = 0
+	btScalar r2 = r * r;
+	btScalar q3 = q * q * q;
+	btScalar A, B;
+	if (r2 <= (q3 + eps))
+	{  //<<-- FIXED!
+		btScalar t = r / sqrt(q3);
+		if (t < -1)
+			t = -1;
+		if (t > 1)
+			t = 1;
+		t = acos(t);
+		a /= 3;
+		q = -2 * sqrt(q);
+		x[0] = q * cos(t / 3) - a;
+		x[1] = q * cos((t + TwoPi) / 3) - a;
+		x[2] = q * cos((t - TwoPi) / 3) - a;
+		return (3);
+	}
+	else
+	{
+		//A =-pow(fabs(r)+sqrt(r2-q3),1./3);
+		A = -root3(fabs(r) + sqrt(r2 - q3));
+		if (r < 0)
+			A = -A;
+		B = (A == 0 ? 0 : q / A);
+
+		a /= 3;
+		x[0] = (A + B) - a;
+		x[1] = -0.5 * (A + B) - a;
+		x[2] = 0.5 * sqrt(3.) * (A - B);
+		if (fabs(x[2]) < eps)
+		{
+			x[2] = x[1];
+			return (2);
+		}
+		return (1);
+	}
+}  // SolveP3(btScalar *x,btScalar a,btScalar b,btScalar c) {
 //---------------------------------------------------------------------------
 // a>=0!
-void CSqrt(btScalar x, btScalar y, btScalar& a, btScalar& b) // returns:  a+i*s = sqrt(x+i*y)
+void CSqrt(btScalar x, btScalar y, btScalar& a, btScalar& b)  // returns:  a+i*b = sqrt(x+i*y)
 {
-    btScalar r = sqrt(x * x + y * y);
-    if (y == 0) {
-        r = sqrt(r);
-        if (x >= 0) {
-            a = r;
-            b = 0;
-        }
-        else {
-            a = 0;
-            b = r;
-        }
-    }
-    else { // y != 0
-        a = sqrt(0.5 * (x + r));
-        b = 0.5 * y / a;
-    }
+	btScalar r = sqrt(x * x + y * y);
+	if (y == 0)
+	{
+		r = sqrt(r);
+		if (x >= 0)
+		{
+			a = r;
+			b = 0;
+		}
+		else
+		{
+			a = 0;
+			b = r;
+		}
+	}
+	else
+	{  // y != 0
+		a = sqrt(0.5 * (x + r));
+		b = 0.5 * y / a;
+	}
 }
 //---------------------------------------------------------------------------
-int SolveP4Bi(btScalar* x, btScalar b, btScalar d) // solve equation x^4 + b*x^2 + d = 0
+int SolveP4Bi(btScalar* x, btScalar b, btScalar d)  // solve equation x^4 + b*x^2 + d = 0
 {
-    btScalar D = b * b - 4 * d;
-    if (D >= 0) {
-        btScalar sD = sqrt(D);
-        btScalar x1 = (-b + sD) / 2;
-        btScalar x2 = (-b - sD) / 2; // x2 <= x1
-        if (x2 >= 0) // 0 <= x2 <= x1, 4 real roots
-        {
-            btScalar sx1 = sqrt(x1);
-            btScalar sx2 = sqrt(x2);
-            x[0] = -sx1;
-            x[1] = sx1;
-            x[2] = -sx2;
-            x[3] = sx2;
-            return 4;
-        }
-        if (x1 < 0) // x2 <= x1 < 0, two pair of imaginary roots
-        {
-            btScalar sx1 = sqrt(-x1);
-            btScalar sx2 = sqrt(-x2);
-            x[0] = 0;
-            x[1] = sx1;
-            x[2] = 0;
-            x[3] = sx2;
-            return 0;
-        }
-        // now x2 < 0 <= x1 , two real roots and one pair of imginary root
-        btScalar sx1 = sqrt(x1);
-        btScalar sx2 = sqrt(-x2);
-        x[0] = -sx1;
-        x[1] = sx1;
-        x[2] = 0;
-        x[3] = sx2;
-        return 2;
-    }
-    else { // if( D < 0 ), two pair of compex roots
-        btScalar sD2 = 0.5 * sqrt(-D);
-        CSqrt(-0.5 * b, sD2, x[0], x[1]);
-        CSqrt(-0.5 * b, -sD2, x[2], x[3]);
-        return 0;
-    } // if( D>=0 )
-} // SolveP4Bi(btScalar *x, btScalar b, btScalar d)    // solve equation x^4 + b*x^2 d
+	btScalar D = b * b - 4 * d;
+	if (D >= 0)
+	{
+		btScalar sD = sqrt(D);
+		btScalar x1 = (-b + sD) / 2;
+		btScalar x2 = (-b - sD) / 2;  // x2 <= x1
+		if (x2 >= 0)                  // 0 <= x2 <= x1, 4 real roots
+		{
+			btScalar sx1 = sqrt(x1);
+			btScalar sx2 = sqrt(x2);
+			x[0] = -sx1;
+			x[1] = sx1;
+			x[2] = -sx2;
+			x[3] = sx2;
+			return 4;
+		}
+		if (x1 < 0)  // x2 <= x1 < 0, two pair of imaginary roots
+		{
+			btScalar sx1 = sqrt(-x1);
+			btScalar sx2 = sqrt(-x2);
+			x[0] = 0;
+			x[1] = sx1;
+			x[2] = 0;
+			x[3] = sx2;
+			return 0;
+		}
+		// now x2 < 0 <= x1: two real roots and one pair of imaginary roots
+		btScalar sx1 = sqrt(x1);
+		btScalar sx2 = sqrt(-x2);
+		x[0] = -sx1;
+		x[1] = sx1;
+		x[2] = 0;
+		x[3] = sx2;
+		return 2;
+	}
+	else
+	{  // if( D < 0 ), two pairs of complex roots
+		btScalar sD2 = 0.5 * sqrt(-D);
+		CSqrt(-0.5 * b, sD2, x[0], x[1]);
+		CSqrt(-0.5 * b, -sD2, x[2], x[3]);
+		return 0;
+	}  // if( D>=0 )
+}  // SolveP4Bi(btScalar *x, btScalar b, btScalar d)    // solve equation x^4 + b*x^2 + d = 0
 //---------------------------------------------------------------------------
 #define SWAP(a, b) \
-{              \
-t = b;     \
-b = a;     \
-a = t;     \
-}
-static void dblSort3(btScalar& a, btScalar& b, btScalar& c) // make: a <= b <= c
+	{              \
+		t = b;     \
+		b = a;     \
+		a = t;     \
+	}
+static void dblSort3(btScalar& a, btScalar& b, btScalar& c)  // make: a <= b <= c
 {
-    btScalar t;
-    if (a > b)
-        SWAP(a, b); // now a<=b
-    if (c < b) {
-        SWAP(b, c); // now a<=b, b<=c
-        if (a > b)
-            SWAP(a, b); // now a<=b
-    }
+	btScalar t;
+	if (a > b)
+		SWAP(a, b);  // now a<=b
+	if (c < b)
+	{
+		SWAP(b, c);  // now a<=b, b<=c
+		if (a > b)
+			SWAP(a, b);  // now a<=b
+	}
 }
 //---------------------------------------------------------------------------
-int SolveP4De(btScalar* x, btScalar b, btScalar c, btScalar d) // solve equation x^4 + b*x^2 + c*x + d
+int SolveP4De(btScalar* x, btScalar b, btScalar c, btScalar d)  // solve equation x^4 + b*x^2 + c*x + d
 {
-    //if( c==0 ) return SolveP4Bi(x,b,d); // After that, c!=0
-    if (fabs(c) < 1e-14 * (fabs(b) + fabs(d)))
-        return SolveP4Bi(x, b, d); // After that, c!=0
-    
-    int res3 = SolveP3(x, 2 * b, b * b - 4 * d, -c * c); // solve resolvent
-    // by Viet theorem:  x1*x2*x3=-c*c not equals to 0, so x1!=0, x2!=0, x3!=0
-    if (res3 > 1) // 3 real roots,
-    {
-        dblSort3(x[0], x[1], x[2]); // sort roots to x[0] <= x[1] <= x[2]
-        // Note: x[0]*x[1]*x[2]= c*c > 0
-        if (x[0] > 0) // all roots are positive
-        {
-            btScalar sz1 = sqrt(x[0]);
-            btScalar sz2 = sqrt(x[1]);
-            btScalar sz3 = sqrt(x[2]);
-            // Note: sz1*sz2*sz3= -c (and not equal to 0)
-            if (c > 0) {
-                x[0] = (-sz1 - sz2 - sz3) / 2;
-                x[1] = (-sz1 + sz2 + sz3) / 2;
-                x[2] = (+sz1 - sz2 + sz3) / 2;
-                x[3] = (+sz1 + sz2 - sz3) / 2;
-                return 4;
-            }
-            // now: c<0
-            x[0] = (-sz1 - sz2 + sz3) / 2;
-            x[1] = (-sz1 + sz2 - sz3) / 2;
-            x[2] = (+sz1 - sz2 - sz3) / 2;
-            x[3] = (+sz1 + sz2 + sz3) / 2;
-            return 4;
-        } // if( x[0] > 0) // all roots are positive
-        // now x[0] <= x[1] < 0, x[2] > 0
-        // two pair of comlex roots
-        btScalar sz1 = sqrt(-x[0]);
-        btScalar sz2 = sqrt(-x[1]);
-        btScalar sz3 = sqrt(x[2]);
-        
-        if (c > 0) // sign = -1
-        {
-            x[0] = -sz3 / 2;
-            x[1] = (sz1 - sz2) / 2; // x[0]i*x[1]
-            x[2] = sz3 / 2;
-            x[3] = (-sz1 - sz2) / 2; // x[2]i*x[3]
-            return 0;
-        }
-        // now: c<0 , sign = +1
-        x[0] = sz3 / 2;
-        x[1] = (-sz1 + sz2) / 2;
-        x[2] = -sz3 / 2;
-        x[3] = (sz1 + sz2) / 2;
-        return 0;
-    } // if( res3>1 )    // 3 real roots,
-    // now resoventa have 1 real and pair of compex roots
-    // x[0] - real root, and x[0]>0,
-    // x[1]i*x[2] - complex roots,
-    // x[0] must be >=0. But one times x[0]=~ 1e-17, so:
-    if (x[0] < 0)
-        x[0] = 0;
-    btScalar sz1 = sqrt(x[0]);
-    btScalar szr, szi;
-    CSqrt(x[1], x[2], szr, szi); // (szr+i*szi)^2 = x[1]+i*x[2]
-    if (c > 0) // sign = -1
-    {
-        x[0] = -sz1 / 2 - szr; // 1st real root
-        x[1] = -sz1 / 2 + szr; // 2nd real root
-        x[2] = sz1 / 2;
-        x[3] = szi;
-        return 2;
-    }
-    // now: c<0 , sign = +1
-    x[0] = sz1 / 2 - szr; // 1st real root
-    x[1] = sz1 / 2 + szr; // 2nd real root
-    x[2] = -sz1 / 2;
-    x[3] = szi;
-    return 2;
-} // SolveP4De(btScalar *x, btScalar b, btScalar c, btScalar d)    // solve equation x^4 + b*x^2 + c*x + d
+	//if( c==0 ) return SolveP4Bi(x,b,d); // After that, c!=0
+	if (fabs(c) < 1e-14 * (fabs(b) + fabs(d)))
+		return SolveP4Bi(x, b, d);  // After that, c!=0
+
+	int res3 = SolveP3(x, 2 * b, b * b - 4 * d, -c * c);  // solve resolvent
+	// by Vieta's theorem: x1*x2*x3 = c*c, which is nonzero here, so x1!=0, x2!=0, x3!=0
+	if (res3 > 1)  // 3 real roots,
+	{
+		dblSort3(x[0], x[1], x[2]);  // sort roots to x[0] <= x[1] <= x[2]
+		// Note: x[0]*x[1]*x[2]= c*c > 0
+		if (x[0] > 0)  // all roots are positive
+		{
+			btScalar sz1 = sqrt(x[0]);
+			btScalar sz2 = sqrt(x[1]);
+			btScalar sz3 = sqrt(x[2]);
+			// Note: sz1*sz2*sz3= -c (and not equal to 0)
+			if (c > 0)
+			{
+				x[0] = (-sz1 - sz2 - sz3) / 2;
+				x[1] = (-sz1 + sz2 + sz3) / 2;
+				x[2] = (+sz1 - sz2 + sz3) / 2;
+				x[3] = (+sz1 + sz2 - sz3) / 2;
+				return 4;
+			}
+			// now: c<0
+			x[0] = (-sz1 - sz2 + sz3) / 2;
+			x[1] = (-sz1 + sz2 - sz3) / 2;
+			x[2] = (+sz1 - sz2 - sz3) / 2;
+			x[3] = (+sz1 + sz2 + sz3) / 2;
+			return 4;
+		}  // if( x[0] > 0) // all roots are positive
+		// now x[0] <= x[1] < 0, x[2] > 0
+		// two pairs of complex roots
+		btScalar sz1 = sqrt(-x[0]);
+		btScalar sz2 = sqrt(-x[1]);
+		btScalar sz3 = sqrt(x[2]);
+
+		if (c > 0)  // sign = -1
+		{
+			x[0] = -sz3 / 2;
+			x[1] = (sz1 - sz2) / 2;  // x[0]i*x[1]
+			x[2] = sz3 / 2;
+			x[3] = (-sz1 - sz2) / 2;  // x[2]i*x[3]
+			return 0;
+		}
+		// now: c<0 , sign = +1
+		x[0] = sz3 / 2;
+		x[1] = (-sz1 + sz2) / 2;
+		x[2] = -sz3 / 2;
+		x[3] = (sz1 + sz2) / 2;
+		return 0;
+	}  // if( res3>1 )    // 3 real roots,
+	// now the resolvent has 1 real root and a pair of complex roots
+	// x[0] - real root, and x[0]>0,
+	// x[1]i*x[2] - complex roots,
+	// x[0] must be >= 0, but round-off can leave it slightly negative (magnitude ~1e-17), so clamp it:
+	if (x[0] < 0)
+		x[0] = 0;
+	btScalar sz1 = sqrt(x[0]);
+	btScalar szr, szi;
+	CSqrt(x[1], x[2], szr, szi);  // (szr+i*szi)^2 = x[1]+i*x[2]
+	if (c > 0)                    // sign = -1
+	{
+		x[0] = -sz1 / 2 - szr;  // 1st real root
+		x[1] = -sz1 / 2 + szr;  // 2nd real root
+		x[2] = sz1 / 2;
+		x[3] = szi;
+		return 2;
+	}
+	// now: c<0 , sign = +1
+	x[0] = sz1 / 2 - szr;  // 1st real root
+	x[1] = sz1 / 2 + szr;  // 2nd real root
+	x[2] = -sz1 / 2;
+	x[3] = szi;
+	return 2;
+}  // SolveP4De(btScalar *x, btScalar b, btScalar c, btScalar d)    // solve equation x^4 + b*x^2 + c*x + d
 //-----------------------------------------------------------------------------
-btScalar N4Step(btScalar x, btScalar a, btScalar b, btScalar c, btScalar d) // one Newton step for x^4 + a*x^3 + b*x^2 + c*x + d
+btScalar N4Step(btScalar x, btScalar a, btScalar b, btScalar c, btScalar d)  // one Newton step for x^4 + a*x^3 + b*x^2 + c*x + d
 {
-    btScalar fxs = ((4 * x + 3 * a) * x + 2 * b) * x + c; // f'(x)
-    if (fxs == 0)
-        return x; //return 1e99; <<-- FIXED!
-    btScalar fx = (((x + a) * x + b) * x + c) * x + d; // f(x)
-    return x - fx / fxs;
+	btScalar fxs = ((4 * x + 3 * a) * x + 2 * b) * x + c;  // f'(x)
+	if (fxs == 0)
+		return x;                                       //return 1e99; <<-- FIXED!
+	btScalar fx = (((x + a) * x + b) * x + c) * x + d;  // f(x)
+	return x - fx / fxs;
 }
 //-----------------------------------------------------------------------------
 // x - array of size 4
@@ -284,136 +298,150 @@ btScalar N4Step(btScalar x, btScalar a, btScalar b, btScalar c, btScalar d) // o
 // return 2: 2 real roots x[0], x[1] and complex x[2]i*x[3],
 // return 0: two pair of complex roots: x[0]i*x[1],  x[2]i*x[3],
 int SolveP4(btScalar* x, btScalar a, btScalar b, btScalar c, btScalar d)
-{ // solve equation x^4 + a*x^3 + b*x^2 + c*x + d by Dekart-Euler method
-    // move to a=0:
-    btScalar d1 = d + 0.25 * a * (0.25 * b * a - 3. / 64 * a * a * a - c);
-    btScalar c1 = c + 0.5 * a * (0.25 * a * a - b);
-    btScalar b1 = b - 0.375 * a * a;
-    int res = SolveP4De(x, b1, c1, d1);
-    if (res == 4) {
-        x[0] -= a / 4;
-        x[1] -= a / 4;
-        x[2] -= a / 4;
-        x[3] -= a / 4;
-    }
-    else if (res == 2) {
-        x[0] -= a / 4;
-        x[1] -= a / 4;
-        x[2] -= a / 4;
-    }
-    else {
-        x[0] -= a / 4;
-        x[2] -= a / 4;
-    }
-    // one Newton step for each real root:
-    if (res > 0) {
-        x[0] = N4Step(x[0], a, b, c, d);
-        x[1] = N4Step(x[1], a, b, c, d);
-    }
-    if (res > 2) {
-        x[2] = N4Step(x[2], a, b, c, d);
-        x[3] = N4Step(x[3], a, b, c, d);
-    }
-    return res;
+{  // solve equation x^4 + a*x^3 + b*x^2 + c*x + d = 0 by the Descartes-Euler method
+	// move to a=0:
+	btScalar d1 = d + 0.25 * a * (0.25 * b * a - 3. / 64 * a * a * a - c);
+	btScalar c1 = c + 0.5 * a * (0.25 * a * a - b);
+	btScalar b1 = b - 0.375 * a * a;
+	int res = SolveP4De(x, b1, c1, d1);
+	if (res == 4)
+	{
+		x[0] -= a / 4;
+		x[1] -= a / 4;
+		x[2] -= a / 4;
+		x[3] -= a / 4;
+	}
+	else if (res == 2)
+	{
+		x[0] -= a / 4;
+		x[1] -= a / 4;
+		x[2] -= a / 4;
+	}
+	else
+	{
+		x[0] -= a / 4;
+		x[2] -= a / 4;
+	}
+	// one Newton step for each real root:
+	if (res > 0)
+	{
+		x[0] = N4Step(x[0], a, b, c, d);
+		x[1] = N4Step(x[1], a, b, c, d);
+	}
+	if (res > 2)
+	{
+		x[2] = N4Step(x[2], a, b, c, d);
+		x[3] = N4Step(x[3], a, b, c, d);
+	}
+	return res;
 }
 //-----------------------------------------------------------------------------
 #define F5(t) (((((t + a) * t + b) * t + c) * t + d) * t + e)
 //-----------------------------------------------------------------------------
-btScalar SolveP5_1(btScalar a, btScalar b, btScalar c, btScalar d, btScalar e) // return real root of x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
+btScalar SolveP5_1(btScalar a, btScalar b, btScalar c, btScalar d, btScalar e)  // return real root of x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
 {
-    int cnt;
-    if (fabs(e) < eps)
-        return 0;
-    
-    btScalar brd = fabs(a); // brd - border of real roots
-    if (fabs(b) > brd)
-        brd = fabs(b);
-    if (fabs(c) > brd)
-        brd = fabs(c);
-    if (fabs(d) > brd)
-        brd = fabs(d);
-    if (fabs(e) > brd)
-        brd = fabs(e);
-    brd++; // brd - border of real roots
-    
-    btScalar x0, f0; // less than root
-    btScalar x1, f1; // greater than root
-    btScalar x2, f2, f2s; // next values, f(x2), f'(x2)
-    btScalar dx = 0;
-    
-    if (e < 0) {
-        x0 = 0;
-        x1 = brd;
-        f0 = e;
-        f1 = F5(x1);
-        x2 = 0.01 * brd;
-    } // positive root
-    else {
-        x0 = -brd;
-        x1 = 0;
-        f0 = F5(x0);
-        f1 = e;
-        x2 = -0.01 * brd;
-    } // negative root
-    
-    if (fabs(f0) < eps)
-        return x0;
-    if (fabs(f1) < eps)
-        return x1;
-    
-    // now x0<x1, f(x0)<0, f(x1)>0
-    // Firstly 10 bisections
-    for (cnt = 0; cnt < 10; cnt++) {
-        x2 = (x0 + x1) / 2; // next point
-        //x2 = x0 - f0*(x1 - x0) / (f1 - f0);        // next point
-        f2 = F5(x2); // f(x2)
-        if (fabs(f2) < eps)
-            return x2;
-        if (f2 > 0) {
-            x1 = x2;
-            f1 = f2;
-        }
-        else {
-            x0 = x2;
-            f0 = f2;
-        }
-    }
-    
-    // At each step:
-    // x0<x1, f(x0)<0, f(x1)>0.
-    // x2 - next value
-    // we hope that x0 < x2 < x1, but not necessarily
-    do {
-        if (cnt++ > 50)
-            break;
-        if (x2 <= x0 || x2 >= x1)
-            x2 = (x0 + x1) / 2; // now  x0 < x2 < x1
-        f2 = F5(x2); // f(x2)
-        if (fabs(f2) < eps)
-            return x2;
-        if (f2 > 0) {
-            x1 = x2;
-            f1 = f2;
-        }
-        else {
-            x0 = x2;
-            f0 = f2;
-        }
-        f2s = (((5 * x2 + 4 * a) * x2 + 3 * b) * x2 + 2 * c) * x2 + d; // f'(x2)
-        if (fabs(f2s) < eps) {
-            x2 = 1e99;
-            continue;
-        }
-        dx = f2 / f2s;
-        x2 -= dx;
-    } while (fabs(dx) > eps);
-    return x2;
-} // SolveP5_1(btScalar a,btScalar b,btScalar c,btScalar d,btScalar e)    // return real root of x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
+	int cnt;
+	if (fabs(e) < eps)
+		return 0;
+
+	btScalar brd = fabs(a);  // brd - border of real roots
+	if (fabs(b) > brd)
+		brd = fabs(b);
+	if (fabs(c) > brd)
+		brd = fabs(c);
+	if (fabs(d) > brd)
+		brd = fabs(d);
+	if (fabs(e) > brd)
+		brd = fabs(e);
+	brd++;  // brd - border of real roots
+
+	btScalar x0, f0;       // less than root
+	btScalar x1, f1;       // greater than root
+	btScalar x2, f2, f2s;  // next values, f(x2), f'(x2)
+	btScalar dx = 0;
+
+	if (e < 0)
+	{
+		x0 = 0;
+		x1 = brd;
+		f0 = e;
+		f1 = F5(x1);
+		x2 = 0.01 * brd;
+	}  // positive root
+	else
+	{
+		x0 = -brd;
+		x1 = 0;
+		f0 = F5(x0);
+		f1 = e;
+		x2 = -0.01 * brd;
+	}  // negative root
+
+	if (fabs(f0) < eps)
+		return x0;
+	if (fabs(f1) < eps)
+		return x1;
+
+	// now x0<x1, f(x0)<0, f(x1)>0
+	// Firstly 10 bisections
+	for (cnt = 0; cnt < 10; cnt++)
+	{
+		x2 = (x0 + x1) / 2;  // next point
+		//x2 = x0 - f0*(x1 - x0) / (f1 - f0);        // next point
+		f2 = F5(x2);  // f(x2)
+		if (fabs(f2) < eps)
+			return x2;
+		if (f2 > 0)
+		{
+			x1 = x2;
+			f1 = f2;
+		}
+		else
+		{
+			x0 = x2;
+			f0 = f2;
+		}
+	}
+
+	// At each step:
+	// x0<x1, f(x0)<0, f(x1)>0.
+	// x2 - next value
+	// we hope that x0 < x2 < x1, but not necessarily
+	do
+	{
+		if (cnt++ > 50)
+			break;
+		if (x2 <= x0 || x2 >= x1)
+			x2 = (x0 + x1) / 2;  // now  x0 < x2 < x1
+		f2 = F5(x2);             // f(x2)
+		if (fabs(f2) < eps)
+			return x2;
+		if (f2 > 0)
+		{
+			x1 = x2;
+			f1 = f2;
+		}
+		else
+		{
+			x0 = x2;
+			f0 = f2;
+		}
+		f2s = (((5 * x2 + 4 * a) * x2 + 3 * b) * x2 + 2 * c) * x2 + d;  // f'(x2)
+		if (fabs(f2s) < eps)
+		{
+			x2 = 1e99;
+			continue;
+		}
+		dx = f2 / f2s;
+		x2 -= dx;
+	} while (fabs(dx) > eps);
+	return x2;
+}  // SolveP5_1(btScalar a,btScalar b,btScalar c,btScalar d,btScalar e)    // return real root of x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
 //-----------------------------------------------------------------------------
-int SolveP5(btScalar* x, btScalar a, btScalar b, btScalar c, btScalar d, btScalar e) // solve equation x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
+int SolveP5(btScalar* x, btScalar a, btScalar b, btScalar c, btScalar d, btScalar e)  // solve equation x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
 {
-    btScalar r = x[0] = SolveP5_1(a, b, c, d, e);
-    btScalar a1 = a + r, b1 = b + r * a1, c1 = c + r * b1, d1 = d + r * c1;
-    return 1 + SolveP4(x + 1, a1, b1, c1, d1);
-} // SolveP5(btScalar *x,btScalar a,btScalar b,btScalar c,btScalar d,btScalar e)    // solve equation x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
+	btScalar r = x[0] = SolveP5_1(a, b, c, d, e);
+	btScalar a1 = a + r, b1 = b + r * a1, c1 = c + r * b1, d1 = d + r * c1;
+	return 1 + SolveP4(x + 1, a1, b1, c1, d1);
+}  // SolveP5(btScalar *x,btScalar a,btScalar b,btScalar c,btScalar d,btScalar e)    // solve equation x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
 //-----------------------------------------------------------------------------
diff --git a/thirdparty/bullet/BulletSoftBody/poly34.h b/thirdparty/bullet/BulletSoftBody/poly34.h
index 32ad5d7da5..35a52c5fec 100644
--- a/thirdparty/bullet/BulletSoftBody/poly34.h
+++ b/thirdparty/bullet/BulletSoftBody/poly34.h
@@ -8,31 +8,31 @@
 // x - array of size 2
 // return 2: 2 real roots x[0], x[1]
 // return 0: pair of complex roots: x[0]i*x[1]
-int SolveP2(btScalar* x, btScalar a, btScalar b); // solve equation x^2 + a*x + b = 0
+int SolveP2(btScalar* x, btScalar a, btScalar b);  // solve equation x^2 + a*x + b = 0
 
 // x - array of size 3
 // return 3: 3 real roots x[0], x[1], x[2]
 // return 1: 1 real root x[0] and pair of complex roots: x[1]i*x[2]
-int SolveP3(btScalar* x, btScalar a, btScalar b, btScalar c); // solve cubic equation x^3 + a*x^2 + b*x + c = 0
+int SolveP3(btScalar* x, btScalar a, btScalar b, btScalar c);  // solve cubic equation x^3 + a*x^2 + b*x + c = 0
 
 // x - array of size 4
 // return 4: 4 real roots x[0], x[1], x[2], x[3], possible multiple roots
 // return 2: 2 real roots x[0], x[1] and complex x[2]i*x[3],
 // return 0: two pair of complex roots: x[0]i*x[1],  x[2]i*x[3],
-int SolveP4(btScalar* x, btScalar a, btScalar b, btScalar c, btScalar d); // solve equation x^4 + a*x^3 + b*x^2 + c*x + d = 0 by Dekart-Euler method
+int SolveP4(btScalar* x, btScalar a, btScalar b, btScalar c, btScalar d);  // solve equation x^4 + a*x^3 + b*x^2 + c*x + d = 0 by the Descartes-Euler method
 
 // x - array of size 5
 // return 5: 5 real roots x[0], x[1], x[2], x[3], x[4], possible multiple roots
 // return 3: 3 real roots x[0], x[1], x[2] and complex x[3]i*x[4],
 // return 1: 1 real root x[0] and two pair of complex roots: x[1]i*x[2],  x[3]i*x[4],
-int SolveP5(btScalar* x, btScalar a, btScalar b, btScalar c, btScalar d, btScalar e); // solve equation x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
+int SolveP5(btScalar* x, btScalar a, btScalar b, btScalar c, btScalar d, btScalar e);  // solve equation x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
 
 //-----------------------------------------------------------------------------
 // And some additional functions for internal use.
 // Your may remove this definitions from here
-int SolveP4Bi(btScalar* x, btScalar b, btScalar d); // solve equation x^4 + b*x^2 + d = 0
-int SolveP4De(btScalar* x, btScalar b, btScalar c, btScalar d); // solve equation x^4 + b*x^2 + c*x + d = 0
-void CSqrt(btScalar x, btScalar y, btScalar& a, btScalar& b); // returns as a+i*s,  sqrt(x+i*y)
-btScalar N4Step(btScalar x, btScalar a, btScalar b, btScalar c, btScalar d); // one Newton step for x^4 + a*x^3 + b*x^2 + c*x + d
-btScalar SolveP5_1(btScalar a, btScalar b, btScalar c, btScalar d, btScalar e); // return real root of x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
+int SolveP4Bi(btScalar* x, btScalar b, btScalar d);                              // solve equation x^4 + b*x^2 + d = 0
+int SolveP4De(btScalar* x, btScalar b, btScalar c, btScalar d);                  // solve equation x^4 + b*x^2 + c*x + d = 0
+void CSqrt(btScalar x, btScalar y, btScalar& a, btScalar& b);                    // returns a+i*b = sqrt(x+i*y)
+btScalar N4Step(btScalar x, btScalar a, btScalar b, btScalar c, btScalar d);     // one Newton step for x^4 + a*x^3 + b*x^2 + c*x + d
+btScalar SolveP5_1(btScalar a, btScalar b, btScalar c, btScalar d, btScalar e);  // return real root of x^5 + a*x^4 + b*x^3 + c*x^2 + d*x + e = 0
 #endif
diff --git a/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp b/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp
index 39b302b600..be8f8aa6d0 100644
--- a/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp
+++ b/thirdparty/bullet/LinearMath/btAlignedAllocator.cpp
@@ -138,7 +138,7 @@ struct btDebugPtrMagic
 	};
 };
 
-void *btAlignedAllocInternal(size_t size, int alignment, int line, char *filename)
+void *btAlignedAllocInternal(size_t size, int alignment, int line, const char *filename)
 {
 	if (size == 0)
 	{
@@ -195,7 +195,7 @@ void *btAlignedAllocInternal(size_t size, int alignment, int line, char *filenam
 	return (ret);
 }
 
-void btAlignedFreeInternal(void *ptr, int line, char *filename)
+void btAlignedFreeInternal(void *ptr, int line, const char *filename)
 {
 	void *real;
 
diff --git a/thirdparty/bullet/LinearMath/btAlignedAllocator.h b/thirdparty/bullet/LinearMath/btAlignedAllocator.h
index ce4d3585f1..971f62bfb0 100644
--- a/thirdparty/bullet/LinearMath/btAlignedAllocator.h
+++ b/thirdparty/bullet/LinearMath/btAlignedAllocator.h
@@ -35,9 +35,9 @@ int btDumpMemoryLeaks();
 #define btAlignedFree(ptr) \
 	btAlignedFreeInternal(ptr, __LINE__, __FILE__)
 
-void* btAlignedAllocInternal(size_t size, int alignment, int line, char* filename);
+void* btAlignedAllocInternal(size_t size, int alignment, int line, const char* filename);
 
-void btAlignedFreeInternal(void* ptr, int line, char* filename);
+void btAlignedFreeInternal(void* ptr, int line, const char* filename);
 
 #else
 void* btAlignedAllocInternal(size_t size, int alignment);
diff --git a/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp b/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp
index 8bbfdc5f25..12125fd2de 100644
--- a/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp
+++ b/thirdparty/bullet/LinearMath/btConvexHullComputer.cpp
@@ -105,7 +105,7 @@ public:
 
 		Point64 cross(const Point32& b) const
 		{
-			return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x);
+			return Point64(((int64_t)y) * b.z - ((int64_t)z) * b.y, ((int64_t)z) * b.x - ((int64_t)x) * b.z, ((int64_t)x) * b.y - ((int64_t)y) * b.x);
 		}
 
 		Point64 cross(const Point64& b) const
@@ -115,7 +115,7 @@ public:
 
 		int64_t dot(const Point32& b) const
 		{
-			return x * b.x + y * b.y + z * b.z;
+			return ((int64_t)x) * b.x + ((int64_t)y) * b.y + ((int64_t)z) * b.z;
 		}
 
 		int64_t dot(const Point64& b) const
@@ -2673,6 +2673,7 @@ btScalar btConvexHullComputer::compute(const void* coords, bool doubleCoords, in
 	}
 
 	vertices.resize(0);
+	original_vertex_index.resize(0);
 	edges.resize(0);
 	faces.resize(0);
 
@@ -2683,6 +2684,7 @@ btScalar btConvexHullComputer::compute(const void* coords, bool doubleCoords, in
 	{
 		btConvexHullInternal::Vertex* v = oldVertices[copied];
 		vertices.push_back(hull.getCoordinates(v));
+		original_vertex_index.push_back(v->point.index);
 		btConvexHullInternal::Edge* firstEdge = v->edges;
 		if (firstEdge)
 		{
diff --git a/thirdparty/bullet/LinearMath/btConvexHullComputer.h b/thirdparty/bullet/LinearMath/btConvexHullComputer.h
index cba684f2dc..18b26eea9a 100644
--- a/thirdparty/bullet/LinearMath/btConvexHullComputer.h
+++ b/thirdparty/bullet/LinearMath/btConvexHullComputer.h
@@ -66,6 +66,9 @@ public:
 	// Vertices of the output hull
 	btAlignedObjectArray<btVector3> vertices;
 
+	// The original vertex index in the input coords array
+	btAlignedObjectArray<int> original_vertex_index;
+
 	// Edges of the output hull
 	btAlignedObjectArray<Edge> edges;
 
diff --git a/thirdparty/bullet/LinearMath/btReducedVector.h b/thirdparty/bullet/LinearMath/btReducedVector.h
index 83b5e581e5..313a4271f0 100644
--- a/thirdparty/bullet/LinearMath/btReducedVector.h
+++ b/thirdparty/bullet/LinearMath/btReducedVector.h
@@ -267,7 +267,7 @@ public:
         std::sort(tuples.begin(), tuples.end());
         btAlignedObjectArray<int> new_indices;
         btAlignedObjectArray<btVector3> new_vecs;
-        for (int i = 0; i < tuples.size(); ++i)
+        for (size_t i = 0; i < tuples.size(); ++i)
         {
             new_indices.push_back(tuples[i].b);
             new_vecs.push_back(m_vecs[tuples[i].a]);
diff --git a/thirdparty/bullet/LinearMath/btScalar.h b/thirdparty/bullet/LinearMath/btScalar.h
index 86d94e8974..36b90cc944 100644
--- a/thirdparty/bullet/LinearMath/btScalar.h
+++ b/thirdparty/bullet/LinearMath/btScalar.h
@@ -25,7 +25,7 @@ subject to the following restrictions:
 #include <float.h>
 
 /* SVN $Revision$ on $Date$ from http://bullet.googlecode.com*/
-#define BT_BULLET_VERSION 289
+#define BT_BULLET_VERSION 307
 
 inline int btGetVersion()
 {
diff --git a/thirdparty/bullet/LinearMath/btSerializer.h b/thirdparty/bullet/LinearMath/btSerializer.h
index 2ee712047f..9abcf031d0 100644
--- a/thirdparty/bullet/LinearMath/btSerializer.h
+++ b/thirdparty/bullet/LinearMath/btSerializer.h
@@ -479,9 +479,9 @@ public:
 			buffer[8] = 'V';
 		}
 
-		buffer[9] = '2';
-		buffer[10] = '8';
-		buffer[11] = '9';
+		buffer[9] = '3';
+		buffer[10] = '0';
+		buffer[11] = '7';
 	}
 
 	virtual void startSerialization()
diff --git a/thirdparty/mbedtls/include/mbedtls/bignum.h b/thirdparty/mbedtls/include/mbedtls/bignum.h
index 4bb9fa3d43..4d04b336e7 100644
--- a/thirdparty/mbedtls/include/mbedtls/bignum.h
+++ b/thirdparty/mbedtls/include/mbedtls/bignum.h
@@ -88,12 +88,12 @@
  * Maximum window size used for modular exponentiation. Default: 6
  * Minimum value: 1. Maximum value: 6.
  *
- * Result is an array of ( 2 << MBEDTLS_MPI_WINDOW_SIZE ) MPIs used
+ * Result is an array of ( 2 ** MBEDTLS_MPI_WINDOW_SIZE ) MPIs used
  * for the sliding window calculation. (So 64 by default)
  *
  * Reduction in size, reduces speed.
  */
-#define MBEDTLS_MPI_WINDOW_SIZE                           6        /**< Maximum windows size used. */
+#define MBEDTLS_MPI_WINDOW_SIZE                           6        /**< Maximum window size used. */
 #endif /* !MBEDTLS_MPI_WINDOW_SIZE */
 
 #if !defined(MBEDTLS_MPI_MAX_SIZE)
diff --git a/thirdparty/mbedtls/include/mbedtls/ccm.h b/thirdparty/mbedtls/include/mbedtls/ccm.h
index 3dcdc91894..d50c6ec993 100644
--- a/thirdparty/mbedtls/include/mbedtls/ccm.h
+++ b/thirdparty/mbedtls/include/mbedtls/ccm.h
@@ -175,7 +175,7 @@ void mbedtls_ccm_free( mbedtls_ccm_context *ctx );
  *                  than zero, \p output must be a writable buffer of at least
  *                  that length.
  * \param tag       The buffer holding the authentication field. This must be a
- *                  readable buffer of at least \p tag_len Bytes.
+ *                  writable buffer of at least \p tag_len Bytes.
  * \param tag_len   The length of the authentication field to generate in Bytes:
  *                  4, 6, 8, 10, 12, 14 or 16.
  *
@@ -220,7 +220,7 @@ int mbedtls_ccm_encrypt_and_tag( mbedtls_ccm_context *ctx, size_t length,
  *                  than zero, \p output must be a writable buffer of at least
  *                  that length.
  * \param tag       The buffer holding the authentication field. This must be a
- *                  readable buffer of at least \p tag_len Bytes.
+ *                  writable buffer of at least \p tag_len Bytes.
  * \param tag_len   The length of the authentication field to generate in Bytes:
  *                  0, 4, 6, 8, 10, 12, 14 or 16.
  *
diff --git a/thirdparty/mbedtls/include/mbedtls/config.h b/thirdparty/mbedtls/include/mbedtls/config.h
index 217998a5eb..e17bc7e306 100644
--- a/thirdparty/mbedtls/include/mbedtls/config.h
+++ b/thirdparty/mbedtls/include/mbedtls/config.h
@@ -3128,7 +3128,7 @@
  */
 
 /* MPI / BIGNUM options */
-//#define MBEDTLS_MPI_WINDOW_SIZE            6 /**< Maximum windows size used. */
+//#define MBEDTLS_MPI_WINDOW_SIZE            6 /**< Maximum window size used. */
 //#define MBEDTLS_MPI_MAX_SIZE            1024 /**< Maximum number of bytes for usable MPIs. */
 
 /* CTR_DRBG options */
diff --git a/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h b/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h
index 7e5f2e5769..278fbbbb7a 100644
--- a/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h
+++ b/thirdparty/mbedtls/include/mbedtls/ctr_drbg.h
@@ -224,6 +224,11 @@ mbedtls_ctr_drbg_context;
  *                      and prepares it for mbedtls_ctr_drbg_seed()
  *                      or mbedtls_ctr_drbg_free().
  *
+ * \note                The reseed interval is
+ *                      #MBEDTLS_CTR_DRBG_RESEED_INTERVAL by default.
+ *                      You can override it by calling
+ *                      mbedtls_ctr_drbg_set_reseed_interval().
+ *
  * \param ctx           The CTR_DRBG context to initialize.
  */
 void mbedtls_ctr_drbg_init( mbedtls_ctr_drbg_context *ctx );
@@ -305,7 +310,8 @@ int mbedtls_ctr_drbg_seed( mbedtls_ctr_drbg_context *ctx,
                    size_t len );
 
 /**
- * \brief               This function clears CTR_CRBG context data.
+ * \brief               This function resets the CTR_DRBG context to the state
+ *                      immediately after the initial call to mbedtls_ctr_drbg_init().
  *
  * \param ctx           The CTR_DRBG context to clear.
  */
diff --git a/thirdparty/mbedtls/include/mbedtls/gcm.h b/thirdparty/mbedtls/include/mbedtls/gcm.h
index 4e4434ed4d..1201fbd4f1 100644
--- a/thirdparty/mbedtls/include/mbedtls/gcm.h
+++ b/thirdparty/mbedtls/include/mbedtls/gcm.h
@@ -182,7 +182,7 @@ int mbedtls_gcm_setkey( mbedtls_gcm_context *ctx,
  *                  than zero, this must be a writable buffer of at least that
  *                  size in Bytes.
  * \param tag_len   The length of the tag to generate.
- * \param tag       The buffer for holding the tag. This must be a readable
+ * \param tag       The buffer for holding the tag. This must be a writable
  *                  buffer of at least \p tag_len Bytes.
  *
  * \return          \c 0 if the encryption or decryption was performed
@@ -310,7 +310,7 @@ int mbedtls_gcm_update( mbedtls_gcm_context *ctx,
  *                  tag. The tag can have a maximum length of 16 Bytes.
  *
  * \param ctx       The GCM context. This must be initialized.
- * \param tag       The buffer for holding the tag. This must be a readable
+ * \param tag       The buffer for holding the tag. This must be a writable
  *                  buffer of at least \p tag_len Bytes.
  * \param tag_len   The length of the tag to generate. This must be at least
  *                  four.
diff --git a/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h b/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h
index 6883678204..970c033c15 100644
--- a/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h
+++ b/thirdparty/mbedtls/include/mbedtls/hmac_drbg.h
@@ -138,6 +138,10 @@ typedef struct mbedtls_hmac_drbg_context
  * This function makes the context ready for mbedtls_hmac_drbg_seed(),
  * mbedtls_hmac_drbg_seed_buf() or mbedtls_hmac_drbg_free().
  *
+ * \note                The reseed interval is #MBEDTLS_HMAC_DRBG_RESEED_INTERVAL
+ *                      by default. Override this value by calling
+ *                      mbedtls_hmac_drbg_set_reseed_interval().
+ *
  * \param ctx           HMAC_DRBG context to be initialized.
  */
 void mbedtls_hmac_drbg_init( mbedtls_hmac_drbg_context *ctx );
@@ -361,7 +365,8 @@ int mbedtls_hmac_drbg_random_with_add( void *p_rng,
 int mbedtls_hmac_drbg_random( void *p_rng, unsigned char *output, size_t out_len );
 
 /**
- * \brief               Free an HMAC_DRBG context
+ * \brief               This function resets the HMAC_DRBG context to the state
+ *                      immediately after the initial call to mbedtls_hmac_drbg_init().
  *
  * \param ctx           The HMAC_DRBG context to free.
  */
diff --git a/thirdparty/mbedtls/include/mbedtls/sha512.h b/thirdparty/mbedtls/include/mbedtls/sha512.h
index 9ff78ecf41..5e5a15e000 100644
--- a/thirdparty/mbedtls/include/mbedtls/sha512.h
+++ b/thirdparty/mbedtls/include/mbedtls/sha512.h
@@ -152,8 +152,7 @@ int mbedtls_sha512_update_ret( mbedtls_sha512_context *ctx,
 
 /**
  * \brief          This function finishes the SHA-512 operation, and writes
- *                 the result to the output buffer. This function is for
- *                 internal use only.
+ *                 the result to the output buffer.
  *
  * \param ctx      The SHA-512 context. This must be initialized
  *                 and have a hash operation started.
@@ -169,6 +168,7 @@ int mbedtls_sha512_finish_ret( mbedtls_sha512_context *ctx,
 /**
  * \brief          This function processes a single data block within
  *                 the ongoing SHA-512 computation.
+ *                 This function is for internal use only.
  *
  * \param ctx      The SHA-512 context. This must be initialized.
  * \param data     The buffer holding one block of data. This
diff --git a/thirdparty/mbedtls/include/mbedtls/ssl.h b/thirdparty/mbedtls/include/mbedtls/ssl.h
index d3ee3c4e6f..fe33ac8d57 100644
--- a/thirdparty/mbedtls/include/mbedtls/ssl.h
+++ b/thirdparty/mbedtls/include/mbedtls/ssl.h
@@ -1409,7 +1409,7 @@ void mbedtls_ssl_conf_dbg( mbedtls_ssl_config *conf,
  * \note           For DTLS, you need to provide either a non-NULL
  *                 f_recv_timeout callback, or a f_recv that doesn't block.
  *
- * \note           See the documentations of \c mbedtls_ssl_sent_t,
+ * \note           See the documentation of \c mbedtls_ssl_send_t,
  *                 \c mbedtls_ssl_recv_t and \c mbedtls_ssl_recv_timeout_t for
  *                 the conventions those callbacks must follow.
  *
diff --git a/thirdparty/mbedtls/include/mbedtls/version.h b/thirdparty/mbedtls/include/mbedtls/version.h
index d09b45002d..5f0a8f114c 100644
--- a/thirdparty/mbedtls/include/mbedtls/version.h
+++ b/thirdparty/mbedtls/include/mbedtls/version.h
@@ -65,16 +65,16 @@
  */
 #define MBEDTLS_VERSION_MAJOR  2
 #define MBEDTLS_VERSION_MINOR  16
-#define MBEDTLS_VERSION_PATCH  8
+#define MBEDTLS_VERSION_PATCH  9
 
 /**
  * The single version number has the following structure:
  *    MMNNPP00
  *    Major version | Minor version | Patch version
  */
-#define MBEDTLS_VERSION_NUMBER         0x02100800
-#define MBEDTLS_VERSION_STRING         "2.16.8"
-#define MBEDTLS_VERSION_STRING_FULL    "mbed TLS 2.16.8"
+#define MBEDTLS_VERSION_NUMBER         0x02100900
+#define MBEDTLS_VERSION_STRING         "2.16.9"
+#define MBEDTLS_VERSION_STRING_FULL    "mbed TLS 2.16.9"
 
 #if defined(MBEDTLS_VERSION_C)
 
diff --git a/thirdparty/mbedtls/library/aes.c b/thirdparty/mbedtls/library/aes.c
index 9b337505fd..da0e5b6bdc 100644
--- a/thirdparty/mbedtls/library/aes.c
+++ b/thirdparty/mbedtls/library/aes.c
@@ -760,6 +760,7 @@ exit:
 
     return( ret );
 }
+#endif /* !MBEDTLS_AES_SETKEY_DEC_ALT */
 
 #if defined(MBEDTLS_CIPHER_MODE_XTS)
 static int mbedtls_aes_xts_decode_keys( const unsigned char *key,
@@ -838,8 +839,6 @@ int mbedtls_aes_xts_setkey_dec( mbedtls_aes_xts_context *ctx,
 }
 #endif /* MBEDTLS_CIPHER_MODE_XTS */
 
-#endif /* !MBEDTLS_AES_SETKEY_DEC_ALT */
-
 #define AES_FROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3)                     \
     do                                                          \
     {                                                           \
@@ -897,63 +896,56 @@ int mbedtls_internal_aes_encrypt( mbedtls_aes_context *ctx,
                                   unsigned char output[16] )
 {
     int i;
-    uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3;
-
-    RK = ctx->rk;
+    uint32_t *RK = ctx->rk;
+    struct
+    {
+        uint32_t X[4];
+        uint32_t Y[4];
+    } t;
 
-    GET_UINT32_LE( X0, input,  0 ); X0 ^= *RK++;
-    GET_UINT32_LE( X1, input,  4 ); X1 ^= *RK++;
-    GET_UINT32_LE( X2, input,  8 ); X2 ^= *RK++;
-    GET_UINT32_LE( X3, input, 12 ); X3 ^= *RK++;
+    GET_UINT32_LE( t.X[0], input,  0 ); t.X[0] ^= *RK++;
+    GET_UINT32_LE( t.X[1], input,  4 ); t.X[1] ^= *RK++;
+    GET_UINT32_LE( t.X[2], input,  8 ); t.X[2] ^= *RK++;
+    GET_UINT32_LE( t.X[3], input, 12 ); t.X[3] ^= *RK++;
 
     for( i = ( ctx->nr >> 1 ) - 1; i > 0; i-- )
     {
-        AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 );
-        AES_FROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 );
+        AES_FROUND( t.Y[0], t.Y[1], t.Y[2], t.Y[3], t.X[0], t.X[1], t.X[2], t.X[3] );
+        AES_FROUND( t.X[0], t.X[1], t.X[2], t.X[3], t.Y[0], t.Y[1], t.Y[2], t.Y[3] );
     }
 
-    AES_FROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 );
-
-    X0 = *RK++ ^ \
-            ( (uint32_t) FSb[ ( Y0       ) & 0xFF ]       ) ^
-            ( (uint32_t) FSb[ ( Y1 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) FSb[ ( Y2 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) FSb[ ( Y3 >> 24 ) & 0xFF ] << 24 );
-
-    X1 = *RK++ ^ \
-            ( (uint32_t) FSb[ ( Y1       ) & 0xFF ]       ) ^
-            ( (uint32_t) FSb[ ( Y2 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) FSb[ ( Y3 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) FSb[ ( Y0 >> 24 ) & 0xFF ] << 24 );
-
-    X2 = *RK++ ^ \
-            ( (uint32_t) FSb[ ( Y2       ) & 0xFF ]       ) ^
-            ( (uint32_t) FSb[ ( Y3 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) FSb[ ( Y0 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) FSb[ ( Y1 >> 24 ) & 0xFF ] << 24 );
-
-    X3 = *RK++ ^ \
-            ( (uint32_t) FSb[ ( Y3       ) & 0xFF ]       ) ^
-            ( (uint32_t) FSb[ ( Y0 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) FSb[ ( Y1 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) FSb[ ( Y2 >> 24 ) & 0xFF ] << 24 );
-
-    PUT_UINT32_LE( X0, output,  0 );
-    PUT_UINT32_LE( X1, output,  4 );
-    PUT_UINT32_LE( X2, output,  8 );
-    PUT_UINT32_LE( X3, output, 12 );
-
-    mbedtls_platform_zeroize( &X0, sizeof( X0 ) );
-    mbedtls_platform_zeroize( &X1, sizeof( X1 ) );
-    mbedtls_platform_zeroize( &X2, sizeof( X2 ) );
-    mbedtls_platform_zeroize( &X3, sizeof( X3 ) );
-
-    mbedtls_platform_zeroize( &Y0, sizeof( Y0 ) );
-    mbedtls_platform_zeroize( &Y1, sizeof( Y1 ) );
-    mbedtls_platform_zeroize( &Y2, sizeof( Y2 ) );
-    mbedtls_platform_zeroize( &Y3, sizeof( Y3 ) );
-
-    mbedtls_platform_zeroize( &RK, sizeof( RK ) );
+    AES_FROUND( t.Y[0], t.Y[1], t.Y[2], t.Y[3], t.X[0], t.X[1], t.X[2], t.X[3] );
+
+    t.X[0] = *RK++ ^ \
+            ( (uint32_t) FSb[ ( t.Y[0]       ) & 0xFF ]       ) ^
+            ( (uint32_t) FSb[ ( t.Y[1] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) FSb[ ( t.Y[2] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) FSb[ ( t.Y[3] >> 24 ) & 0xFF ] << 24 );
+
+    t.X[1] = *RK++ ^ \
+            ( (uint32_t) FSb[ ( t.Y[1]       ) & 0xFF ]       ) ^
+            ( (uint32_t) FSb[ ( t.Y[2] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) FSb[ ( t.Y[3] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) FSb[ ( t.Y[0] >> 24 ) & 0xFF ] << 24 );
+
+    t.X[2] = *RK++ ^ \
+            ( (uint32_t) FSb[ ( t.Y[2]       ) & 0xFF ]       ) ^
+            ( (uint32_t) FSb[ ( t.Y[3] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) FSb[ ( t.Y[0] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) FSb[ ( t.Y[1] >> 24 ) & 0xFF ] << 24 );
+
+    t.X[3] = *RK++ ^ \
+            ( (uint32_t) FSb[ ( t.Y[3]       ) & 0xFF ]       ) ^
+            ( (uint32_t) FSb[ ( t.Y[0] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) FSb[ ( t.Y[1] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) FSb[ ( t.Y[2] >> 24 ) & 0xFF ] << 24 );
+
+    PUT_UINT32_LE( t.X[0], output,  0 );
+    PUT_UINT32_LE( t.X[1], output,  4 );
+    PUT_UINT32_LE( t.X[2], output,  8 );
+    PUT_UINT32_LE( t.X[3], output, 12 );
+
+    mbedtls_platform_zeroize( &t, sizeof( t ) );
 
     return( 0 );
 }
@@ -977,63 +969,56 @@ int mbedtls_internal_aes_decrypt( mbedtls_aes_context *ctx,
                                   unsigned char output[16] )
 {
     int i;
-    uint32_t *RK, X0, X1, X2, X3, Y0, Y1, Y2, Y3;
-
-    RK = ctx->rk;
+    uint32_t *RK = ctx->rk;
+    struct
+    {
+        uint32_t X[4];
+        uint32_t Y[4];
+    } t;
 
-    GET_UINT32_LE( X0, input,  0 ); X0 ^= *RK++;
-    GET_UINT32_LE( X1, input,  4 ); X1 ^= *RK++;
-    GET_UINT32_LE( X2, input,  8 ); X2 ^= *RK++;
-    GET_UINT32_LE( X3, input, 12 ); X3 ^= *RK++;
+    GET_UINT32_LE( t.X[0], input,  0 ); t.X[0] ^= *RK++;
+    GET_UINT32_LE( t.X[1], input,  4 ); t.X[1] ^= *RK++;
+    GET_UINT32_LE( t.X[2], input,  8 ); t.X[2] ^= *RK++;
+    GET_UINT32_LE( t.X[3], input, 12 ); t.X[3] ^= *RK++;
 
     for( i = ( ctx->nr >> 1 ) - 1; i > 0; i-- )
     {
-        AES_RROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 );
-        AES_RROUND( X0, X1, X2, X3, Y0, Y1, Y2, Y3 );
+        AES_RROUND( t.Y[0], t.Y[1], t.Y[2], t.Y[3], t.X[0], t.X[1], t.X[2], t.X[3] );
+        AES_RROUND( t.X[0], t.X[1], t.X[2], t.X[3], t.Y[0], t.Y[1], t.Y[2], t.Y[3] );
     }
 
-    AES_RROUND( Y0, Y1, Y2, Y3, X0, X1, X2, X3 );
-
-    X0 = *RK++ ^ \
-            ( (uint32_t) RSb[ ( Y0       ) & 0xFF ]       ) ^
-            ( (uint32_t) RSb[ ( Y3 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) RSb[ ( Y2 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) RSb[ ( Y1 >> 24 ) & 0xFF ] << 24 );
-
-    X1 = *RK++ ^ \
-            ( (uint32_t) RSb[ ( Y1       ) & 0xFF ]       ) ^
-            ( (uint32_t) RSb[ ( Y0 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) RSb[ ( Y3 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) RSb[ ( Y2 >> 24 ) & 0xFF ] << 24 );
-
-    X2 = *RK++ ^ \
-            ( (uint32_t) RSb[ ( Y2       ) & 0xFF ]       ) ^
-            ( (uint32_t) RSb[ ( Y1 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) RSb[ ( Y0 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) RSb[ ( Y3 >> 24 ) & 0xFF ] << 24 );
-
-    X3 = *RK++ ^ \
-            ( (uint32_t) RSb[ ( Y3       ) & 0xFF ]       ) ^
-            ( (uint32_t) RSb[ ( Y2 >>  8 ) & 0xFF ] <<  8 ) ^
-            ( (uint32_t) RSb[ ( Y1 >> 16 ) & 0xFF ] << 16 ) ^
-            ( (uint32_t) RSb[ ( Y0 >> 24 ) & 0xFF ] << 24 );
-
-    PUT_UINT32_LE( X0, output,  0 );
-    PUT_UINT32_LE( X1, output,  4 );
-    PUT_UINT32_LE( X2, output,  8 );
-    PUT_UINT32_LE( X3, output, 12 );
-
-    mbedtls_platform_zeroize( &X0, sizeof( X0 ) );
-    mbedtls_platform_zeroize( &X1, sizeof( X1 ) );
-    mbedtls_platform_zeroize( &X2, sizeof( X2 ) );
-    mbedtls_platform_zeroize( &X3, sizeof( X3 ) );
-
-    mbedtls_platform_zeroize( &Y0, sizeof( Y0 ) );
-    mbedtls_platform_zeroize( &Y1, sizeof( Y1 ) );
-    mbedtls_platform_zeroize( &Y2, sizeof( Y2 ) );
-    mbedtls_platform_zeroize( &Y3, sizeof( Y3 ) );
-
-    mbedtls_platform_zeroize( &RK, sizeof( RK ) );
+    AES_RROUND( t.Y[0], t.Y[1], t.Y[2], t.Y[3], t.X[0], t.X[1], t.X[2], t.X[3] );
+
+    t.X[0] = *RK++ ^ \
+            ( (uint32_t) RSb[ ( t.Y[0]       ) & 0xFF ]       ) ^
+            ( (uint32_t) RSb[ ( t.Y[3] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) RSb[ ( t.Y[2] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) RSb[ ( t.Y[1] >> 24 ) & 0xFF ] << 24 );
+
+    t.X[1] = *RK++ ^ \
+            ( (uint32_t) RSb[ ( t.Y[1]       ) & 0xFF ]       ) ^
+            ( (uint32_t) RSb[ ( t.Y[0] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) RSb[ ( t.Y[3] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) RSb[ ( t.Y[2] >> 24 ) & 0xFF ] << 24 );
+
+    t.X[2] = *RK++ ^ \
+            ( (uint32_t) RSb[ ( t.Y[2]       ) & 0xFF ]       ) ^
+            ( (uint32_t) RSb[ ( t.Y[1] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) RSb[ ( t.Y[0] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) RSb[ ( t.Y[3] >> 24 ) & 0xFF ] << 24 );
+
+    t.X[3] = *RK++ ^ \
+            ( (uint32_t) RSb[ ( t.Y[3]       ) & 0xFF ]       ) ^
+            ( (uint32_t) RSb[ ( t.Y[2] >>  8 ) & 0xFF ] <<  8 ) ^
+            ( (uint32_t) RSb[ ( t.Y[1] >> 16 ) & 0xFF ] << 16 ) ^
+            ( (uint32_t) RSb[ ( t.Y[0] >> 24 ) & 0xFF ] << 24 );
+
+    PUT_UINT32_LE( t.X[0], output,  0 );
+    PUT_UINT32_LE( t.X[1], output,  4 );
+    PUT_UINT32_LE( t.X[2], output,  8 );
+    PUT_UINT32_LE( t.X[3], output, 12 );
+
+    mbedtls_platform_zeroize( &t, sizeof( t ) );
 
     return( 0 );
 }
diff --git a/thirdparty/mbedtls/library/bignum.c b/thirdparty/mbedtls/library/bignum.c
index dfe976d648..2feb727d89 100644
--- a/thirdparty/mbedtls/library/bignum.c
+++ b/thirdparty/mbedtls/library/bignum.c
@@ -1364,7 +1364,10 @@ int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi
         /* If we ran out of space for the carry, it means that the result
          * is negative. */
         if( n == X->n )
-            return( MBEDTLS_ERR_MPI_NEGATIVE_VALUE );
+        {
+            ret = MBEDTLS_ERR_MPI_NEGATIVE_VALUE;
+            goto cleanup;
+        }
         --X->p[n];
     }
 
@@ -2044,7 +2047,7 @@ int mbedtls_mpi_exp_mod( mbedtls_mpi *X, const mbedtls_mpi *A,
     size_t i, j, nblimbs;
     size_t bufsize, nbits;
     mbedtls_mpi_uint ei, mm, state;
-    mbedtls_mpi RR, T, W[ 2 << MBEDTLS_MPI_WINDOW_SIZE ], Apos;
+    mbedtls_mpi RR, T, W[ 1 << MBEDTLS_MPI_WINDOW_SIZE ], Apos;
     int neg;
 
     MPI_VALIDATE_RET( X != NULL );
@@ -2058,6 +2061,10 @@ int mbedtls_mpi_exp_mod( mbedtls_mpi *X, const mbedtls_mpi *A,
     if( mbedtls_mpi_cmp_int( E, 0 ) < 0 )
         return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
 
+    if( mbedtls_mpi_bitlen( E ) > MBEDTLS_MPI_MAX_BITS ||
+        mbedtls_mpi_bitlen( N ) > MBEDTLS_MPI_MAX_BITS )
+        return ( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+
     /*
      * Init temps and window size
      */
@@ -2334,7 +2341,7 @@ int mbedtls_mpi_fill_random( mbedtls_mpi *X, size_t size,
     MBEDTLS_MPI_CHK( mbedtls_mpi_lset( X, 0 ) );
 
     Xp = (unsigned char*) X->p;
-    f_rng( p_rng, Xp + overhead, size );
+    MBEDTLS_MPI_CHK( f_rng( p_rng, Xp + overhead, size ) );
 
     mpi_bigendian_to_host( X->p, limbs );
 
diff --git a/thirdparty/mbedtls/library/cipher_wrap.c b/thirdparty/mbedtls/library/cipher_wrap.c
index 1dcac21be1..5973ca6ba2 100644
--- a/thirdparty/mbedtls/library/cipher_wrap.c
+++ b/thirdparty/mbedtls/library/cipher_wrap.c
@@ -779,7 +779,7 @@ static const mbedtls_cipher_info_t camellia_128_ecb_info = {
     MBEDTLS_MODE_ECB,
     128,
     "CAMELLIA-128-ECB",
-    16,
+    0,
     0,
     16,
     &camellia_info
@@ -790,7 +790,7 @@ static const mbedtls_cipher_info_t camellia_192_ecb_info = {
     MBEDTLS_MODE_ECB,
     192,
     "CAMELLIA-192-ECB",
-    16,
+    0,
     0,
     16,
     &camellia_info
@@ -801,7 +801,7 @@ static const mbedtls_cipher_info_t camellia_256_ecb_info = {
     MBEDTLS_MODE_ECB,
     256,
     "CAMELLIA-256-ECB",
-    16,
+    0,
     0,
     16,
     &camellia_info
@@ -1155,7 +1155,7 @@ static const mbedtls_cipher_info_t aria_128_ecb_info = {
     MBEDTLS_MODE_ECB,
     128,
     "ARIA-128-ECB",
-    16,
+    0,
     0,
     16,
     &aria_info
@@ -1166,7 +1166,7 @@ static const mbedtls_cipher_info_t aria_192_ecb_info = {
     MBEDTLS_MODE_ECB,
     192,
     "ARIA-192-ECB",
-    16,
+    0,
     0,
     16,
     &aria_info
@@ -1177,7 +1177,7 @@ static const mbedtls_cipher_info_t aria_256_ecb_info = {
     MBEDTLS_MODE_ECB,
     256,
     "ARIA-256-ECB",
-    16,
+    0,
     0,
     16,
     &aria_info
@@ -1579,7 +1579,7 @@ static const mbedtls_cipher_info_t des_ecb_info = {
     MBEDTLS_MODE_ECB,
     MBEDTLS_KEY_LENGTH_DES,
     "DES-ECB",
-    8,
+    0,
     0,
     8,
     &des_info
@@ -1630,7 +1630,7 @@ static const mbedtls_cipher_info_t des_ede_ecb_info = {
     MBEDTLS_MODE_ECB,
     MBEDTLS_KEY_LENGTH_DES_EDE,
     "DES-EDE-ECB",
-    8,
+    0,
     0,
     8,
     &des_ede_info
@@ -1681,7 +1681,7 @@ static const mbedtls_cipher_info_t des_ede3_ecb_info = {
     MBEDTLS_MODE_ECB,
     MBEDTLS_KEY_LENGTH_DES_EDE3,
     "DES-EDE3-ECB",
-    8,
+    0,
     0,
     8,
     &des_ede3_info
@@ -1796,7 +1796,7 @@ static const mbedtls_cipher_info_t blowfish_ecb_info = {
     MBEDTLS_MODE_ECB,
     128,
     "BLOWFISH-ECB",
-    8,
+    0,
     MBEDTLS_CIPHER_VARIABLE_KEY_LEN,
     8,
     &blowfish_info
diff --git a/thirdparty/mbedtls/library/cmac.c b/thirdparty/mbedtls/library/cmac.c
index 1a1200b52b..409f67958e 100644
--- a/thirdparty/mbedtls/library/cmac.c
+++ b/thirdparty/mbedtls/library/cmac.c
@@ -450,7 +450,7 @@ exit:
  */
 int mbedtls_aes_cmac_prf_128( const unsigned char *key, size_t key_length,
                               const unsigned char *input, size_t in_len,
-                              unsigned char *output )
+                              unsigned char output[16] )
 {
     int ret;
     const mbedtls_cipher_info_t *cipher_info;
diff --git a/thirdparty/mbedtls/library/ctr_drbg.c b/thirdparty/mbedtls/library/ctr_drbg.c
index b98df29a9b..e92008bbe8 100644
--- a/thirdparty/mbedtls/library/ctr_drbg.c
+++ b/thirdparty/mbedtls/library/ctr_drbg.c
@@ -82,11 +82,17 @@ void mbedtls_ctr_drbg_init( mbedtls_ctr_drbg_context *ctx )
 {
     memset( ctx, 0, sizeof( mbedtls_ctr_drbg_context ) );
 
+    ctx->reseed_interval = MBEDTLS_CTR_DRBG_RESEED_INTERVAL;
+
 #if defined(MBEDTLS_THREADING_C)
     mbedtls_mutex_init( &ctx->mutex );
 #endif
 }
 
+/*
+ *  This function resets the CTR_DRBG context to the state immediately
+ *  after the initial call to mbedtls_ctr_drbg_init().
+ */
 void mbedtls_ctr_drbg_free( mbedtls_ctr_drbg_context *ctx )
 {
     if( ctx == NULL )
@@ -97,6 +103,10 @@ void mbedtls_ctr_drbg_free( mbedtls_ctr_drbg_context *ctx )
 #endif
     mbedtls_aes_free( &ctx->aes_ctx );
     mbedtls_platform_zeroize( ctx, sizeof( mbedtls_ctr_drbg_context ) );
+    ctx->reseed_interval = MBEDTLS_CTR_DRBG_RESEED_INTERVAL;
+#if defined(MBEDTLS_THREADING_C)
+    mbedtls_mutex_init( &ctx->mutex );
+#endif
 }
 
 void mbedtls_ctr_drbg_set_prediction_resistance( mbedtls_ctr_drbg_context *ctx, int resistance )
@@ -419,7 +429,6 @@ int mbedtls_ctr_drbg_seed( mbedtls_ctr_drbg_context *ctx,
 
     if( ctx->entropy_len == 0 )
         ctx->entropy_len = MBEDTLS_CTR_DRBG_ENTROPY_LEN;
-    ctx->reseed_interval = MBEDTLS_CTR_DRBG_RESEED_INTERVAL;
 
     /*
      * Initialize with an empty key
diff --git a/thirdparty/mbedtls/library/ecp_curves.c b/thirdparty/mbedtls/library/ecp_curves.c
index cc4c5b71c0..b04596b561 100644
--- a/thirdparty/mbedtls/library/ecp_curves.c
+++ b/thirdparty/mbedtls/library/ecp_curves.c
@@ -1044,7 +1044,7 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
     STORE32; i++;                               \
     cur = c > 0 ? c : 0; STORE32;               \
     cur = 0; while( ++i < MAX32 ) { STORE32; }  \
-    if( c < 0 ) fix_negative( N, c, &C, bits );
+    if( c < 0 ) MBEDTLS_MPI_CHK( fix_negative( N, c, &C, bits ) );
 
 /*
  * If the result is negative, we get it in the form
diff --git a/thirdparty/mbedtls/library/entropy_poll.c b/thirdparty/mbedtls/library/entropy_poll.c
index 26b7e4e2b9..2095a7dd34 100644
--- a/thirdparty/mbedtls/library/entropy_poll.c
+++ b/thirdparty/mbedtls/library/entropy_poll.c
@@ -44,7 +44,7 @@
  *  **********
  */
 
-#if defined(__linux__)
+#if defined(__linux__) && !defined(_GNU_SOURCE)
 /* Ensure that syscall() is available even when compiling with -std=c99 */
 #define _GNU_SOURCE
 #endif
diff --git a/thirdparty/mbedtls/library/error.c b/thirdparty/mbedtls/library/error.c
index eb52052b51..b83b8d1f1b 100644
--- a/thirdparty/mbedtls/library/error.c
+++ b/thirdparty/mbedtls/library/error.c
@@ -51,20 +51,19 @@
 #endif
 
 #if defined(MBEDTLS_ERROR_C) || defined(MBEDTLS_ERROR_STRERROR_DUMMY)
+
 #include "mbedtls/error.h"
-#include <string.h>
-#endif
+
+#if defined(MBEDTLS_ERROR_C)
 
 #if defined(MBEDTLS_PLATFORM_C)
 #include "mbedtls/platform.h"
 #else
 #define mbedtls_snprintf snprintf
-#define mbedtls_time_t   time_t
 #endif
 
-#if defined(MBEDTLS_ERROR_C)
-
 #include <stdio.h>
+#include <string.h>
 
 #if defined(MBEDTLS_AES_C)
 #include "mbedtls/aes.h"
@@ -929,8 +928,6 @@ void mbedtls_strerror( int ret, char *buf, size_t buflen )
 
 #else /* MBEDTLS_ERROR_C */
 
-#if defined(MBEDTLS_ERROR_STRERROR_DUMMY)
-
 /*
  * Provide an non-function in case MBEDTLS_ERROR_C is not defined
  */
@@ -942,6 +939,6 @@ void mbedtls_strerror( int ret, char *buf, size_t buflen )
         buf[0] = '\0';
 }
 
-#endif /* MBEDTLS_ERROR_STRERROR_DUMMY */
-
 #endif /* MBEDTLS_ERROR_C */
+
+#endif /* MBEDTLS_ERROR_C || MBEDTLS_ERROR_STRERROR_DUMMY */
diff --git a/thirdparty/mbedtls/library/hmac_drbg.c b/thirdparty/mbedtls/library/hmac_drbg.c
index 9fbfc30660..10cbd462ba 100644
--- a/thirdparty/mbedtls/library/hmac_drbg.c
+++ b/thirdparty/mbedtls/library/hmac_drbg.c
@@ -83,6 +83,8 @@ void mbedtls_hmac_drbg_init( mbedtls_hmac_drbg_context *ctx )
 {
     memset( ctx, 0, sizeof( mbedtls_hmac_drbg_context ) );
 
+    ctx->reseed_interval = MBEDTLS_HMAC_DRBG_RESEED_INTERVAL;
+
 #if defined(MBEDTLS_THREADING_C)
     mbedtls_mutex_init( &ctx->mutex );
 #endif
@@ -296,8 +298,6 @@ int mbedtls_hmac_drbg_seed( mbedtls_hmac_drbg_context *ctx,
     ctx->f_entropy = f_entropy;
     ctx->p_entropy = p_entropy;
 
-    ctx->reseed_interval = MBEDTLS_HMAC_DRBG_RESEED_INTERVAL;
-
     if( ctx->entropy_len == 0 )
     {
         /*
@@ -442,7 +442,8 @@ int mbedtls_hmac_drbg_random( void *p_rng, unsigned char *output, size_t out_len
 }
 
 /*
- * Free an HMAC_DRBG context
+ *  This function resets the HMAC_DRBG context to the state immediately
+ *  after the initial call to mbedtls_hmac_drbg_init().
  */
 void mbedtls_hmac_drbg_free( mbedtls_hmac_drbg_context *ctx )
 {
@@ -454,6 +455,10 @@ void mbedtls_hmac_drbg_free( mbedtls_hmac_drbg_context *ctx )
 #endif
     mbedtls_md_free( &ctx->md_ctx );
     mbedtls_platform_zeroize( ctx, sizeof( mbedtls_hmac_drbg_context ) );
+    ctx->reseed_interval = MBEDTLS_HMAC_DRBG_RESEED_INTERVAL;
+#if defined(MBEDTLS_THREADING_C)
+    mbedtls_mutex_init( &ctx->mutex );
+#endif
 }
 
 #if defined(MBEDTLS_FS_IO)
diff --git a/thirdparty/mbedtls/library/md2.c b/thirdparty/mbedtls/library/md2.c
index cbdaaabdc7..fdcb630a1f 100644
--- a/thirdparty/mbedtls/library/md2.c
+++ b/thirdparty/mbedtls/library/md2.c
@@ -177,6 +177,9 @@ int mbedtls_internal_md2_process( mbedtls_md2_context *ctx )
         t  = ctx->cksum[i];
     }
 
+    /* Zeroise variables to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( &t, sizeof( t ) );
+
     return( 0 );
 }
 
diff --git a/thirdparty/mbedtls/library/md4.c b/thirdparty/mbedtls/library/md4.c
index cb16dce54a..95e893e654 100644
--- a/thirdparty/mbedtls/library/md4.c
+++ b/thirdparty/mbedtls/library/md4.c
@@ -143,31 +143,34 @@ void mbedtls_md4_starts( mbedtls_md4_context *ctx )
 int mbedtls_internal_md4_process( mbedtls_md4_context *ctx,
                                   const unsigned char data[64] )
 {
-    uint32_t X[16], A, B, C, D;
-
-    GET_UINT32_LE( X[ 0], data,  0 );
-    GET_UINT32_LE( X[ 1], data,  4 );
-    GET_UINT32_LE( X[ 2], data,  8 );
-    GET_UINT32_LE( X[ 3], data, 12 );
-    GET_UINT32_LE( X[ 4], data, 16 );
-    GET_UINT32_LE( X[ 5], data, 20 );
-    GET_UINT32_LE( X[ 6], data, 24 );
-    GET_UINT32_LE( X[ 7], data, 28 );
-    GET_UINT32_LE( X[ 8], data, 32 );
-    GET_UINT32_LE( X[ 9], data, 36 );
-    GET_UINT32_LE( X[10], data, 40 );
-    GET_UINT32_LE( X[11], data, 44 );
-    GET_UINT32_LE( X[12], data, 48 );
-    GET_UINT32_LE( X[13], data, 52 );
-    GET_UINT32_LE( X[14], data, 56 );
-    GET_UINT32_LE( X[15], data, 60 );
+    struct
+    {
+        uint32_t X[16], A, B, C, D;
+    } local;
+
+    GET_UINT32_LE( local.X[ 0], data,  0 );
+    GET_UINT32_LE( local.X[ 1], data,  4 );
+    GET_UINT32_LE( local.X[ 2], data,  8 );
+    GET_UINT32_LE( local.X[ 3], data, 12 );
+    GET_UINT32_LE( local.X[ 4], data, 16 );
+    GET_UINT32_LE( local.X[ 5], data, 20 );
+    GET_UINT32_LE( local.X[ 6], data, 24 );
+    GET_UINT32_LE( local.X[ 7], data, 28 );
+    GET_UINT32_LE( local.X[ 8], data, 32 );
+    GET_UINT32_LE( local.X[ 9], data, 36 );
+    GET_UINT32_LE( local.X[10], data, 40 );
+    GET_UINT32_LE( local.X[11], data, 44 );
+    GET_UINT32_LE( local.X[12], data, 48 );
+    GET_UINT32_LE( local.X[13], data, 52 );
+    GET_UINT32_LE( local.X[14], data, 56 );
+    GET_UINT32_LE( local.X[15], data, 60 );
 
 #define S(x,n) (((x) << (n)) | (((x) & 0xFFFFFFFF) >> (32 - (n))))
 
-    A = ctx->state[0];
-    B = ctx->state[1];
-    C = ctx->state[2];
-    D = ctx->state[3];
+    local.A = ctx->state[0];
+    local.B = ctx->state[1];
+    local.C = ctx->state[2];
+    local.D = ctx->state[3];
 
 #define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
 #define P(a,b,c,d,x,s)                           \
@@ -178,22 +181,22 @@ int mbedtls_internal_md4_process( mbedtls_md4_context *ctx,
     } while( 0 )
 
 
-    P( A, B, C, D, X[ 0],  3 );
-    P( D, A, B, C, X[ 1],  7 );
-    P( C, D, A, B, X[ 2], 11 );
-    P( B, C, D, A, X[ 3], 19 );
-    P( A, B, C, D, X[ 4],  3 );
-    P( D, A, B, C, X[ 5],  7 );
-    P( C, D, A, B, X[ 6], 11 );
-    P( B, C, D, A, X[ 7], 19 );
-    P( A, B, C, D, X[ 8],  3 );
-    P( D, A, B, C, X[ 9],  7 );
-    P( C, D, A, B, X[10], 11 );
-    P( B, C, D, A, X[11], 19 );
-    P( A, B, C, D, X[12],  3 );
-    P( D, A, B, C, X[13],  7 );
-    P( C, D, A, B, X[14], 11 );
-    P( B, C, D, A, X[15], 19 );
+    P( local.A, local.B, local.C, local.D, local.X[ 0],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 1],  7 );
+    P( local.C, local.D, local.A, local.B, local.X[ 2], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[ 3], 19 );
+    P( local.A, local.B, local.C, local.D, local.X[ 4],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 5],  7 );
+    P( local.C, local.D, local.A, local.B, local.X[ 6], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[ 7], 19 );
+    P( local.A, local.B, local.C, local.D, local.X[ 8],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 9],  7 );
+    P( local.C, local.D, local.A, local.B, local.X[10], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[11], 19 );
+    P( local.A, local.B, local.C, local.D, local.X[12],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[13],  7 );
+    P( local.C, local.D, local.A, local.B, local.X[14], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[15], 19 );
 
 #undef P
 #undef F
@@ -206,22 +209,22 @@ int mbedtls_internal_md4_process( mbedtls_md4_context *ctx,
         (a) = S((a),(s));                               \
     } while( 0 )
 
-    P( A, B, C, D, X[ 0],  3 );
-    P( D, A, B, C, X[ 4],  5 );
-    P( C, D, A, B, X[ 8],  9 );
-    P( B, C, D, A, X[12], 13 );
-    P( A, B, C, D, X[ 1],  3 );
-    P( D, A, B, C, X[ 5],  5 );
-    P( C, D, A, B, X[ 9],  9 );
-    P( B, C, D, A, X[13], 13 );
-    P( A, B, C, D, X[ 2],  3 );
-    P( D, A, B, C, X[ 6],  5 );
-    P( C, D, A, B, X[10],  9 );
-    P( B, C, D, A, X[14], 13 );
-    P( A, B, C, D, X[ 3],  3 );
-    P( D, A, B, C, X[ 7],  5 );
-    P( C, D, A, B, X[11],  9 );
-    P( B, C, D, A, X[15], 13 );
+    P( local.A, local.B, local.C, local.D, local.X[ 0],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 4],  5 );
+    P( local.C, local.D, local.A, local.B, local.X[ 8],  9 );
+    P( local.B, local.C, local.D, local.A, local.X[12], 13 );
+    P( local.A, local.B, local.C, local.D, local.X[ 1],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 5],  5 );
+    P( local.C, local.D, local.A, local.B, local.X[ 9],  9 );
+    P( local.B, local.C, local.D, local.A, local.X[13], 13 );
+    P( local.A, local.B, local.C, local.D, local.X[ 2],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 6],  5 );
+    P( local.C, local.D, local.A, local.B, local.X[10],  9 );
+    P( local.B, local.C, local.D, local.A, local.X[14], 13 );
+    P( local.A, local.B, local.C, local.D, local.X[ 3],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 7],  5 );
+    P( local.C, local.D, local.A, local.B, local.X[11],  9 );
+    P( local.B, local.C, local.D, local.A, local.X[15], 13 );
 
 #undef P
 #undef F
@@ -234,30 +237,33 @@ int mbedtls_internal_md4_process( mbedtls_md4_context *ctx,
         (a) = S((a),(s));                               \
     } while( 0 )
 
-    P( A, B, C, D, X[ 0],  3 );
-    P( D, A, B, C, X[ 8],  9 );
-    P( C, D, A, B, X[ 4], 11 );
-    P( B, C, D, A, X[12], 15 );
-    P( A, B, C, D, X[ 2],  3 );
-    P( D, A, B, C, X[10],  9 );
-    P( C, D, A, B, X[ 6], 11 );
-    P( B, C, D, A, X[14], 15 );
-    P( A, B, C, D, X[ 1],  3 );
-    P( D, A, B, C, X[ 9],  9 );
-    P( C, D, A, B, X[ 5], 11 );
-    P( B, C, D, A, X[13], 15 );
-    P( A, B, C, D, X[ 3],  3 );
-    P( D, A, B, C, X[11],  9 );
-    P( C, D, A, B, X[ 7], 11 );
-    P( B, C, D, A, X[15], 15 );
+    P( local.A, local.B, local.C, local.D, local.X[ 0],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 8],  9 );
+    P( local.C, local.D, local.A, local.B, local.X[ 4], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[12], 15 );
+    P( local.A, local.B, local.C, local.D, local.X[ 2],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[10],  9 );
+    P( local.C, local.D, local.A, local.B, local.X[ 6], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[14], 15 );
+    P( local.A, local.B, local.C, local.D, local.X[ 1],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[ 9],  9 );
+    P( local.C, local.D, local.A, local.B, local.X[ 5], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[13], 15 );
+    P( local.A, local.B, local.C, local.D, local.X[ 3],  3 );
+    P( local.D, local.A, local.B, local.C, local.X[11],  9 );
+    P( local.C, local.D, local.A, local.B, local.X[ 7], 11 );
+    P( local.B, local.C, local.D, local.A, local.X[15], 15 );
 
 #undef F
 #undef P
 
-    ctx->state[0] += A;
-    ctx->state[1] += B;
-    ctx->state[2] += C;
-    ctx->state[3] += D;
+    ctx->state[0] += local.A;
+    ctx->state[1] += local.B;
+    ctx->state[2] += local.C;
+    ctx->state[3] += local.D;
+
+    /* Zeroise variables to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( &local, sizeof( local ) );
 
     return( 0 );
 }
diff --git a/thirdparty/mbedtls/library/md5.c b/thirdparty/mbedtls/library/md5.c
index fe25925214..d2b634fbb1 100644
--- a/thirdparty/mbedtls/library/md5.c
+++ b/thirdparty/mbedtls/library/md5.c
@@ -142,128 +142,134 @@ void mbedtls_md5_starts( mbedtls_md5_context *ctx )
 int mbedtls_internal_md5_process( mbedtls_md5_context *ctx,
                                   const unsigned char data[64] )
 {
-    uint32_t X[16], A, B, C, D;
-
-    GET_UINT32_LE( X[ 0], data,  0 );
-    GET_UINT32_LE( X[ 1], data,  4 );
-    GET_UINT32_LE( X[ 2], data,  8 );
-    GET_UINT32_LE( X[ 3], data, 12 );
-    GET_UINT32_LE( X[ 4], data, 16 );
-    GET_UINT32_LE( X[ 5], data, 20 );
-    GET_UINT32_LE( X[ 6], data, 24 );
-    GET_UINT32_LE( X[ 7], data, 28 );
-    GET_UINT32_LE( X[ 8], data, 32 );
-    GET_UINT32_LE( X[ 9], data, 36 );
-    GET_UINT32_LE( X[10], data, 40 );
-    GET_UINT32_LE( X[11], data, 44 );
-    GET_UINT32_LE( X[12], data, 48 );
-    GET_UINT32_LE( X[13], data, 52 );
-    GET_UINT32_LE( X[14], data, 56 );
-    GET_UINT32_LE( X[15], data, 60 );
+    struct
+    {
+        uint32_t X[16], A, B, C, D;
+    } local;
+
+    GET_UINT32_LE( local.X[ 0], data,  0 );
+    GET_UINT32_LE( local.X[ 1], data,  4 );
+    GET_UINT32_LE( local.X[ 2], data,  8 );
+    GET_UINT32_LE( local.X[ 3], data, 12 );
+    GET_UINT32_LE( local.X[ 4], data, 16 );
+    GET_UINT32_LE( local.X[ 5], data, 20 );
+    GET_UINT32_LE( local.X[ 6], data, 24 );
+    GET_UINT32_LE( local.X[ 7], data, 28 );
+    GET_UINT32_LE( local.X[ 8], data, 32 );
+    GET_UINT32_LE( local.X[ 9], data, 36 );
+    GET_UINT32_LE( local.X[10], data, 40 );
+    GET_UINT32_LE( local.X[11], data, 44 );
+    GET_UINT32_LE( local.X[12], data, 48 );
+    GET_UINT32_LE( local.X[13], data, 52 );
+    GET_UINT32_LE( local.X[14], data, 56 );
+    GET_UINT32_LE( local.X[15], data, 60 );
 
 #define S(x,n)                                                          \
     ( ( (x) << (n) ) | ( ( (x) & 0xFFFFFFFF) >> ( 32 - (n) ) ) )
 
-#define P(a,b,c,d,k,s,t)                                        \
-    do                                                          \
-    {                                                           \
-        (a) += F((b),(c),(d)) + X[(k)] + (t);                   \
-        (a) = S((a),(s)) + (b);                                 \
+#define P(a,b,c,d,k,s,t)                                                \
+    do                                                                  \
+    {                                                                   \
+        (a) += F((b),(c),(d)) + local.X[(k)] + (t);                     \
+        (a) = S((a),(s)) + (b);                                         \
     } while( 0 )
 
-    A = ctx->state[0];
-    B = ctx->state[1];
-    C = ctx->state[2];
-    D = ctx->state[3];
+    local.A = ctx->state[0];
+    local.B = ctx->state[1];
+    local.C = ctx->state[2];
+    local.D = ctx->state[3];
 
 #define F(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 
-    P( A, B, C, D,  0,  7, 0xD76AA478 );
-    P( D, A, B, C,  1, 12, 0xE8C7B756 );
-    P( C, D, A, B,  2, 17, 0x242070DB );
-    P( B, C, D, A,  3, 22, 0xC1BDCEEE );
-    P( A, B, C, D,  4,  7, 0xF57C0FAF );
-    P( D, A, B, C,  5, 12, 0x4787C62A );
-    P( C, D, A, B,  6, 17, 0xA8304613 );
-    P( B, C, D, A,  7, 22, 0xFD469501 );
-    P( A, B, C, D,  8,  7, 0x698098D8 );
-    P( D, A, B, C,  9, 12, 0x8B44F7AF );
-    P( C, D, A, B, 10, 17, 0xFFFF5BB1 );
-    P( B, C, D, A, 11, 22, 0x895CD7BE );
-    P( A, B, C, D, 12,  7, 0x6B901122 );
-    P( D, A, B, C, 13, 12, 0xFD987193 );
-    P( C, D, A, B, 14, 17, 0xA679438E );
-    P( B, C, D, A, 15, 22, 0x49B40821 );
+    P( local.A, local.B, local.C, local.D,  0,  7, 0xD76AA478 );
+    P( local.D, local.A, local.B, local.C,  1, 12, 0xE8C7B756 );
+    P( local.C, local.D, local.A, local.B,  2, 17, 0x242070DB );
+    P( local.B, local.C, local.D, local.A,  3, 22, 0xC1BDCEEE );
+    P( local.A, local.B, local.C, local.D,  4,  7, 0xF57C0FAF );
+    P( local.D, local.A, local.B, local.C,  5, 12, 0x4787C62A );
+    P( local.C, local.D, local.A, local.B,  6, 17, 0xA8304613 );
+    P( local.B, local.C, local.D, local.A,  7, 22, 0xFD469501 );
+    P( local.A, local.B, local.C, local.D,  8,  7, 0x698098D8 );
+    P( local.D, local.A, local.B, local.C,  9, 12, 0x8B44F7AF );
+    P( local.C, local.D, local.A, local.B, 10, 17, 0xFFFF5BB1 );
+    P( local.B, local.C, local.D, local.A, 11, 22, 0x895CD7BE );
+    P( local.A, local.B, local.C, local.D, 12,  7, 0x6B901122 );
+    P( local.D, local.A, local.B, local.C, 13, 12, 0xFD987193 );
+    P( local.C, local.D, local.A, local.B, 14, 17, 0xA679438E );
+    P( local.B, local.C, local.D, local.A, 15, 22, 0x49B40821 );
 
 #undef F
 
 #define F(x,y,z) ((y) ^ ((z) & ((x) ^ (y))))
 
-    P( A, B, C, D,  1,  5, 0xF61E2562 );
-    P( D, A, B, C,  6,  9, 0xC040B340 );
-    P( C, D, A, B, 11, 14, 0x265E5A51 );
-    P( B, C, D, A,  0, 20, 0xE9B6C7AA );
-    P( A, B, C, D,  5,  5, 0xD62F105D );
-    P( D, A, B, C, 10,  9, 0x02441453 );
-    P( C, D, A, B, 15, 14, 0xD8A1E681 );
-    P( B, C, D, A,  4, 20, 0xE7D3FBC8 );
-    P( A, B, C, D,  9,  5, 0x21E1CDE6 );
-    P( D, A, B, C, 14,  9, 0xC33707D6 );
-    P( C, D, A, B,  3, 14, 0xF4D50D87 );
-    P( B, C, D, A,  8, 20, 0x455A14ED );
-    P( A, B, C, D, 13,  5, 0xA9E3E905 );
-    P( D, A, B, C,  2,  9, 0xFCEFA3F8 );
-    P( C, D, A, B,  7, 14, 0x676F02D9 );
-    P( B, C, D, A, 12, 20, 0x8D2A4C8A );
+    P( local.A, local.B, local.C, local.D,  1,  5, 0xF61E2562 );
+    P( local.D, local.A, local.B, local.C,  6,  9, 0xC040B340 );
+    P( local.C, local.D, local.A, local.B, 11, 14, 0x265E5A51 );
+    P( local.B, local.C, local.D, local.A,  0, 20, 0xE9B6C7AA );
+    P( local.A, local.B, local.C, local.D,  5,  5, 0xD62F105D );
+    P( local.D, local.A, local.B, local.C, 10,  9, 0x02441453 );
+    P( local.C, local.D, local.A, local.B, 15, 14, 0xD8A1E681 );
+    P( local.B, local.C, local.D, local.A,  4, 20, 0xE7D3FBC8 );
+    P( local.A, local.B, local.C, local.D,  9,  5, 0x21E1CDE6 );
+    P( local.D, local.A, local.B, local.C, 14,  9, 0xC33707D6 );
+    P( local.C, local.D, local.A, local.B,  3, 14, 0xF4D50D87 );
+    P( local.B, local.C, local.D, local.A,  8, 20, 0x455A14ED );
+    P( local.A, local.B, local.C, local.D, 13,  5, 0xA9E3E905 );
+    P( local.D, local.A, local.B, local.C,  2,  9, 0xFCEFA3F8 );
+    P( local.C, local.D, local.A, local.B,  7, 14, 0x676F02D9 );
+    P( local.B, local.C, local.D, local.A, 12, 20, 0x8D2A4C8A );
 
 #undef F
 
 #define F(x,y,z) ((x) ^ (y) ^ (z))
 
-    P( A, B, C, D,  5,  4, 0xFFFA3942 );
-    P( D, A, B, C,  8, 11, 0x8771F681 );
-    P( C, D, A, B, 11, 16, 0x6D9D6122 );
-    P( B, C, D, A, 14, 23, 0xFDE5380C );
-    P( A, B, C, D,  1,  4, 0xA4BEEA44 );
-    P( D, A, B, C,  4, 11, 0x4BDECFA9 );
-    P( C, D, A, B,  7, 16, 0xF6BB4B60 );
-    P( B, C, D, A, 10, 23, 0xBEBFBC70 );
-    P( A, B, C, D, 13,  4, 0x289B7EC6 );
-    P( D, A, B, C,  0, 11, 0xEAA127FA );
-    P( C, D, A, B,  3, 16, 0xD4EF3085 );
-    P( B, C, D, A,  6, 23, 0x04881D05 );
-    P( A, B, C, D,  9,  4, 0xD9D4D039 );
-    P( D, A, B, C, 12, 11, 0xE6DB99E5 );
-    P( C, D, A, B, 15, 16, 0x1FA27CF8 );
-    P( B, C, D, A,  2, 23, 0xC4AC5665 );
+    P( local.A, local.B, local.C, local.D,  5,  4, 0xFFFA3942 );
+    P( local.D, local.A, local.B, local.C,  8, 11, 0x8771F681 );
+    P( local.C, local.D, local.A, local.B, 11, 16, 0x6D9D6122 );
+    P( local.B, local.C, local.D, local.A, 14, 23, 0xFDE5380C );
+    P( local.A, local.B, local.C, local.D,  1,  4, 0xA4BEEA44 );
+    P( local.D, local.A, local.B, local.C,  4, 11, 0x4BDECFA9 );
+    P( local.C, local.D, local.A, local.B,  7, 16, 0xF6BB4B60 );
+    P( local.B, local.C, local.D, local.A, 10, 23, 0xBEBFBC70 );
+    P( local.A, local.B, local.C, local.D, 13,  4, 0x289B7EC6 );
+    P( local.D, local.A, local.B, local.C,  0, 11, 0xEAA127FA );
+    P( local.C, local.D, local.A, local.B,  3, 16, 0xD4EF3085 );
+    P( local.B, local.C, local.D, local.A,  6, 23, 0x04881D05 );
+    P( local.A, local.B, local.C, local.D,  9,  4, 0xD9D4D039 );
+    P( local.D, local.A, local.B, local.C, 12, 11, 0xE6DB99E5 );
+    P( local.C, local.D, local.A, local.B, 15, 16, 0x1FA27CF8 );
+    P( local.B, local.C, local.D, local.A,  2, 23, 0xC4AC5665 );
 
 #undef F
 
 #define F(x,y,z) ((y) ^ ((x) | ~(z)))
 
-    P( A, B, C, D,  0,  6, 0xF4292244 );
-    P( D, A, B, C,  7, 10, 0x432AFF97 );
-    P( C, D, A, B, 14, 15, 0xAB9423A7 );
-    P( B, C, D, A,  5, 21, 0xFC93A039 );
-    P( A, B, C, D, 12,  6, 0x655B59C3 );
-    P( D, A, B, C,  3, 10, 0x8F0CCC92 );
-    P( C, D, A, B, 10, 15, 0xFFEFF47D );
-    P( B, C, D, A,  1, 21, 0x85845DD1 );
-    P( A, B, C, D,  8,  6, 0x6FA87E4F );
-    P( D, A, B, C, 15, 10, 0xFE2CE6E0 );
-    P( C, D, A, B,  6, 15, 0xA3014314 );
-    P( B, C, D, A, 13, 21, 0x4E0811A1 );
-    P( A, B, C, D,  4,  6, 0xF7537E82 );
-    P( D, A, B, C, 11, 10, 0xBD3AF235 );
-    P( C, D, A, B,  2, 15, 0x2AD7D2BB );
-    P( B, C, D, A,  9, 21, 0xEB86D391 );
+    P( local.A, local.B, local.C, local.D,  0,  6, 0xF4292244 );
+    P( local.D, local.A, local.B, local.C,  7, 10, 0x432AFF97 );
+    P( local.C, local.D, local.A, local.B, 14, 15, 0xAB9423A7 );
+    P( local.B, local.C, local.D, local.A,  5, 21, 0xFC93A039 );
+    P( local.A, local.B, local.C, local.D, 12,  6, 0x655B59C3 );
+    P( local.D, local.A, local.B, local.C,  3, 10, 0x8F0CCC92 );
+    P( local.C, local.D, local.A, local.B, 10, 15, 0xFFEFF47D );
+    P( local.B, local.C, local.D, local.A,  1, 21, 0x85845DD1 );
+    P( local.A, local.B, local.C, local.D,  8,  6, 0x6FA87E4F );
+    P( local.D, local.A, local.B, local.C, 15, 10, 0xFE2CE6E0 );
+    P( local.C, local.D, local.A, local.B,  6, 15, 0xA3014314 );
+    P( local.B, local.C, local.D, local.A, 13, 21, 0x4E0811A1 );
+    P( local.A, local.B, local.C, local.D,  4,  6, 0xF7537E82 );
+    P( local.D, local.A, local.B, local.C, 11, 10, 0xBD3AF235 );
+    P( local.C, local.D, local.A, local.B,  2, 15, 0x2AD7D2BB );
+    P( local.B, local.C, local.D, local.A,  9, 21, 0xEB86D391 );
 
 #undef F
 
-    ctx->state[0] += A;
-    ctx->state[1] += B;
-    ctx->state[2] += C;
-    ctx->state[3] += D;
+    ctx->state[0] += local.A;
+    ctx->state[1] += local.B;
+    ctx->state[2] += local.C;
+    ctx->state[3] += local.D;
+
+    /* Zeroise variables to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( &local, sizeof( local ) );
 
     return( 0 );
 }
diff --git a/thirdparty/mbedtls/library/pem.c b/thirdparty/mbedtls/library/pem.c
index a7a2f7f5cf..50e663ccdb 100644
--- a/thirdparty/mbedtls/library/pem.c
+++ b/thirdparty/mbedtls/library/pem.c
@@ -508,8 +508,12 @@ int mbedtls_pem_write_buffer( const char *header, const char *footer,
     *p++ = '\0';
     *olen = p - buf;
 
+     /* Clean any remaining data previously written to the buffer */
+    memset( buf + *olen, 0, buf_len - *olen );
+
     mbedtls_free( encode_buf );
     return( 0 );
 }
 #endif /* MBEDTLS_PEM_WRITE_C */
 #endif /* MBEDTLS_PEM_PARSE_C || MBEDTLS_PEM_WRITE_C */
+
diff --git a/thirdparty/mbedtls/library/pkcs5.c b/thirdparty/mbedtls/library/pkcs5.c
index 8a80aa5d05..c4447f1546 100644
--- a/thirdparty/mbedtls/library/pkcs5.c
+++ b/thirdparty/mbedtls/library/pkcs5.c
@@ -247,7 +247,7 @@ int mbedtls_pkcs5_pbkdf2_hmac( mbedtls_md_context_t *ctx, const unsigned char *p
                        unsigned int iteration_count,
                        uint32_t key_length, unsigned char *output )
 {
-    int ret, j;
+    int ret = 0, j;
     unsigned int i;
     unsigned char md1[MBEDTLS_MD_MAX_SIZE];
     unsigned char work[MBEDTLS_MD_MAX_SIZE];
@@ -269,16 +269,16 @@ int mbedtls_pkcs5_pbkdf2_hmac( mbedtls_md_context_t *ctx, const unsigned char *p
         // U1 ends up in work
         //
         if( ( ret = mbedtls_md_hmac_starts( ctx, password, plen ) ) != 0 )
-            return( ret );
+            goto cleanup;
 
         if( ( ret = mbedtls_md_hmac_update( ctx, salt, slen ) ) != 0 )
-            return( ret );
+            goto cleanup;
 
         if( ( ret = mbedtls_md_hmac_update( ctx, counter, 4 ) ) != 0 )
-            return( ret );
+            goto cleanup;
 
         if( ( ret = mbedtls_md_hmac_finish( ctx, work ) ) != 0 )
-            return( ret );
+            goto cleanup;
 
         memcpy( md1, work, md_size );
 
@@ -287,13 +287,13 @@ int mbedtls_pkcs5_pbkdf2_hmac( mbedtls_md_context_t *ctx, const unsigned char *p
             // U2 ends up in md1
             //
             if( ( ret = mbedtls_md_hmac_starts( ctx, password, plen ) ) != 0 )
-                return( ret );
+                goto cleanup;
 
             if( ( ret = mbedtls_md_hmac_update( ctx, md1, md_size ) ) != 0 )
-                return( ret );
+                goto cleanup;
 
             if( ( ret = mbedtls_md_hmac_finish( ctx, md1 ) ) != 0 )
-                return( ret );
+                goto cleanup;
 
             // U1 xor U2
             //
@@ -312,7 +312,12 @@ int mbedtls_pkcs5_pbkdf2_hmac( mbedtls_md_context_t *ctx, const unsigned char *p
                 break;
     }
 
-    return( 0 );
+cleanup:
+    /* Zeroise buffers to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( work, MBEDTLS_MD_MAX_SIZE );
+    mbedtls_platform_zeroize( md1, MBEDTLS_MD_MAX_SIZE );
+
+    return( ret );
 }
 
 #if defined(MBEDTLS_SELF_TEST)
diff --git a/thirdparty/mbedtls/library/pkparse.c b/thirdparty/mbedtls/library/pkparse.c
index 086807d836..e410f3aae1 100644
--- a/thirdparty/mbedtls/library/pkparse.c
+++ b/thirdparty/mbedtls/library/pkparse.c
@@ -692,7 +692,7 @@ int mbedtls_pk_parse_subpubkey( unsigned char **p, const unsigned char *end,
         ret = MBEDTLS_ERR_PK_UNKNOWN_PK_ALG;
 
     if( ret == 0 && *p != end )
-        ret = MBEDTLS_ERR_PK_INVALID_PUBKEY
+        ret = MBEDTLS_ERR_PK_INVALID_PUBKEY +
               MBEDTLS_ERR_ASN1_LENGTH_MISMATCH;
 
     if( ret != 0 )
diff --git a/thirdparty/mbedtls/library/platform_util.c b/thirdparty/mbedtls/library/platform_util.c
index 3ba2aead12..c8cd52d52a 100644
--- a/thirdparty/mbedtls/library/platform_util.c
+++ b/thirdparty/mbedtls/library/platform_util.c
@@ -115,7 +115,7 @@ void mbedtls_platform_zeroize( void *buf, size_t len )
 
 #if !( ( defined(_POSIX_VERSION) && _POSIX_VERSION >= 200809L ) ||     \
        ( defined(_POSIX_THREAD_SAFE_FUNCTIONS ) &&                     \
-         _POSIX_THREAD_SAFE_FUNCTIONS >= 20112L ) )
+         _POSIX_THREAD_SAFE_FUNCTIONS >= 200112L ) )
 /*
  * This is a convenience shorthand macro to avoid checking the long
  * preprocessor conditions above. Ideally, we could expose this macro in
@@ -129,7 +129,7 @@ void mbedtls_platform_zeroize( void *buf, size_t len )
 
 #endif /* !( ( defined(_POSIX_VERSION) && _POSIX_VERSION >= 200809L ) ||     \
              ( defined(_POSIX_THREAD_SAFE_FUNCTIONS ) &&                     \
-                _POSIX_THREAD_SAFE_FUNCTIONS >= 20112L ) ) */
+                _POSIX_THREAD_SAFE_FUNCTIONS >= 200112L ) ) */
 
 struct tm *mbedtls_platform_gmtime_r( const mbedtls_time_t *tt,
                                       struct tm *tm_buf )
diff --git a/thirdparty/mbedtls/library/ripemd160.c b/thirdparty/mbedtls/library/ripemd160.c
index 0b6efcb574..d6ee933b2e 100644
--- a/thirdparty/mbedtls/library/ripemd160.c
+++ b/thirdparty/mbedtls/library/ripemd160.c
@@ -147,30 +147,33 @@ void mbedtls_ripemd160_starts( mbedtls_ripemd160_context *ctx )
 int mbedtls_internal_ripemd160_process( mbedtls_ripemd160_context *ctx,
                                         const unsigned char data[64] )
 {
-    uint32_t A, B, C, D, E, Ap, Bp, Cp, Dp, Ep, X[16];
-
-    GET_UINT32_LE( X[ 0], data,  0 );
-    GET_UINT32_LE( X[ 1], data,  4 );
-    GET_UINT32_LE( X[ 2], data,  8 );
-    GET_UINT32_LE( X[ 3], data, 12 );
-    GET_UINT32_LE( X[ 4], data, 16 );
-    GET_UINT32_LE( X[ 5], data, 20 );
-    GET_UINT32_LE( X[ 6], data, 24 );
-    GET_UINT32_LE( X[ 7], data, 28 );
-    GET_UINT32_LE( X[ 8], data, 32 );
-    GET_UINT32_LE( X[ 9], data, 36 );
-    GET_UINT32_LE( X[10], data, 40 );
-    GET_UINT32_LE( X[11], data, 44 );
-    GET_UINT32_LE( X[12], data, 48 );
-    GET_UINT32_LE( X[13], data, 52 );
-    GET_UINT32_LE( X[14], data, 56 );
-    GET_UINT32_LE( X[15], data, 60 );
-
-    A = Ap = ctx->state[0];
-    B = Bp = ctx->state[1];
-    C = Cp = ctx->state[2];
-    D = Dp = ctx->state[3];
-    E = Ep = ctx->state[4];
+    struct
+    {
+        uint32_t A, B, C, D, E, Ap, Bp, Cp, Dp, Ep, X[16];
+    } local;
+
+    GET_UINT32_LE( local.X[ 0], data,  0 );
+    GET_UINT32_LE( local.X[ 1], data,  4 );
+    GET_UINT32_LE( local.X[ 2], data,  8 );
+    GET_UINT32_LE( local.X[ 3], data, 12 );
+    GET_UINT32_LE( local.X[ 4], data, 16 );
+    GET_UINT32_LE( local.X[ 5], data, 20 );
+    GET_UINT32_LE( local.X[ 6], data, 24 );
+    GET_UINT32_LE( local.X[ 7], data, 28 );
+    GET_UINT32_LE( local.X[ 8], data, 32 );
+    GET_UINT32_LE( local.X[ 9], data, 36 );
+    GET_UINT32_LE( local.X[10], data, 40 );
+    GET_UINT32_LE( local.X[11], data, 44 );
+    GET_UINT32_LE( local.X[12], data, 48 );
+    GET_UINT32_LE( local.X[13], data, 52 );
+    GET_UINT32_LE( local.X[14], data, 56 );
+    GET_UINT32_LE( local.X[15], data, 60 );
+
+    local.A = local.Ap = ctx->state[0];
+    local.B = local.Bp = ctx->state[1];
+    local.C = local.Cp = ctx->state[2];
+    local.D = local.Dp = ctx->state[3];
+    local.E = local.Ep = ctx->state[4];
 
 #define F1( x, y, z )   ( (x) ^ (y) ^ (z) )
 #define F2( x, y, z )   ( ( (x) & (y) ) | ( ~(x) & (z) ) )
@@ -180,12 +183,12 @@ int mbedtls_internal_ripemd160_process( mbedtls_ripemd160_context *ctx,
 
 #define S( x, n ) ( ( (x) << (n) ) | ( (x) >> (32 - (n)) ) )
 
-#define P( a, b, c, d, e, r, s, f, k )                \
-    do                                                \
-    {                                                 \
-        (a) += f( (b), (c), (d) ) + X[r] + (k);       \
-        (a) = S( (a), (s) ) + (e);                    \
-        (c) = S( (c), 10 );                           \
+#define P( a, b, c, d, e, r, s, f, k )                      \
+    do                                                      \
+    {                                                       \
+        (a) += f( (b), (c), (d) ) + local.X[r] + (k);       \
+        (a) = S( (a), (s) ) + (e);                          \
+        (c) = S( (c), 10 );                                 \
     } while( 0 )
 
 #define P2( a, b, c, d, e, r, s, rp, sp )                               \
@@ -200,22 +203,22 @@ int mbedtls_internal_ripemd160_process( mbedtls_ripemd160_context *ctx,
 #define K   0x00000000
 #define Fp  F5
 #define Kp  0x50A28BE6
-    P2( A, B, C, D, E,  0, 11,  5,  8 );
-    P2( E, A, B, C, D,  1, 14, 14,  9 );
-    P2( D, E, A, B, C,  2, 15,  7,  9 );
-    P2( C, D, E, A, B,  3, 12,  0, 11 );
-    P2( B, C, D, E, A,  4,  5,  9, 13 );
-    P2( A, B, C, D, E,  5,  8,  2, 15 );
-    P2( E, A, B, C, D,  6,  7, 11, 15 );
-    P2( D, E, A, B, C,  7,  9,  4,  5 );
-    P2( C, D, E, A, B,  8, 11, 13,  7 );
-    P2( B, C, D, E, A,  9, 13,  6,  7 );
-    P2( A, B, C, D, E, 10, 14, 15,  8 );
-    P2( E, A, B, C, D, 11, 15,  8, 11 );
-    P2( D, E, A, B, C, 12,  6,  1, 14 );
-    P2( C, D, E, A, B, 13,  7, 10, 14 );
-    P2( B, C, D, E, A, 14,  9,  3, 12 );
-    P2( A, B, C, D, E, 15,  8, 12,  6 );
+    P2( local.A, local.B, local.C, local.D, local.E,  0, 11,  5,  8 );
+    P2( local.E, local.A, local.B, local.C, local.D,  1, 14, 14,  9 );
+    P2( local.D, local.E, local.A, local.B, local.C,  2, 15,  7,  9 );
+    P2( local.C, local.D, local.E, local.A, local.B,  3, 12,  0, 11 );
+    P2( local.B, local.C, local.D, local.E, local.A,  4,  5,  9, 13 );
+    P2( local.A, local.B, local.C, local.D, local.E,  5,  8,  2, 15 );
+    P2( local.E, local.A, local.B, local.C, local.D,  6,  7, 11, 15 );
+    P2( local.D, local.E, local.A, local.B, local.C,  7,  9,  4,  5 );
+    P2( local.C, local.D, local.E, local.A, local.B,  8, 11, 13,  7 );
+    P2( local.B, local.C, local.D, local.E, local.A,  9, 13,  6,  7 );
+    P2( local.A, local.B, local.C, local.D, local.E, 10, 14, 15,  8 );
+    P2( local.E, local.A, local.B, local.C, local.D, 11, 15,  8, 11 );
+    P2( local.D, local.E, local.A, local.B, local.C, 12,  6,  1, 14 );
+    P2( local.C, local.D, local.E, local.A, local.B, 13,  7, 10, 14 );
+    P2( local.B, local.C, local.D, local.E, local.A, 14,  9,  3, 12 );
+    P2( local.A, local.B, local.C, local.D, local.E, 15,  8, 12,  6 );
 #undef F
 #undef K
 #undef Fp
@@ -225,22 +228,22 @@ int mbedtls_internal_ripemd160_process( mbedtls_ripemd160_context *ctx,
 #define K   0x5A827999
 #define Fp  F4
 #define Kp  0x5C4DD124
-    P2( E, A, B, C, D,  7,  7,  6,  9 );
-    P2( D, E, A, B, C,  4,  6, 11, 13 );
-    P2( C, D, E, A, B, 13,  8,  3, 15 );
-    P2( B, C, D, E, A,  1, 13,  7,  7 );
-    P2( A, B, C, D, E, 10, 11,  0, 12 );
-    P2( E, A, B, C, D,  6,  9, 13,  8 );
-    P2( D, E, A, B, C, 15,  7,  5,  9 );
-    P2( C, D, E, A, B,  3, 15, 10, 11 );
-    P2( B, C, D, E, A, 12,  7, 14,  7 );
-    P2( A, B, C, D, E,  0, 12, 15,  7 );
-    P2( E, A, B, C, D,  9, 15,  8, 12 );
-    P2( D, E, A, B, C,  5,  9, 12,  7 );
-    P2( C, D, E, A, B,  2, 11,  4,  6 );
-    P2( B, C, D, E, A, 14,  7,  9, 15 );
-    P2( A, B, C, D, E, 11, 13,  1, 13 );
-    P2( E, A, B, C, D,  8, 12,  2, 11 );
+    P2( local.E, local.A, local.B, local.C, local.D,  7,  7,  6,  9 );
+    P2( local.D, local.E, local.A, local.B, local.C,  4,  6, 11, 13 );
+    P2( local.C, local.D, local.E, local.A, local.B, 13,  8,  3, 15 );
+    P2( local.B, local.C, local.D, local.E, local.A,  1, 13,  7,  7 );
+    P2( local.A, local.B, local.C, local.D, local.E, 10, 11,  0, 12 );
+    P2( local.E, local.A, local.B, local.C, local.D,  6,  9, 13,  8 );
+    P2( local.D, local.E, local.A, local.B, local.C, 15,  7,  5,  9 );
+    P2( local.C, local.D, local.E, local.A, local.B,  3, 15, 10, 11 );
+    P2( local.B, local.C, local.D, local.E, local.A, 12,  7, 14,  7 );
+    P2( local.A, local.B, local.C, local.D, local.E,  0, 12, 15,  7 );
+    P2( local.E, local.A, local.B, local.C, local.D,  9, 15,  8, 12 );
+    P2( local.D, local.E, local.A, local.B, local.C,  5,  9, 12,  7 );
+    P2( local.C, local.D, local.E, local.A, local.B,  2, 11,  4,  6 );
+    P2( local.B, local.C, local.D, local.E, local.A, 14,  7,  9, 15 );
+    P2( local.A, local.B, local.C, local.D, local.E, 11, 13,  1, 13 );
+    P2( local.E, local.A, local.B, local.C, local.D,  8, 12,  2, 11 );
 #undef F
 #undef K
 #undef Fp
@@ -250,22 +253,22 @@ int mbedtls_internal_ripemd160_process( mbedtls_ripemd160_context *ctx,
 #define K   0x6ED9EBA1
 #define Fp  F3
 #define Kp  0x6D703EF3
-    P2( D, E, A, B, C,  3, 11, 15,  9 );
-    P2( C, D, E, A, B, 10, 13,  5,  7 );
-    P2( B, C, D, E, A, 14,  6,  1, 15 );
-    P2( A, B, C, D, E,  4,  7,  3, 11 );
-    P2( E, A, B, C, D,  9, 14,  7,  8 );
-    P2( D, E, A, B, C, 15,  9, 14,  6 );
-    P2( C, D, E, A, B,  8, 13,  6,  6 );
-    P2( B, C, D, E, A,  1, 15,  9, 14 );
-    P2( A, B, C, D, E,  2, 14, 11, 12 );
-    P2( E, A, B, C, D,  7,  8,  8, 13 );
-    P2( D, E, A, B, C,  0, 13, 12,  5 );
-    P2( C, D, E, A, B,  6,  6,  2, 14 );
-    P2( B, C, D, E, A, 13,  5, 10, 13 );
-    P2( A, B, C, D, E, 11, 12,  0, 13 );
-    P2( E, A, B, C, D,  5,  7,  4,  7 );
-    P2( D, E, A, B, C, 12,  5, 13,  5 );
+    P2( local.D, local.E, local.A, local.B, local.C,  3, 11, 15,  9 );
+    P2( local.C, local.D, local.E, local.A, local.B, 10, 13,  5,  7 );
+    P2( local.B, local.C, local.D, local.E, local.A, 14,  6,  1, 15 );
+    P2( local.A, local.B, local.C, local.D, local.E,  4,  7,  3, 11 );
+    P2( local.E, local.A, local.B, local.C, local.D,  9, 14,  7,  8 );
+    P2( local.D, local.E, local.A, local.B, local.C, 15,  9, 14,  6 );
+    P2( local.C, local.D, local.E, local.A, local.B,  8, 13,  6,  6 );
+    P2( local.B, local.C, local.D, local.E, local.A,  1, 15,  9, 14 );
+    P2( local.A, local.B, local.C, local.D, local.E,  2, 14, 11, 12 );
+    P2( local.E, local.A, local.B, local.C, local.D,  7,  8,  8, 13 );
+    P2( local.D, local.E, local.A, local.B, local.C,  0, 13, 12,  5 );
+    P2( local.C, local.D, local.E, local.A, local.B,  6,  6,  2, 14 );
+    P2( local.B, local.C, local.D, local.E, local.A, 13,  5, 10, 13 );
+    P2( local.A, local.B, local.C, local.D, local.E, 11, 12,  0, 13 );
+    P2( local.E, local.A, local.B, local.C, local.D,  5,  7,  4,  7 );
+    P2( local.D, local.E, local.A, local.B, local.C, 12,  5, 13,  5 );
 #undef F
 #undef K
 #undef Fp
@@ -275,22 +278,22 @@ int mbedtls_internal_ripemd160_process( mbedtls_ripemd160_context *ctx,
 #define K   0x8F1BBCDC
 #define Fp  F2
 #define Kp  0x7A6D76E9
-    P2( C, D, E, A, B,  1, 11,  8, 15 );
-    P2( B, C, D, E, A,  9, 12,  6,  5 );
-    P2( A, B, C, D, E, 11, 14,  4,  8 );
-    P2( E, A, B, C, D, 10, 15,  1, 11 );
-    P2( D, E, A, B, C,  0, 14,  3, 14 );
-    P2( C, D, E, A, B,  8, 15, 11, 14 );
-    P2( B, C, D, E, A, 12,  9, 15,  6 );
-    P2( A, B, C, D, E,  4,  8,  0, 14 );
-    P2( E, A, B, C, D, 13,  9,  5,  6 );
-    P2( D, E, A, B, C,  3, 14, 12,  9 );
-    P2( C, D, E, A, B,  7,  5,  2, 12 );
-    P2( B, C, D, E, A, 15,  6, 13,  9 );
-    P2( A, B, C, D, E, 14,  8,  9, 12 );
-    P2( E, A, B, C, D,  5,  6,  7,  5 );
-    P2( D, E, A, B, C,  6,  5, 10, 15 );
-    P2( C, D, E, A, B,  2, 12, 14,  8 );
+    P2( local.C, local.D, local.E, local.A, local.B,  1, 11,  8, 15 );
+    P2( local.B, local.C, local.D, local.E, local.A,  9, 12,  6,  5 );
+    P2( local.A, local.B, local.C, local.D, local.E, 11, 14,  4,  8 );
+    P2( local.E, local.A, local.B, local.C, local.D, 10, 15,  1, 11 );
+    P2( local.D, local.E, local.A, local.B, local.C,  0, 14,  3, 14 );
+    P2( local.C, local.D, local.E, local.A, local.B,  8, 15, 11, 14 );
+    P2( local.B, local.C, local.D, local.E, local.A, 12,  9, 15,  6 );
+    P2( local.A, local.B, local.C, local.D, local.E,  4,  8,  0, 14 );
+    P2( local.E, local.A, local.B, local.C, local.D, 13,  9,  5,  6 );
+    P2( local.D, local.E, local.A, local.B, local.C,  3, 14, 12,  9 );
+    P2( local.C, local.D, local.E, local.A, local.B,  7,  5,  2, 12 );
+    P2( local.B, local.C, local.D, local.E, local.A, 15,  6, 13,  9 );
+    P2( local.A, local.B, local.C, local.D, local.E, 14,  8,  9, 12 );
+    P2( local.E, local.A, local.B, local.C, local.D,  5,  6,  7,  5 );
+    P2( local.D, local.E, local.A, local.B, local.C,  6,  5, 10, 15 );
+    P2( local.C, local.D, local.E, local.A, local.B,  2, 12, 14,  8 );
 #undef F
 #undef K
 #undef Fp
@@ -300,33 +303,36 @@ int mbedtls_internal_ripemd160_process( mbedtls_ripemd160_context *ctx,
 #define K   0xA953FD4E
 #define Fp  F1
 #define Kp  0x00000000
-    P2( B, C, D, E, A,  4,  9, 12,  8 );
-    P2( A, B, C, D, E,  0, 15, 15,  5 );
-    P2( E, A, B, C, D,  5,  5, 10, 12 );
-    P2( D, E, A, B, C,  9, 11,  4,  9 );
-    P2( C, D, E, A, B,  7,  6,  1, 12 );
-    P2( B, C, D, E, A, 12,  8,  5,  5 );
-    P2( A, B, C, D, E,  2, 13,  8, 14 );
-    P2( E, A, B, C, D, 10, 12,  7,  6 );
-    P2( D, E, A, B, C, 14,  5,  6,  8 );
-    P2( C, D, E, A, B,  1, 12,  2, 13 );
-    P2( B, C, D, E, A,  3, 13, 13,  6 );
-    P2( A, B, C, D, E,  8, 14, 14,  5 );
-    P2( E, A, B, C, D, 11, 11,  0, 15 );
-    P2( D, E, A, B, C,  6,  8,  3, 13 );
-    P2( C, D, E, A, B, 15,  5,  9, 11 );
-    P2( B, C, D, E, A, 13,  6, 11, 11 );
+    P2( local.B, local.C, local.D, local.E, local.A,  4,  9, 12,  8 );
+    P2( local.A, local.B, local.C, local.D, local.E,  0, 15, 15,  5 );
+    P2( local.E, local.A, local.B, local.C, local.D,  5,  5, 10, 12 );
+    P2( local.D, local.E, local.A, local.B, local.C,  9, 11,  4,  9 );
+    P2( local.C, local.D, local.E, local.A, local.B,  7,  6,  1, 12 );
+    P2( local.B, local.C, local.D, local.E, local.A, 12,  8,  5,  5 );
+    P2( local.A, local.B, local.C, local.D, local.E,  2, 13,  8, 14 );
+    P2( local.E, local.A, local.B, local.C, local.D, 10, 12,  7,  6 );
+    P2( local.D, local.E, local.A, local.B, local.C, 14,  5,  6,  8 );
+    P2( local.C, local.D, local.E, local.A, local.B,  1, 12,  2, 13 );
+    P2( local.B, local.C, local.D, local.E, local.A,  3, 13, 13,  6 );
+    P2( local.A, local.B, local.C, local.D, local.E,  8, 14, 14,  5 );
+    P2( local.E, local.A, local.B, local.C, local.D, 11, 11,  0, 15 );
+    P2( local.D, local.E, local.A, local.B, local.C,  6,  8,  3, 13 );
+    P2( local.C, local.D, local.E, local.A, local.B, 15,  5,  9, 11 );
+    P2( local.B, local.C, local.D, local.E, local.A, 13,  6, 11, 11 );
 #undef F
 #undef K
 #undef Fp
 #undef Kp
 
-    C             = ctx->state[1] + C + Dp;
-    ctx->state[1] = ctx->state[2] + D + Ep;
-    ctx->state[2] = ctx->state[3] + E + Ap;
-    ctx->state[3] = ctx->state[4] + A + Bp;
-    ctx->state[4] = ctx->state[0] + B + Cp;
-    ctx->state[0] = C;
+    local.C       = ctx->state[1] + local.C + local.Dp;
+    ctx->state[1] = ctx->state[2] + local.D + local.Ep;
+    ctx->state[2] = ctx->state[3] + local.E + local.Ap;
+    ctx->state[3] = ctx->state[4] + local.A + local.Bp;
+    ctx->state[4] = ctx->state[0] + local.B + local.Cp;
+    ctx->state[0] = local.C;
+
+    /* Zeroise variables to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( &local, sizeof( local ) );
 
     return( 0 );
 }
diff --git a/thirdparty/mbedtls/library/rsa.c b/thirdparty/mbedtls/library/rsa.c
index 42becbf17b..a25c633bc6 100644
--- a/thirdparty/mbedtls/library/rsa.c
+++ b/thirdparty/mbedtls/library/rsa.c
@@ -841,15 +841,14 @@ static int rsa_prepare_blinding( mbedtls_rsa_context *ctx,
          * which one, we just loop and choose new values for both of them.
          * (Each iteration succeeds with overwhelming probability.) */
         ret = mbedtls_mpi_inv_mod( &ctx->Vi, &ctx->Vi, &ctx->N );
-        if( ret == MBEDTLS_ERR_MPI_NOT_ACCEPTABLE )
-            continue;
-        if( ret != 0 )
+        if( ret != 0 && ret != MBEDTLS_ERR_MPI_NOT_ACCEPTABLE )
             goto cleanup;
 
-        /* Finish the computation of Vf^-1 = R * (R Vf)^-1 */
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mpi( &ctx->Vi, &ctx->Vi, &R ) );
-        MBEDTLS_MPI_CHK( mbedtls_mpi_mod_mpi( &ctx->Vi, &ctx->Vi, &ctx->N ) );
-    } while( 0 );
+    } while( ret == MBEDTLS_ERR_MPI_NOT_ACCEPTABLE );
+
+    /* Finish the computation of Vf^-1 = R * (R Vf)^-1 */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mpi( &ctx->Vi, &ctx->Vi, &R ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mod_mpi( &ctx->Vi, &ctx->Vi, &ctx->N ) );
 
     /* Blinding value: Vi = Vf^(-e) mod N
      * (Vi already contains Vf^-1 at this point) */
diff --git a/thirdparty/mbedtls/library/sha1.c b/thirdparty/mbedtls/library/sha1.c
index 8682abd740..e99a5e8635 100644
--- a/thirdparty/mbedtls/library/sha1.c
+++ b/thirdparty/mbedtls/library/sha1.c
@@ -155,35 +155,40 @@ void mbedtls_sha1_starts( mbedtls_sha1_context *ctx )
 int mbedtls_internal_sha1_process( mbedtls_sha1_context *ctx,
                                    const unsigned char data[64] )
 {
-    uint32_t temp, W[16], A, B, C, D, E;
+    struct
+    {
+        uint32_t temp, W[16], A, B, C, D, E;
+    } local;
 
     SHA1_VALIDATE_RET( ctx != NULL );
     SHA1_VALIDATE_RET( (const unsigned char *)data != NULL );
 
-    GET_UINT32_BE( W[ 0], data,  0 );
-    GET_UINT32_BE( W[ 1], data,  4 );
-    GET_UINT32_BE( W[ 2], data,  8 );
-    GET_UINT32_BE( W[ 3], data, 12 );
-    GET_UINT32_BE( W[ 4], data, 16 );
-    GET_UINT32_BE( W[ 5], data, 20 );
-    GET_UINT32_BE( W[ 6], data, 24 );
-    GET_UINT32_BE( W[ 7], data, 28 );
-    GET_UINT32_BE( W[ 8], data, 32 );
-    GET_UINT32_BE( W[ 9], data, 36 );
-    GET_UINT32_BE( W[10], data, 40 );
-    GET_UINT32_BE( W[11], data, 44 );
-    GET_UINT32_BE( W[12], data, 48 );
-    GET_UINT32_BE( W[13], data, 52 );
-    GET_UINT32_BE( W[14], data, 56 );
-    GET_UINT32_BE( W[15], data, 60 );
+    GET_UINT32_BE( local.W[ 0], data,  0 );
+    GET_UINT32_BE( local.W[ 1], data,  4 );
+    GET_UINT32_BE( local.W[ 2], data,  8 );
+    GET_UINT32_BE( local.W[ 3], data, 12 );
+    GET_UINT32_BE( local.W[ 4], data, 16 );
+    GET_UINT32_BE( local.W[ 5], data, 20 );
+    GET_UINT32_BE( local.W[ 6], data, 24 );
+    GET_UINT32_BE( local.W[ 7], data, 28 );
+    GET_UINT32_BE( local.W[ 8], data, 32 );
+    GET_UINT32_BE( local.W[ 9], data, 36 );
+    GET_UINT32_BE( local.W[10], data, 40 );
+    GET_UINT32_BE( local.W[11], data, 44 );
+    GET_UINT32_BE( local.W[12], data, 48 );
+    GET_UINT32_BE( local.W[13], data, 52 );
+    GET_UINT32_BE( local.W[14], data, 56 );
+    GET_UINT32_BE( local.W[15], data, 60 );
 
 #define S(x,n) (((x) << (n)) | (((x) & 0xFFFFFFFF) >> (32 - (n))))
 
 #define R(t)                                                    \
     (                                                           \
-        temp = W[( (t) -  3 ) & 0x0F] ^ W[( (t) - 8 ) & 0x0F] ^ \
-               W[( (t) - 14 ) & 0x0F] ^ W[  (t)       & 0x0F],  \
-        ( W[(t) & 0x0F] = S(temp,1) )                           \
+        local.temp = local.W[( (t) -  3 ) & 0x0F] ^             \
+                     local.W[( (t) -  8 ) & 0x0F] ^             \
+                     local.W[( (t) - 14 ) & 0x0F] ^             \
+                     local.W[  (t)        & 0x0F],              \
+        ( local.W[(t) & 0x0F] = S(local.temp,1) )               \
     )
 
 #define P(a,b,c,d,e,x)                                          \
@@ -193,35 +198,35 @@ int mbedtls_internal_sha1_process( mbedtls_sha1_context *ctx,
         (b) = S((b),30);                                        \
     } while( 0 )
 
-    A = ctx->state[0];
-    B = ctx->state[1];
-    C = ctx->state[2];
-    D = ctx->state[3];
-    E = ctx->state[4];
+    local.A = ctx->state[0];
+    local.B = ctx->state[1];
+    local.C = ctx->state[2];
+    local.D = ctx->state[3];
+    local.E = ctx->state[4];
 
 #define F(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define K 0x5A827999
 
-    P( A, B, C, D, E, W[0]  );
-    P( E, A, B, C, D, W[1]  );
-    P( D, E, A, B, C, W[2]  );
-    P( C, D, E, A, B, W[3]  );
-    P( B, C, D, E, A, W[4]  );
-    P( A, B, C, D, E, W[5]  );
-    P( E, A, B, C, D, W[6]  );
-    P( D, E, A, B, C, W[7]  );
-    P( C, D, E, A, B, W[8]  );
-    P( B, C, D, E, A, W[9]  );
-    P( A, B, C, D, E, W[10] );
-    P( E, A, B, C, D, W[11] );
-    P( D, E, A, B, C, W[12] );
-    P( C, D, E, A, B, W[13] );
-    P( B, C, D, E, A, W[14] );
-    P( A, B, C, D, E, W[15] );
-    P( E, A, B, C, D, R(16) );
-    P( D, E, A, B, C, R(17) );
-    P( C, D, E, A, B, R(18) );
-    P( B, C, D, E, A, R(19) );
+    P( local.A, local.B, local.C, local.D, local.E, local.W[0]  );
+    P( local.E, local.A, local.B, local.C, local.D, local.W[1]  );
+    P( local.D, local.E, local.A, local.B, local.C, local.W[2]  );
+    P( local.C, local.D, local.E, local.A, local.B, local.W[3]  );
+    P( local.B, local.C, local.D, local.E, local.A, local.W[4]  );
+    P( local.A, local.B, local.C, local.D, local.E, local.W[5]  );
+    P( local.E, local.A, local.B, local.C, local.D, local.W[6]  );
+    P( local.D, local.E, local.A, local.B, local.C, local.W[7]  );
+    P( local.C, local.D, local.E, local.A, local.B, local.W[8]  );
+    P( local.B, local.C, local.D, local.E, local.A, local.W[9]  );
+    P( local.A, local.B, local.C, local.D, local.E, local.W[10] );
+    P( local.E, local.A, local.B, local.C, local.D, local.W[11] );
+    P( local.D, local.E, local.A, local.B, local.C, local.W[12] );
+    P( local.C, local.D, local.E, local.A, local.B, local.W[13] );
+    P( local.B, local.C, local.D, local.E, local.A, local.W[14] );
+    P( local.A, local.B, local.C, local.D, local.E, local.W[15] );
+    P( local.E, local.A, local.B, local.C, local.D, R(16) );
+    P( local.D, local.E, local.A, local.B, local.C, R(17) );
+    P( local.C, local.D, local.E, local.A, local.B, R(18) );
+    P( local.B, local.C, local.D, local.E, local.A, R(19) );
 
 #undef K
 #undef F
@@ -229,26 +234,26 @@ int mbedtls_internal_sha1_process( mbedtls_sha1_context *ctx,
 #define F(x,y,z) ((x) ^ (y) ^ (z))
 #define K 0x6ED9EBA1
 
-    P( A, B, C, D, E, R(20) );
-    P( E, A, B, C, D, R(21) );
-    P( D, E, A, B, C, R(22) );
-    P( C, D, E, A, B, R(23) );
-    P( B, C, D, E, A, R(24) );
-    P( A, B, C, D, E, R(25) );
-    P( E, A, B, C, D, R(26) );
-    P( D, E, A, B, C, R(27) );
-    P( C, D, E, A, B, R(28) );
-    P( B, C, D, E, A, R(29) );
-    P( A, B, C, D, E, R(30) );
-    P( E, A, B, C, D, R(31) );
-    P( D, E, A, B, C, R(32) );
-    P( C, D, E, A, B, R(33) );
-    P( B, C, D, E, A, R(34) );
-    P( A, B, C, D, E, R(35) );
-    P( E, A, B, C, D, R(36) );
-    P( D, E, A, B, C, R(37) );
-    P( C, D, E, A, B, R(38) );
-    P( B, C, D, E, A, R(39) );
+    P( local.A, local.B, local.C, local.D, local.E, R(20) );
+    P( local.E, local.A, local.B, local.C, local.D, R(21) );
+    P( local.D, local.E, local.A, local.B, local.C, R(22) );
+    P( local.C, local.D, local.E, local.A, local.B, R(23) );
+    P( local.B, local.C, local.D, local.E, local.A, R(24) );
+    P( local.A, local.B, local.C, local.D, local.E, R(25) );
+    P( local.E, local.A, local.B, local.C, local.D, R(26) );
+    P( local.D, local.E, local.A, local.B, local.C, R(27) );
+    P( local.C, local.D, local.E, local.A, local.B, R(28) );
+    P( local.B, local.C, local.D, local.E, local.A, R(29) );
+    P( local.A, local.B, local.C, local.D, local.E, R(30) );
+    P( local.E, local.A, local.B, local.C, local.D, R(31) );
+    P( local.D, local.E, local.A, local.B, local.C, R(32) );
+    P( local.C, local.D, local.E, local.A, local.B, R(33) );
+    P( local.B, local.C, local.D, local.E, local.A, R(34) );
+    P( local.A, local.B, local.C, local.D, local.E, R(35) );
+    P( local.E, local.A, local.B, local.C, local.D, R(36) );
+    P( local.D, local.E, local.A, local.B, local.C, R(37) );
+    P( local.C, local.D, local.E, local.A, local.B, R(38) );
+    P( local.B, local.C, local.D, local.E, local.A, R(39) );
 
 #undef K
 #undef F
@@ -256,26 +261,26 @@ int mbedtls_internal_sha1_process( mbedtls_sha1_context *ctx,
 #define F(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
 #define K 0x8F1BBCDC
 
-    P( A, B, C, D, E, R(40) );
-    P( E, A, B, C, D, R(41) );
-    P( D, E, A, B, C, R(42) );
-    P( C, D, E, A, B, R(43) );
-    P( B, C, D, E, A, R(44) );
-    P( A, B, C, D, E, R(45) );
-    P( E, A, B, C, D, R(46) );
-    P( D, E, A, B, C, R(47) );
-    P( C, D, E, A, B, R(48) );
-    P( B, C, D, E, A, R(49) );
-    P( A, B, C, D, E, R(50) );
-    P( E, A, B, C, D, R(51) );
-    P( D, E, A, B, C, R(52) );
-    P( C, D, E, A, B, R(53) );
-    P( B, C, D, E, A, R(54) );
-    P( A, B, C, D, E, R(55) );
-    P( E, A, B, C, D, R(56) );
-    P( D, E, A, B, C, R(57) );
-    P( C, D, E, A, B, R(58) );
-    P( B, C, D, E, A, R(59) );
+    P( local.A, local.B, local.C, local.D, local.E, R(40) );
+    P( local.E, local.A, local.B, local.C, local.D, R(41) );
+    P( local.D, local.E, local.A, local.B, local.C, R(42) );
+    P( local.C, local.D, local.E, local.A, local.B, R(43) );
+    P( local.B, local.C, local.D, local.E, local.A, R(44) );
+    P( local.A, local.B, local.C, local.D, local.E, R(45) );
+    P( local.E, local.A, local.B, local.C, local.D, R(46) );
+    P( local.D, local.E, local.A, local.B, local.C, R(47) );
+    P( local.C, local.D, local.E, local.A, local.B, R(48) );
+    P( local.B, local.C, local.D, local.E, local.A, R(49) );
+    P( local.A, local.B, local.C, local.D, local.E, R(50) );
+    P( local.E, local.A, local.B, local.C, local.D, R(51) );
+    P( local.D, local.E, local.A, local.B, local.C, R(52) );
+    P( local.C, local.D, local.E, local.A, local.B, R(53) );
+    P( local.B, local.C, local.D, local.E, local.A, R(54) );
+    P( local.A, local.B, local.C, local.D, local.E, R(55) );
+    P( local.E, local.A, local.B, local.C, local.D, R(56) );
+    P( local.D, local.E, local.A, local.B, local.C, R(57) );
+    P( local.C, local.D, local.E, local.A, local.B, R(58) );
+    P( local.B, local.C, local.D, local.E, local.A, R(59) );
 
 #undef K
 #undef F
@@ -283,35 +288,38 @@ int mbedtls_internal_sha1_process( mbedtls_sha1_context *ctx,
 #define F(x,y,z) ((x) ^ (y) ^ (z))
 #define K 0xCA62C1D6
 
-    P( A, B, C, D, E, R(60) );
-    P( E, A, B, C, D, R(61) );
-    P( D, E, A, B, C, R(62) );
-    P( C, D, E, A, B, R(63) );
-    P( B, C, D, E, A, R(64) );
-    P( A, B, C, D, E, R(65) );
-    P( E, A, B, C, D, R(66) );
-    P( D, E, A, B, C, R(67) );
-    P( C, D, E, A, B, R(68) );
-    P( B, C, D, E, A, R(69) );
-    P( A, B, C, D, E, R(70) );
-    P( E, A, B, C, D, R(71) );
-    P( D, E, A, B, C, R(72) );
-    P( C, D, E, A, B, R(73) );
-    P( B, C, D, E, A, R(74) );
-    P( A, B, C, D, E, R(75) );
-    P( E, A, B, C, D, R(76) );
-    P( D, E, A, B, C, R(77) );
-    P( C, D, E, A, B, R(78) );
-    P( B, C, D, E, A, R(79) );
+    P( local.A, local.B, local.C, local.D, local.E, R(60) );
+    P( local.E, local.A, local.B, local.C, local.D, R(61) );
+    P( local.D, local.E, local.A, local.B, local.C, R(62) );
+    P( local.C, local.D, local.E, local.A, local.B, R(63) );
+    P( local.B, local.C, local.D, local.E, local.A, R(64) );
+    P( local.A, local.B, local.C, local.D, local.E, R(65) );
+    P( local.E, local.A, local.B, local.C, local.D, R(66) );
+    P( local.D, local.E, local.A, local.B, local.C, R(67) );
+    P( local.C, local.D, local.E, local.A, local.B, R(68) );
+    P( local.B, local.C, local.D, local.E, local.A, R(69) );
+    P( local.A, local.B, local.C, local.D, local.E, R(70) );
+    P( local.E, local.A, local.B, local.C, local.D, R(71) );
+    P( local.D, local.E, local.A, local.B, local.C, R(72) );
+    P( local.C, local.D, local.E, local.A, local.B, R(73) );
+    P( local.B, local.C, local.D, local.E, local.A, R(74) );
+    P( local.A, local.B, local.C, local.D, local.E, R(75) );
+    P( local.E, local.A, local.B, local.C, local.D, R(76) );
+    P( local.D, local.E, local.A, local.B, local.C, R(77) );
+    P( local.C, local.D, local.E, local.A, local.B, R(78) );
+    P( local.B, local.C, local.D, local.E, local.A, R(79) );
 
 #undef K
 #undef F
 
-    ctx->state[0] += A;
-    ctx->state[1] += B;
-    ctx->state[2] += C;
-    ctx->state[3] += D;
-    ctx->state[4] += E;
+    ctx->state[0] += local.A;
+    ctx->state[1] += local.B;
+    ctx->state[2] += local.C;
+    ctx->state[3] += local.D;
+    ctx->state[4] += local.E;
+
+    /* Zeroise buffers and variables to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( &local, sizeof( local ) );
 
     return( 0 );
 }
diff --git a/thirdparty/mbedtls/library/sha256.c b/thirdparty/mbedtls/library/sha256.c
index 5169584b68..75a8f8a2b2 100644
--- a/thirdparty/mbedtls/library/sha256.c
+++ b/thirdparty/mbedtls/library/sha256.c
@@ -209,77 +209,104 @@ static const uint32_t K[] =
 #define F0(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
 #define F1(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 
-#define R(t)                                    \
-    (                                           \
-        W[t] = S1(W[(t) -  2]) + W[(t) -  7] +  \
-               S0(W[(t) - 15]) + W[(t) - 16]    \
+#define R(t)                                                        \
+    (                                                               \
+        local.W[t] = S1(local.W[(t) -  2]) + local.W[(t) -  7] +    \
+                     S0(local.W[(t) - 15]) + local.W[(t) - 16]      \
     )
 
-#define P(a,b,c,d,e,f,g,h,x,K)                          \
-    do                                                  \
-    {                                                   \
-        temp1 = (h) + S3(e) + F1((e),(f),(g)) + (K) + (x);      \
-        temp2 = S2(a) + F0((a),(b),(c));                        \
-        (d) += temp1; (h) = temp1 + temp2;              \
+#define P(a,b,c,d,e,f,g,h,x,K)                                      \
+    do                                                              \
+    {                                                               \
+        local.temp1 = (h) + S3(e) + F1((e),(f),(g)) + (K) + (x);    \
+        local.temp2 = S2(a) + F0((a),(b),(c));                      \
+        (d) += local.temp1; (h) = local.temp1 + local.temp2;        \
     } while( 0 )
 
 int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
                                 const unsigned char data[64] )
 {
-    uint32_t temp1, temp2, W[64];
-    uint32_t A[8];
+    struct /* working values for one 64-byte block, grouped so the single zeroize call at the end covers all of them */
+    {
+        uint32_t temp1, temp2, W[64]; /* temp1/temp2 are scratch written by P(); W[] is filled by GET_UINT32_BE and R() */
+        uint32_t A[8]; /* copy of ctx->state[0..7]; updated by the P() calls and added back into ctx->state at the end */
+    } local;
+
     unsigned int i;
 
     SHA256_VALIDATE_RET( ctx != NULL );
     SHA256_VALIDATE_RET( (const unsigned char *)data != NULL );
 
     for( i = 0; i < 8; i++ )
-        A[i] = ctx->state[i];
+        local.A[i] = ctx->state[i]; /* start from the state currently held in the context */
 
 #if defined(MBEDTLS_SHA256_SMALLER)
     for( i = 0; i < 64; i++ )
     {
         if( i < 16 )
-            GET_UINT32_BE( W[i], data, 4 * i );
+            GET_UINT32_BE( local.W[i], data, 4 * i ); /* W[0..15] are read from data at byte offsets 0, 4, ..., 60 */
         else
             R( i );
 
-        P( A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7], W[i], K[i] );
+        P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
+           local.A[5], local.A[6], local.A[7], local.W[i], K[i] ); /* always the same argument order here; the rotation below moves the values instead */
 
-        temp1 = A[7]; A[7] = A[6]; A[6] = A[5]; A[5] = A[4]; A[4] = A[3];
-        A[3] = A[2]; A[2] = A[1]; A[1] = A[0]; A[0] = temp1;
+        local.temp1 = local.A[7]; local.A[7] = local.A[6];
+        local.A[6] = local.A[5]; local.A[5] = local.A[4];
+        local.A[4] = local.A[3]; local.A[3] = local.A[2];
+        local.A[2] = local.A[1]; local.A[1] = local.A[0];
+        local.A[0] = local.temp1; /* completes the one-position rotation of A[]: old A[7] ends up in A[0] */
     }
 #else /* MBEDTLS_SHA256_SMALLER */
     for( i = 0; i < 16; i++ )
-        GET_UINT32_BE( W[i], data, 4 * i );
+        GET_UINT32_BE( local.W[i], data, 4 * i ); /* W[0..15] are read from data at byte offsets 0, 4, ..., 60 */
 
     for( i = 0; i < 16; i += 8 )
     {
-        P( A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7], W[i+0], K[i+0] );
-        P( A[7], A[0], A[1], A[2], A[3], A[4], A[5], A[6], W[i+1], K[i+1] );
-        P( A[6], A[7], A[0], A[1], A[2], A[3], A[4], A[5], W[i+2], K[i+2] );
-        P( A[5], A[6], A[7], A[0], A[1], A[2], A[3], A[4], W[i+3], K[i+3] );
-        P( A[4], A[5], A[6], A[7], A[0], A[1], A[2], A[3], W[i+4], K[i+4] );
-        P( A[3], A[4], A[5], A[6], A[7], A[0], A[1], A[2], W[i+5], K[i+5] );
-        P( A[2], A[3], A[4], A[5], A[6], A[7], A[0], A[1], W[i+6], K[i+6] );
-        P( A[1], A[2], A[3], A[4], A[5], A[6], A[7], A[0], W[i+7], K[i+7] );
+        P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
+           local.A[5], local.A[6], local.A[7], local.W[i+0], K[i+0] ); /* P() modifies its 4th and 8th arguments, here A[3] and A[7] */
+        P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3],
+           local.A[4], local.A[5], local.A[6], local.W[i+1], K[i+1] ); /* same step with every A[] index shifted down by one (mod 8), so no elements are moved */
+        P( local.A[6], local.A[7], local.A[0], local.A[1], local.A[2],
+           local.A[3], local.A[4], local.A[5], local.W[i+2], K[i+2] );
+        P( local.A[5], local.A[6], local.A[7], local.A[0], local.A[1],
+           local.A[2], local.A[3], local.A[4], local.W[i+3], K[i+3] );
+        P( local.A[4], local.A[5], local.A[6], local.A[7], local.A[0],
+           local.A[1], local.A[2], local.A[3], local.W[i+4], K[i+4] );
+        P( local.A[3], local.A[4], local.A[5], local.A[6], local.A[7],
+           local.A[0], local.A[1], local.A[2], local.W[i+5], K[i+5] );
+        P( local.A[2], local.A[3], local.A[4], local.A[5], local.A[6],
+           local.A[7], local.A[0], local.A[1], local.W[i+6], K[i+6] );
+        P( local.A[1], local.A[2], local.A[3], local.A[4], local.A[5],
+           local.A[6], local.A[7], local.A[0], local.W[i+7], K[i+7] ); /* after eight calls the argument order is back to A[0]..A[7] for the next iteration */
     }
 
     for( i = 16; i < 64; i += 8 )
     {
-        P( A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7], R(i+0), K[i+0] );
-        P( A[7], A[0], A[1], A[2], A[3], A[4], A[5], A[6], R(i+1), K[i+1] );
-        P( A[6], A[7], A[0], A[1], A[2], A[3], A[4], A[5], R(i+2), K[i+2] );
-        P( A[5], A[6], A[7], A[0], A[1], A[2], A[3], A[4], R(i+3), K[i+3] );
-        P( A[4], A[5], A[6], A[7], A[0], A[1], A[2], A[3], R(i+4), K[i+4] );
-        P( A[3], A[4], A[5], A[6], A[7], A[0], A[1], A[2], R(i+5), K[i+5] );
-        P( A[2], A[3], A[4], A[5], A[6], A[7], A[0], A[1], R(i+6), K[i+6] );
-        P( A[1], A[2], A[3], A[4], A[5], A[6], A[7], A[0], R(i+7), K[i+7] );
+        P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
+           local.A[5], local.A[6], local.A[7], R(i+0), K[i+0] ); /* R(i+0) stores local.W[i+0], computed from four earlier W entries, and its value is passed as x */
+        P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3],
+           local.A[4], local.A[5], local.A[6], R(i+1), K[i+1] );
+        P( local.A[6], local.A[7], local.A[0], local.A[1], local.A[2],
+           local.A[3], local.A[4], local.A[5], R(i+2), K[i+2] );
+        P( local.A[5], local.A[6], local.A[7], local.A[0], local.A[1],
+           local.A[2], local.A[3], local.A[4], R(i+3), K[i+3] );
+        P( local.A[4], local.A[5], local.A[6], local.A[7], local.A[0],
+           local.A[1], local.A[2], local.A[3], R(i+4), K[i+4] );
+        P( local.A[3], local.A[4], local.A[5], local.A[6], local.A[7],
+           local.A[0], local.A[1], local.A[2], R(i+5), K[i+5] );
+        P( local.A[2], local.A[3], local.A[4], local.A[5], local.A[6],
+           local.A[7], local.A[0], local.A[1], R(i+6), K[i+6] );
+        P( local.A[1], local.A[2], local.A[3], local.A[4], local.A[5],
+           local.A[6], local.A[7], local.A[0], R(i+7), K[i+7] );
     }
 #endif /* MBEDTLS_SHA256_SMALLER */
 
     for( i = 0; i < 8; i++ )
-        ctx->state[i] += A[i];
+        ctx->state[i] += local.A[i]; /* add the per-block result into the running state kept in the context */
+
+    /* Zeroise buffers and variables to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( &local, sizeof( local ) );
 
     return( 0 );
 }
diff --git a/thirdparty/mbedtls/library/sha512.c b/thirdparty/mbedtls/library/sha512.c
index 36d5d96146..986037ab7c 100644
--- a/thirdparty/mbedtls/library/sha512.c
+++ b/thirdparty/mbedtls/library/sha512.c
@@ -243,8 +243,11 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
                                      const unsigned char data[128] )
 {
     int i;
-    uint64_t temp1, temp2, W[80];
-    uint64_t A, B, C, D, E, F, G, H;
+    struct
+    {
+        uint64_t temp1, temp2, W[80];
+        uint64_t A, B, C, D, E, F, G, H;
+    } local;
 
     SHA512_VALIDATE_RET( ctx != NULL );
     SHA512_VALIDATE_RET( (const unsigned char *)data != NULL );
@@ -261,56 +264,67 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
 #define F0(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
 #define F1(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 
-#define P(a,b,c,d,e,f,g,h,x,K)                                  \
-    do                                                          \
-    {                                                           \
-        temp1 = (h) + S3(e) + F1((e),(f),(g)) + (K) + (x);      \
-        temp2 = S2(a) + F0((a),(b),(c));                        \
-        (d) += temp1; (h) = temp1 + temp2;                      \
+#define P(a,b,c,d,e,f,g,h,x,K)                                      \
+    do                                                              \
+    {                                                               \
+        local.temp1 = (h) + S3(e) + F1((e),(f),(g)) + (K) + (x);    \
+        local.temp2 = S2(a) + F0((a),(b),(c));                      \
+        (d) += local.temp1; (h) = local.temp1 + local.temp2;        \
     } while( 0 )
 
     for( i = 0; i < 16; i++ )
     {
-        GET_UINT64_BE( W[i], data, i << 3 );
+        GET_UINT64_BE( local.W[i], data, i << 3 );
     }
 
     for( ; i < 80; i++ )
     {
-        W[i] = S1(W[i -  2]) + W[i -  7] +
-               S0(W[i - 15]) + W[i - 16];
+        local.W[i] = S1(local.W[i -  2]) + local.W[i -  7] +
+                     S0(local.W[i - 15]) + local.W[i - 16];
     }
 
-    A = ctx->state[0];
-    B = ctx->state[1];
-    C = ctx->state[2];
-    D = ctx->state[3];
-    E = ctx->state[4];
-    F = ctx->state[5];
-    G = ctx->state[6];
-    H = ctx->state[7];
+    local.A = ctx->state[0];
+    local.B = ctx->state[1];
+    local.C = ctx->state[2];
+    local.D = ctx->state[3];
+    local.E = ctx->state[4];
+    local.F = ctx->state[5];
+    local.G = ctx->state[6];
+    local.H = ctx->state[7];
     i = 0;
 
     do
     {
-        P( A, B, C, D, E, F, G, H, W[i], K[i] ); i++;
-        P( H, A, B, C, D, E, F, G, W[i], K[i] ); i++;
-        P( G, H, A, B, C, D, E, F, W[i], K[i] ); i++;
-        P( F, G, H, A, B, C, D, E, W[i], K[i] ); i++;
-        P( E, F, G, H, A, B, C, D, W[i], K[i] ); i++;
-        P( D, E, F, G, H, A, B, C, W[i], K[i] ); i++;
-        P( C, D, E, F, G, H, A, B, W[i], K[i] ); i++;
-        P( B, C, D, E, F, G, H, A, W[i], K[i] ); i++;
+        P( local.A, local.B, local.C, local.D, local.E,
+           local.F, local.G, local.H, local.W[i], K[i] ); i++;
+        P( local.H, local.A, local.B, local.C, local.D,
+           local.E, local.F, local.G, local.W[i], K[i] ); i++;
+        P( local.G, local.H, local.A, local.B, local.C,
+           local.D, local.E, local.F, local.W[i], K[i] ); i++;
+        P( local.F, local.G, local.H, local.A, local.B,
+           local.C, local.D, local.E, local.W[i], K[i] ); i++;
+        P( local.E, local.F, local.G, local.H, local.A,
+           local.B, local.C, local.D, local.W[i], K[i] ); i++;
+        P( local.D, local.E, local.F, local.G, local.H,
+           local.A, local.B, local.C, local.W[i], K[i] ); i++;
+        P( local.C, local.D, local.E, local.F, local.G,
+           local.H, local.A, local.B, local.W[i], K[i] ); i++;
+        P( local.B, local.C, local.D, local.E, local.F,
+           local.G, local.H, local.A, local.W[i], K[i] ); i++;
     }
     while( i < 80 );
 
-    ctx->state[0] += A;
-    ctx->state[1] += B;
-    ctx->state[2] += C;
-    ctx->state[3] += D;
-    ctx->state[4] += E;
-    ctx->state[5] += F;
-    ctx->state[6] += G;
-    ctx->state[7] += H;
+    ctx->state[0] += local.A;
+    ctx->state[1] += local.B;
+    ctx->state[2] += local.C;
+    ctx->state[3] += local.D;
+    ctx->state[4] += local.E;
+    ctx->state[5] += local.F;
+    ctx->state[6] += local.G;
+    ctx->state[7] += local.H;
+
+    /* Zeroise buffers and variables to clear sensitive data from memory. */
+    mbedtls_platform_zeroize( &local, sizeof( local ) );
 
     return( 0 );
 }
diff --git a/thirdparty/mbedtls/library/ssl_srv.c b/thirdparty/mbedtls/library/ssl_srv.c
index 97b778452c..cbf6142ac2 100644
--- a/thirdparty/mbedtls/library/ssl_srv.c
+++ b/thirdparty/mbedtls/library/ssl_srv.c
@@ -3587,11 +3587,12 @@ static int ssl_parse_encrypted_pms( mbedtls_ssl_context *ssl,
     /* In case of a failure in decryption, the decryption may write less than
      * 2 bytes of output, but we always read the first two bytes. It doesn't
      * matter in the end because diff will be nonzero in that case due to
-     * peer_pmslen being less than 48, and we only care whether diff is 0.
-     * But do initialize peer_pms for robustness anyway. This also makes
-     * memory analyzers happy (don't access uninitialized memory, even
-     * if it's an unsigned char). */
+     * ret being nonzero, and we only care whether diff is 0.
+     * But do initialize peer_pms and peer_pmslen for robustness anyway. This
+     * also makes memory analyzers happy (don't access uninitialized memory,
+     * even if it's an unsigned char). */
     peer_pms[0] = peer_pms[1] = ~0;
+    peer_pmslen = 0;
 
     ret = ssl_decrypt_encrypted_pms( ssl, p, end,
                                      peer_pms,
diff --git a/thirdparty/mbedtls/library/ssl_tls.c b/thirdparty/mbedtls/library/ssl_tls.c
index 2471600c9a..c749a8611c 100644
--- a/thirdparty/mbedtls/library/ssl_tls.c
+++ b/thirdparty/mbedtls/library/ssl_tls.c
@@ -621,7 +621,7 @@ static void ssl_calc_finished_tls( mbedtls_ssl_context *, unsigned char *, int )
 #if defined(MBEDTLS_SSL_PROTO_TLS1_2)
 #if defined(MBEDTLS_SHA256_C)
 static void ssl_update_checksum_sha256( mbedtls_ssl_context *, const unsigned char *, size_t );
-static void ssl_calc_verify_tls_sha256( mbedtls_ssl_context *,unsigned char * );
+static void ssl_calc_verify_tls_sha256( mbedtls_ssl_context *, unsigned char * );
 static void ssl_calc_finished_tls_sha256( mbedtls_ssl_context *,unsigned char *, int );
 #endif
 
@@ -1142,7 +1142,7 @@ int mbedtls_ssl_derive_keys( mbedtls_ssl_context *ssl )
 }
 
 #if defined(MBEDTLS_SSL_PROTO_SSL3)
-void ssl_calc_verify_ssl( mbedtls_ssl_context *ssl, unsigned char hash[36] )
+void ssl_calc_verify_ssl( mbedtls_ssl_context *ssl, unsigned char *hash )
 {
     mbedtls_md5_context md5;
     mbedtls_sha1_context sha1;
@@ -1191,7 +1191,7 @@ void ssl_calc_verify_ssl( mbedtls_ssl_context *ssl, unsigned char hash[36] )
 #endif /* MBEDTLS_SSL_PROTO_SSL3 */
 
 #if defined(MBEDTLS_SSL_PROTO_TLS1) || defined(MBEDTLS_SSL_PROTO_TLS1_1)
-void ssl_calc_verify_tls( mbedtls_ssl_context *ssl, unsigned char hash[36] )
+void ssl_calc_verify_tls( mbedtls_ssl_context *ssl, unsigned char *hash )
 {
     mbedtls_md5_context md5;
     mbedtls_sha1_context sha1;
@@ -1219,7 +1219,7 @@ void ssl_calc_verify_tls( mbedtls_ssl_context *ssl, unsigned char hash[36] )
 
 #if defined(MBEDTLS_SSL_PROTO_TLS1_2)
 #if defined(MBEDTLS_SHA256_C)
-void ssl_calc_verify_tls_sha256( mbedtls_ssl_context *ssl, unsigned char hash[32] )
+void ssl_calc_verify_tls_sha256( mbedtls_ssl_context *ssl, unsigned char *hash )
 {
     mbedtls_sha256_context sha256;
 
@@ -1240,7 +1240,7 @@ void ssl_calc_verify_tls_sha256( mbedtls_ssl_context *ssl, unsigned char hash[32
 #endif /* MBEDTLS_SHA256_C */
 
 #if defined(MBEDTLS_SHA512_C)
-void ssl_calc_verify_tls_sha384( mbedtls_ssl_context *ssl, unsigned char hash[48] )
+void ssl_calc_verify_tls_sha384( mbedtls_ssl_context *ssl, unsigned char *hash )
 {
     mbedtls_sha512_context sha512;
 
@@ -6363,6 +6363,9 @@ static void ssl_calc_finished_tls_sha256(
 #endif /* MBEDTLS_SHA256_C */
 
 #if defined(MBEDTLS_SHA512_C)
+
+typedef int (*finish_sha384_t)(mbedtls_sha512_context*, unsigned char*); /* function-pointer type whose output parameter is a plain unsigned char pointer with no array size attached */
+
 static void ssl_calc_finished_tls_sha384(
                 mbedtls_ssl_context *ssl, unsigned char *buf, int from )
 {
@@ -6370,6 +6373,12 @@ static void ssl_calc_finished_tls_sha384(
     const char *sender;
     mbedtls_sha512_context sha512;
     unsigned char padbuf[48];
+    /*
+     * For SHA-384, we can save 16 bytes by keeping padbuf 48 bytes long.
+     * However, to avoid stringop-overflow warning in gcc, we have to cast
+     * mbedtls_sha512_finish_ret().
+     */
+    finish_sha384_t finish_sha384 = (finish_sha384_t)mbedtls_sha512_finish_ret;
 
     mbedtls_ssl_session *session = ssl->session_negotiate;
     if( !session )
@@ -6396,7 +6405,7 @@ static void ssl_calc_finished_tls_sha384(
              ? "client finished"
              : "server finished";
 
-    mbedtls_sha512_finish_ret( &sha512, padbuf );
+    finish_sha384( &sha512, padbuf );
 
     ssl->handshake->tls_prf( session->master, 48, sender,
                              padbuf, 48, buf, len );
diff --git a/thirdparty/mbedtls/library/threading.c b/thirdparty/mbedtls/library/threading.c
index 61c4b94041..f4f29cff5e 100644
--- a/thirdparty/mbedtls/library/threading.c
+++ b/thirdparty/mbedtls/library/threading.c
@@ -73,7 +73,7 @@
 
 #if !( ( defined(_POSIX_VERSION) && _POSIX_VERSION >= 200809L ) ||     \
        ( defined(_POSIX_THREAD_SAFE_FUNCTIONS ) &&                     \
-         _POSIX_THREAD_SAFE_FUNCTIONS >= 20112L ) )
+         _POSIX_THREAD_SAFE_FUNCTIONS >= 200112L ) )
 /*
  * This is a convenience shorthand macro to avoid checking the long
  * preprocessor conditions above. Ideally, we could expose this macro in
@@ -88,7 +88,7 @@
 
 #endif /* !( ( defined(_POSIX_VERSION) && _POSIX_VERSION >= 200809L ) ||     \
              ( defined(_POSIX_THREAD_SAFE_FUNCTIONS ) &&                     \
-                _POSIX_THREAD_SAFE_FUNCTIONS >= 20112L ) ) */
+                _POSIX_THREAD_SAFE_FUNCTIONS >= 200112L ) ) */
 
 #endif /* MBEDTLS_HAVE_TIME_DATE && !MBEDTLS_PLATFORM_GMTIME_R_ALT */
 
diff --git a/thirdparty/mbedtls/library/x509_crt.c b/thirdparty/mbedtls/library/x509_crt.c
index de40eaaf58..c458c25ff4 100644
--- a/thirdparty/mbedtls/library/x509_crt.c
+++ b/thirdparty/mbedtls/library/x509_crt.c
@@ -1101,6 +1101,7 @@ static int x509_crt_parse_der_core( mbedtls_x509_crt *crt, const unsigned char *
 
     if( crt->sig_oid.len != sig_oid2.len ||
         memcmp( crt->sig_oid.p, sig_oid2.p, crt->sig_oid.len ) != 0 ||
+        sig_params1.tag != sig_params2.tag ||
         sig_params1.len != sig_params2.len ||
         ( sig_params1.len != 0 &&
           memcmp( sig_params1.p, sig_params2.p, sig_params1.len ) != 0 ) )
diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
new file mode 100644
index 0000000000..4fcd766d22
--- /dev/null
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016-2020 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/thirdparty/meshoptimizer/allocator.cpp b/thirdparty/meshoptimizer/allocator.cpp
new file mode 100644
index 0000000000..da7cc540b2
--- /dev/null
+++ b/thirdparty/meshoptimizer/allocator.cpp
@@ -0,0 +1,8 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+// Stores the given allocate/deallocate callbacks in meshopt_Allocator::Storage (presumably read by meshopt_Allocator when it needs memory - confirm in meshoptimizer.h).
+void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*))
+{
+	meshopt_Allocator::Storage::allocate = allocate;
+	meshopt_Allocator::Storage::deallocate = deallocate;
+}
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
new file mode 100644
index 0000000000..f7d88c5136
--- /dev/null
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -0,0 +1,351 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
+// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
+// Jack Ritter. An Efficient Bounding Sphere. 1990
+namespace meshopt
+{
+// Computes a sphere enclosing `count` 3D points (count must be > 0) and writes it to result as {center x, center y, center z, radius}.
+static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+{
+	assert(count > 0);
+
+	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
+	size_t pmin[3] = {0, 0, 0};
+	size_t pmax[3] = {0, 0, 0};
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
+			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+		}
+	}
+
+	// find the pair of points with largest distance
+	float paxisd2 = 0; // squared distance of the best pair found so far
+	int paxis = 0; // axis whose min/max pair is the best so far
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		const float* p1 = points[pmin[axis]];
+		const float* p2 = points[pmax[axis]];
+
+		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+
+		if (d2 > paxisd2)
+		{
+			paxisd2 = d2;
+			paxis = axis;
+		}
+	}
+
+	// use the longest segment as the initial sphere diameter
+	const float* p1 = points[pmin[paxis]];
+	const float* p2 = points[pmax[paxis]];
+
+	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
+	float radius = sqrtf(paxisd2) / 2;
+
+	// iteratively adjust the sphere up until all points fit
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+		if (d2 > radius * radius)
+		{
+			float d = sqrtf(d2);
+			assert(d > 0);
+
+			float k = 0.5f + (radius / d) / 2; // k = (d + radius) / (2 * d), so the center moves towards p by (d - radius) / 2
+
+			center[0] = center[0] * k + p[0] * (1 - k);
+			center[1] = center[1] * k + p[1] * (1 - k);
+			center[2] = center[2] * k + p[2] * (1 - k);
+			radius = (radius + d) / 2; // new sphere reaches p while still touching the old sphere's far side
+		}
+	}
+
+	result[0] = center[0];
+	result[1] = center[1];
+	result[2] = center[2];
+	result[3] = radius;
+}
+
+} // namespace meshopt
+// Returns an upper bound on the number of meshlets meshopt_buildMeshlets writes for index_count indices with the given per-meshlet limits.
+size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
+{
+	assert(index_count % 3 == 0);
+	assert(max_vertices >= 3);
+	assert(max_triangles >= 1);
+
+	// meshlet construction is limited by max vertices and max triangles per meshlet
+	// the worst case is that the input is an unindexed stream since this equally stresses both limits
+	// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
+	size_t max_vertices_conservative = max_vertices - 2;
+	size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative; // ceil(index_count / (max_vertices - 2))
+	size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles; // ceil(triangle count / max_triangles)
+
+	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
+}
+// Packs triangles in input order into meshlets written to destination, starting a new meshlet when a limit would be exceeded; returns the meshlet count.
+size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	assert(index_count % 3 == 0);
+	assert(max_vertices >= 3);
+	assert(max_triangles >= 1);
+
+	meshopt_Allocator allocator;
+
+	meshopt_Meshlet meshlet;
+	memset(&meshlet, 0, sizeof(meshlet));
+
+	assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0]));
+	assert(max_triangles <= sizeof(meshlet.indices) / 3);
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count); // -1 fills every byte with 0xff
+
+	size_t offset = 0; // number of meshlets written to destination so far
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		unsigned char& av = used[a];
+		unsigned char& bv = used[b];
+		unsigned char& cv = used[c];
+
+		unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); // vertices this triangle would add to the current meshlet
+
+		if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+		{
+			destination[offset++] = meshlet;
+			// unmark the flushed meshlet's vertices; av/bv/cv alias these slots, so they read 0xff again below
+			for (size_t j = 0; j < meshlet.vertex_count; ++j)
+				used[meshlet.vertices[j]] = 0xff;
+
+			memset(&meshlet, 0, sizeof(meshlet));
+		}
+
+		if (av == 0xff)
+		{
+			av = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = a;
+		}
+
+		if (bv == 0xff)
+		{
+			bv = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = b;
+		}
+
+		if (cv == 0xff)
+		{
+			cv = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = c;
+		}
+
+		meshlet.indices[meshlet.triangle_count][0] = av;
+		meshlet.indices[meshlet.triangle_count][1] = bv;
+		meshlet.indices[meshlet.triangle_count][2] = cv;
+		meshlet.triangle_count++;
+	}
+
+	if (meshlet.triangle_count) // flush the last, partially filled meshlet
+		destination[offset++] = meshlet;
+
+	assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+
+	return offset;
+}
+// Computes a bounding sphere and a normal cone (float and 8-bit quantized) for a cluster of at most 256 triangles; degenerate triangles are ignored.
+meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(index_count / 3 <= 256);
+
+	(void)vertex_count; // only read by the assert in the loop below
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	// compute triangle normals and gather triangle corners
+	float normals[256][3]; // capacity matches the index_count / 3 <= 256 assert above
+	float corners[256][3][3]; // [triangle][corner][xyz]
+	size_t triangles = 0; // number of non-degenerate triangles recorded
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); // length of the cross product p10 x p20
+
+		// no need to include degenerate triangles - they will be invisible anyway
+		if (area == 0.f)
+			continue;
+
+		// record triangle normals & corners for future use; normal and corner 0 define a plane equation
+		normals[triangles][0] = normalx / area;
+		normals[triangles][1] = normaly / area;
+		normals[triangles][2] = normalz / area;
+		memcpy(corners[triangles][0], p0, 3 * sizeof(float));
+		memcpy(corners[triangles][1], p1, 3 * sizeof(float));
+		memcpy(corners[triangles][2], p2, 3 * sizeof(float));
+		triangles++;
+	}
+
+	meshopt_Bounds bounds = {};
+
+	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
+	if (triangles == 0)
+		return bounds;
+
+	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
+	float psphere[4] = {};
+	computeBoundingSphere(psphere, corners[0], triangles * 3); // corners[0] exposes the corner array as a flat list of triangles * 3 points
+
+	float center[3] = {psphere[0], psphere[1], psphere[2]};
+
+	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
+	float nsphere[4] = {};
+	computeBoundingSphere(nsphere, normals, triangles);
+
+	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
+	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
+	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength; // a zero-length axis stays zero instead of dividing by zero
+
+	axis[0] *= invaxislength;
+	axis[1] *= invaxislength;
+	axis[2] *= invaxislength;
+
+	// compute a tight cone around all normals, mindp = cos(angle/2)
+	float mindp = 1.f;
+
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];
+
+		mindp = (dp < mindp) ? dp : mindp;
+	}
+
+	// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
+	bounds.center[0] = center[0];
+	bounds.center[1] = center[1];
+	bounds.center[2] = center[2];
+	bounds.radius = psphere[3];
+
+	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
+	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
+	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
+	if (mindp <= 0.1f)
+	{
+		bounds.cone_cutoff = 1;
+		bounds.cone_cutoff_s8 = 127;
+		return bounds;
+	}
+
+	float maxt = 0;
+
+	// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		// dot(center-t*axis-corner, trinormal) = 0
+		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
+		float cx = center[0] - corners[i][0][0];
+		float cy = center[1] - corners[i][0][1];
+		float cz = center[2] - corners[i][0][2];
+
+		float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
+		float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];
+
+		// dn should be larger than mindp cutoff above
+		assert(dn > 0.f);
+		float t = dc / dn;
+
+		maxt = (t > maxt) ? t : maxt;
+	}
+
+	// cone apex should be in the negative half-space of all cluster triangles by construction
+	bounds.cone_apex[0] = center[0] - axis[0] * maxt;
+	bounds.cone_apex[1] = center[1] - axis[1] * maxt;
+	bounds.cone_apex[2] = center[2] - axis[2] * maxt;
+
+	// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
+	bounds.cone_axis[0] = axis[0];
+	bounds.cone_axis[1] = axis[1];
+	bounds.cone_axis[2] = axis[2];
+
+	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
+	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
+	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);
+
+	// quantize axis & cutoff to 8-bit SNORM format
+	bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
+	bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
+	bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));
+
+	// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
+	float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
+	float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
+	float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);
+
+	// note that we need to round this up instead of rounding to nearest, hence +1
+	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);
+
+	bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8); // clamp to the largest signed 8-bit value
+
+	return bounds;
+}
+// Expands the meshlet's local triangle indices into global vertex indices and returns meshopt_computeClusterBounds for that index list.
+meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])]; // one slot per index the meshlet can hold
+
+	for (size_t i = 0; i < meshlet->triangle_count; ++i)
+	{
+		unsigned int a = meshlet->vertices[meshlet->indices[i][0]];
+		unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
+		unsigned int c = meshlet->vertices[meshlet->indices[i][2]];
+
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		indices[i * 3 + 0] = a;
+		indices[i * 3 + 1] = b;
+		indices[i * 3 + 2] = c;
+	}
+
+	return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+}
diff --git a/thirdparty/meshoptimizer/indexcodec.cpp b/thirdparty/meshoptimizer/indexcodec.cpp
new file mode 100644
index 0000000000..eeb541e5be
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexcodec.cpp
@@ -0,0 +1,752 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
+// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
+namespace meshopt
+{
+
+const unsigned char kIndexHeader = 0xe0; // high nibble of the first byte of an index buffer stream; the low nibble carries the version
+const unsigned char kSequenceHeader = 0xd0; // same role for the first byte of an index sequence stream
+
+static int gEncodeIndexVersion = 0; // version written by the encoders; set through meshopt_encodeIndexVersion
+
+typedef unsigned int VertexFifo[16]; // 16-slot ring buffer of vertex indices
+typedef unsigned int EdgeFifo[16][2]; // 16-slot ring buffer of edges, each stored as a pair of vertex indices
+
+static const unsigned int kTriangleIndexOrder[3][3] = { // row r: corner order of a triangle rotated so that corner r comes first
+    {0, 1, 2},
+    {1, 2, 0},
+    {2, 0, 1},
+};
+
+static const unsigned char kCodeAuxEncodingTable[16] = { // each entry packs (feb << 4) | fec
+    0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
+    0, 0, // last two entries aren't used for encoding
+};
+// Returns the corner (1 for b, 2 for c, otherwise 0) to rotate to the front so that a vertex equal to `next` comes first.
+static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
+{
+	(void)a; // a needs no test: rotation 0 is the result whenever neither b nor c matches
+
+	return (b == next) ? 1 : (c == next) ? 2 : 0;
+}
+// Looks for edge ab, bc or ca among the 16 fifo slots, newest first; returns (slot age << 2) | edge number (0, 1, 2), or -1 if none matches.
+static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
+{
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15; // i = 0 is the most recently pushed slot; & 15 wraps around the ring
+
+		unsigned int e0 = fifo[index][0];
+		unsigned int e1 = fifo[index][1];
+
+		if (e0 == a && e1 == b)
+			return (i << 2) | 0;
+		if (e0 == b && e1 == c)
+			return (i << 2) | 1;
+		if (e0 == c && e1 == a)
+			return (i << 2) | 2;
+	}
+
+	return -1;
+}
+// Stores edge (a, b) in the slot at offset and advances offset by one, wrapping after 16 slots.
+static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
+{
+	fifo[offset][0] = a;
+	fifo[offset][1] = b;
+	offset = (offset + 1) & 15;
+}
+// Returns the age (0 = slot just before offset, up to 15) of the newest fifo slot holding v, or -1 if no slot holds it.
+static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
+{
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15; // walk backwards from the most recent slot, wrapping around the ring
+
+		if (fifo[index] == v)
+			return i;
+	}
+
+	return -1;
+}
+// Writes v into the slot at offset and advances offset by cond (wrapping after 16 slots); with cond == 0 the slot is written but offset stays put.
+static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
+{
+	fifo[offset] = v;
+	offset = (offset + cond) & 15;
+}
+// Appends v at data as a variable-length integer (7 bits per byte, low bits first, bit 7 set on every byte but the last) and advances data.
+static void encodeVByte(unsigned char*& data, unsigned int v)
+{
+	// encode 32-bit value in up to 5 7-bit groups
+	do
+	{
+		*data++ = (v & 127) | (v > 127 ? 128 : 0); // bit 7 marks that more groups follow
+		v >>= 7;
+	} while (v);
+}
+// Reads a variable-length integer in the format written by encodeVByte, advances data past it (at most 5 bytes) and returns the value.
+static unsigned int decodeVByte(const unsigned char*& data)
+{
+	unsigned char lead = *data++;
+
+	// fast path: single byte
+	if (lead < 128)
+		return lead;
+
+	// slow path: up to 4 extra bytes
+	// note that this loop always terminates, which is important for malformed data
+	unsigned int result = lead & 127;
+	unsigned int shift = 7;
+
+	for (int i = 0; i < 4; ++i)
+	{
+		unsigned char group = *data++;
+		result |= (group & 127) << shift;
+		shift += 7;
+
+		if (group < 128) // continuation bit clear: this was the last group
+			break;
+	}
+
+	return result;
+}
+// Writes index as a delta from last: the signed difference is folded into an unsigned value (sign in bit 0) and emitted with encodeVByte.
+static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
+{
+	unsigned int d = index - last;
+	unsigned int v = (d << 1) ^ (int(d) >> 31); // small positive and negative deltas both map to small values
+
+	encodeVByte(data, v);
+}
+// Inverse of encodeIndex: reads one variable-length value, unfolds it back into a signed delta and returns last + delta.
+static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
+{
+	unsigned int v = decodeVByte(data);
+	unsigned int d = (v >> 1) ^ -int(v & 1); // bit 0 is the sign; -int(1) flips all bits of the magnitude
+
+	return last + d;
+}
+// Returns the index of the first of the 16 table entries equal to v, or -1 if v is not in the table.
+static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
+{
+	for (int i = 0; i < 16; ++i)
+		if (table[i] == v)
+			return i;
+
+	return -1;
+}
+// Stores a, b, c at elements offset..offset+2 of destination: as unsigned short (truncating) when index_size == 2, as unsigned int otherwise.
+static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
+{
+	if (index_size == 2)
+	{
+		static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
+		static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
+		static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
+	}
+	else
+	{
+		static_cast<unsigned int*>(destination)[offset + 0] = a;
+		static_cast<unsigned int*>(destination)[offset + 1] = b;
+		static_cast<unsigned int*>(destination)[offset + 2] = c;
+	}
+}
+// TRACE-only helper: fills dest with up to 16 byte values ordered by non-increasing stats[] count and returns how many entries were written.
+#if TRACE
+static size_t sortTop16(unsigned char dest[16], size_t stats[256])
+{
+	size_t destsize = 0;
+
+	for (size_t i = 0; i < 256; ++i)
+	{
+		size_t j = 0;
+		for (; j < destsize; ++j)
+		{
+			if (stats[i] >= stats[dest[j]])
+			{
+				if (destsize < 16)
+					destsize++;
+
+				memmove(&dest[j + 1], &dest[j], destsize - 1 - j); // shift lower-ranked entries down; when full, the last one drops out
+				dest[j] = (unsigned char)i;
+				break;
+			}
+		}
+
+		if (j == destsize && destsize < 16) // ranked below every current entry: append if there is room
+		{
+			dest[destsize] = (unsigned char)i;
+			destsize++;
+		}
+	}
+
+	return destsize;
+}
+#endif
+
+} // namespace meshopt
+// Encodes a triangle list as: header byte, one code byte per triangle, variable-length data, 16-byte codeaux table; returns bytes written, or 0 if buffer is too small.
+size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+#if TRACE
+	size_t codestats[256] = {};
+	size_t codeauxstats[256] = {};
+#endif
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kIndexHeader | version);
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo)); // all bits set, i.e. every slot starts as 0xffffffff
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0; // index expected for the next not-yet-seen vertex
+	unsigned int last = 0; // baseline for delta-encoded free indices
+
+	unsigned char* code = buffer + 1;
+	unsigned char* data = code + index_count / 3; // variable-length data starts right after the one-byte-per-triangle code section
+	unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	int fecmax = version >= 1 ? 13 : 15; // version 1 reserves fec 13 and 14 for the last-1 / last+1 codes below
+
+	// use static encoding table; it's possible to pack the result and then build an optimal table and repack
+	// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
+	const unsigned char* codeaux_table = kCodeAuxEncodingTable;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough space to write a triangle
+		// each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can write without extra bounds checks
+		if (data > data_safe_end)
+			return 0;
+
+		int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);
+
+		if (fer >= 0 && (fer >> 2) < 15) // edge age 15 is excluded because a code high nibble of 15 marks the no-shared-edge path
+		{
+			const unsigned int* order = kTriangleIndexOrder[fer & 3];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// encode edge index and vertex fifo index, next or free index
+			int fe = fer >> 2;
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next) ? (next++, 0) : 15;
+
+			if (fec == 15 && version >= 1)
+			{
+				// encode last-1 and last+1 to optimize strip-like sequences
+				if (c + 1 == last)
+					fec = 13, last = c;
+				if (c == last + 1)
+					fec = 14, last = c;
+			}
+
+			*code++ = (unsigned char)((fe << 4) | fec);
+
+#if TRACE
+			codestats[code[-1]]++;
+#endif
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// we only need to push third vertex since first two are likely already in the vertex fifo
+			if (fec == 0 || fec >= fecmax)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// we only need to push two new edges to edge fifo since the third one is already there
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+		else
+		{
+			int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
+			const unsigned int* order = kTriangleIndexOrder[rotation];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// if a/b/c are 0/1/2, we emit a reset code
+			bool reset = false;
+
+			if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
+			{
+				reset = true;
+				next = 0;
+
+				// reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
+				// this makes sure next continues to get incremented instead of being stuck
+				memset(vertexfifo, -1, sizeof(vertexfifo));
+			}
+
+			int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+			int fea = (a == next) ? (next++, 0) : 15;
+			int feb = (fb >= 0 && fb < 14) ? (fb + 1) : (b == next) ? (next++, 0) : 15;
+			int fec = (fc >= 0 && fc < 14) ? (fc + 1) : (c == next) ? (next++, 0) : 15;
+
+			// we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
+			unsigned char codeaux = (unsigned char)((feb << 4) | fec);
+			int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);
+
+			// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
+			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
+			{
+				*code++ = (unsigned char)((15 << 4) | codeauxindex);
+			}
+			else
+			{
+				*code++ = (unsigned char)((15 << 4) | 14 | fea); // fea is 0 or 15, so the low nibble becomes 14 or 15
+				*data++ = codeaux;
+			}
+
+#if TRACE
+			codestats[code[-1]]++;
+			codeauxstats[codeaux]++;
+#endif
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fea == 15)
+				encodeIndex(data, a, last), last = a;
+
+			if (feb == 15)
+				encodeIndex(data, b, last), last = b;
+
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// only push vertices that weren't already in fifo
+			if (fea == 0 || fea == 15)
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+
+			if (feb == 0 || feb == 15)
+				pushVertexFifo(vertexfifo, b, vertexfifooffset);
+
+			if (fec == 0 || fec == 15)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles
+			pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+	}
+
+	// make sure we have enough space to write codeaux table
+	if (data > data_safe_end)
+		return 0;
+
+	// add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
+	// we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
+	// this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
+	for (size_t i = 0; i < 16; ++i)
+	{
+		// decoder assumes that table entries never refer to separately encoded indices
+		assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);
+
+		*data++ = codeaux_table[i];
+	}
+
+	// since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
+	assert(codeaux_table[0] == 0);
+
+	assert(data >= buffer + index_count / 3 + 16);
+	assert(data <= buffer + buffer_size);
+
+#if TRACE
+	unsigned char codetop[16], codeauxtop[16];
+	size_t codetopsize = sortTop16(codetop, codestats);
+	size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats);
+
+	size_t sumcode = 0, sumcodeaux = 0;
+	for (size_t i = 0; i < 256; ++i)
+		sumcode += codestats[i], sumcodeaux += codeauxstats[i];
+
+	size_t acccode = 0, acccodeaux = 0;
+
+	printf("code\t\t\t\t\tcodeaux\n");
+
+	for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i)
+	{
+		acccode += codestats[codetop[i]];
+		acccodeaux += codeauxstats[codeauxtop[i]];
+
+		printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n",
+		       int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100,
+		       int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100);
+	}
+#endif
+
+	return data - buffer;
+}
+// Returns a worst-case byte size for encoding index_count indices that reference vertex_count vertices with meshopt_encodeIndexBuffer.
+size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
+{
+	assert(index_count % 3 == 0);
+
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+		vertex_bits++;
+
+	// worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas
+	unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7; // +1 for the sign bit of the delta, +6 to round up to whole 7-bit groups
+
+	return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16; // header byte + per-triangle worst case + 16-byte codeaux table
+}
+// Sets the format version (0 or 1) that the encoders in this file read from gEncodeIndexVersion on later calls.
+void meshopt_encodeIndexVersion(int version)
+{
+	assert(unsigned(version) <= 1); // the unsigned cast also rejects negative values
+
+	meshopt::gEncodeIndexVersion = version;
+}
+// Decodes a meshopt_encodeIndexBuffer stream into destination; returns 0 on success, -1 for a bad header/version, -2 for truncated input, -3 if the data size does not match.
+int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(index_size == 2 || index_size == 4);
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kIndexHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1;
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo));
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0;
+	unsigned int last = 0;
+
+	int fecmax = version >= 1 ? 13 : 15; // in version 1, fec 13 and 14 mean last-1 / last+1 rather than fifo references
+
+	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
+	const unsigned char* code = buffer + 1;
+	const unsigned char* data = code + index_count / 3;
+	const unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	const unsigned char* codeaux_table = data_safe_end; // the last 16 bytes of the stream are the codeaux table
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough data to read for a triangle
+		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can read without extra bounds checks
+		if (data > data_safe_end)
+			return -2;
+
+		unsigned char codetri = *code++;
+
+		if (codetri < 0xf0) // high nibble < 15: the triangle reuses an edge from the edge fifo
+		{
+			int fe = codetri >> 4;
+
+			// fifo reads are wrapped around 16 entry buffer
+			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
+			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+
+			int fec = codetri & 15;
+
+			// note: this is the most common path in the entire decoder
+			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
+			if (fec < fecmax)
+			{
+				// fifo reads are wrapped around 16 entry buffer
+				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				unsigned int c = 0;
+
+				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
+				// note that we need to update the last index since free indices are delta-encoded
+				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+		else
+		{
+			// fast path: read codeaux from the table
+			if (codetri < 0xfe)
+			{
+				unsigned char codeaux = codeaux_table[codetri & 15];
+
+				// note: table can't contain feb/fec=15
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = next++;
+
+				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15]; // the encoder stored fifo age + 1 in feb, hence no extra -1
+				unsigned int b = (feb == 0) ? next : bf;
+
+				int feb0 = feb == 0;
+				next += feb0;
+
+				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				// slow path: read a full byte for codeaux instead of using a table lookup
+				unsigned char codeaux = *data++;
+
+				int fea = codetri == 0xfe ? 0 : 15;
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// reset: codeaux is 0 but encoded as not-a-table
+				if (codeaux == 0)
+					next = 0;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = (fea == 0) ? next++ : 0; // 0 is a placeholder replaced by the fea == 15 branch below
+				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
+				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
+
+				// note that we need to update the last index since free indices are delta-encoded
+				if (fea == 15)
+					last = a = decodeIndex(data, last);
+
+				if (feb == 15)
+					last = b = decodeIndex(data, last);
+
+				if (fec == 15)
+					last = c = decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and codeaux table
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
+
+size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+	using namespace meshopt;
+	// Writes a header byte, one encoded delta per index and a 4-byte zero tail into buffer; returns the bytes written, or 0 if buffer is too small.
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kSequenceHeader | version);
+
+	unsigned int last[2] = {}; // two delta baselines, both starting at 0
+	unsigned int current = 0; // which of the two baselines is currently in use
+
+	unsigned char* data = buffer + 1; // write cursor, just past the header byte
+	unsigned char* data_safe_end = buffer + buffer_size - 4; // start of the 4 bytes reserved at the end of buffer
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to write
+		// each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can write without extra bounds checks
+		if (data >= data_safe_end)
+			return 0;
+
+		unsigned int index = indices[i];
+
+		// this is a heuristic that switches between baselines when the delta grows too large
+		// we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
+		// for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
+		int cd = int(index - last[current]);
+		current ^= ((cd < 0 ? -cd : cd) >= 30); // flip to the other baseline when |cd| >= 30
+
+		// encode delta from the last index
+		unsigned int d = index - last[current];
+		unsigned int v = (d << 1) ^ (int(d) >> 31); // fold the sign of d into the low bit of v
+
+		// note: low bit encodes the index of the last baseline which will be used for reconstruction
+		encodeVByte(data, (v << 1) | current);
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+	}
+
+	// make sure we have enough space to write tail
+	if (data > data_safe_end)
+		return 0;
+
+	for (int k = 0; k < 4; ++k)
+		*data++ = 0;
+
+	return data - buffer; // total bytes written, header and tail included
+}
+
+size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
+{
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits) // stops at the first bit count with vertex_count <= 2^bits, capped at 32
+		vertex_bits++;
+
+	// worst-case encoding is 1 varint-7 encoded index delta for a K bit value and an extra bit
+	unsigned int vertex_groups = (vertex_bits + 1 + 1 + 6) / 7; // (vertex_bits + 2) / 7 rounded up: bytes per index
+
+	return 1 + index_count * vertex_groups + 4; // 1 header byte + per-index worst case + 4 tail bytes
+}
+
+int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+	// Decodes index_count indices from buffer into destination (unsigned short when index_size == 2, unsigned int otherwise); returns 0 on success, a negative code on failure.
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return -2; // buffer too short to hold index_count indices
+
+	if ((buffer[0] & 0xf0) != kSequenceHeader)
+		return -1; // high nibble of the first byte is not the sequence header
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1; // format version not handled by this decoder
+
+	const unsigned char* data = buffer + 1; // read cursor, just past the header byte
+	const unsigned char* data_safe_end = buffer + buffer_size - 4; // start of the 4-byte tail
+
+	unsigned int last[2] = {}; // two delta baselines, both starting at 0
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to read
+		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can read without extra bounds checks
+		if (data >= data_safe_end)
+			return -2;
+
+		unsigned int v = decodeVByte(data);
+
+		// decode the index of the last baseline
+		unsigned int current = v & 1;
+		v >>= 1;
+
+		// reconstruct index as a delta
+		unsigned int d = (v >> 1) ^ -int(v & 1); // low bit of v set means the delta is negative
+		unsigned int index = last[current] + d;
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+
+		if (index_size == 2)
+		{
+			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index); // narrowing cast to unsigned short
+		}
+		else
+		{
+			static_cast<unsigned int*>(destination)[i] = index;
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and tail
+	if (data != data_safe_end)
+		return -3; // bytes consumed do not line up with the start of the tail
+
+	return 0;
+}
diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp
new file mode 100644
index 0000000000..aa4a30efa4
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexgenerator.cpp
@@ -0,0 +1,347 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+namespace meshopt
+{
+
+static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
+{
+	// MurmurHash2
+	const unsigned int m = 0x5bd1e995;
+	const int r = 24;
+	// h is the running hash: each full 4-byte word of key is mixed into it in turn
+	while (len >= 4) // a trailing remainder of fewer than 4 bytes is not hashed
+	{
+		unsigned int k = *reinterpret_cast<const unsigned int*>(key); // NOTE(review): word load through a cast pointer; presumably key must be suitably aligned for unsigned int - confirm
+
+		k *= m;
+		k ^= k >> r;
+		k *= m;
+
+		h *= m;
+		h ^= k;
+
+		key += 4;
+		len -= 4;
+	}
+
+	return h; // no finalization step, so the result can be passed back in as h to continue hashing
+}
+
+struct VertexHasher
+{
+	const unsigned char* vertices; // base address of the vertex data
+	size_t vertex_size; // bytes hashed and compared per vertex
+	size_t vertex_stride; // bytes between the starts of consecutive vertices
+
+	size_t hash(unsigned int index) const
+	{
+		return hashUpdate4(0, vertices + index * vertex_stride, vertex_size); // hash of vertex `index`, seeded with 0
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0; // true when the two vertices are bytewise identical over vertex_size bytes
+	}
+};
+
+struct VertexStreamHasher
+{
+	const meshopt_Stream* streams; // array of stream descriptors (data, size, stride)
+	size_t stream_count; // number of entries in streams
+
+	size_t hash(unsigned int index) const
+	{
+		unsigned int h = 0;
+
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			h = hashUpdate4(h, data + index * s.stride, s.size); // chain: the previous stream's hash seeds the next one
+		}
+
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0)
+				return false; // elements differ in this stream
+		}
+
+		return true; // elements match bytewise in every stream
+	}
+};
+
+static size_t hashBuckets(size_t count)
+{
+	size_t buckets = 1;
+	while (buckets < count)
+		buckets *= 2;
+
+	return buckets; // smallest power of two that is >= count (1 when count is 0 or 1)
+}
+
+template <typename T, typename Hash>
+static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0); // power of two, so `& hashmod` below can stand in for modulo
+	// returns the slot holding an item equal to key, or the first empty slot found along key's probe sequence
+	size_t hashmod = buckets - 1;
+	size_t bucket = hash.hash(key) & hashmod;
+
+	for (size_t probe = 0; probe <= hashmod; ++probe) // at most `buckets` probes
+	{
+		T& item = table[bucket];
+
+		if (item == empty)
+			return &item; // key is not in the table; the caller may store it in this slot
+
+		if (hash.equal(item, key))
+			return &item; // an equal item is already stored here
+
+		// hash collision, quadratic probing
+		bucket = (bucket + probe + 1) & hashmod; // the step grows by one on every probe
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return 0;
+}
+
+} // namespace meshopt
+
+size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	using namespace meshopt;
+	// Fills destination[v] with a new index for every referenced vertex v; bytewise-identical vertices share one new index. Returns the number of new indices handed out.
+	assert(indices || index_count == vertex_count);
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int)); // ~0u marks "no new index assigned yet"
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size}; // stride equals size: vertices are treated as tightly packed
+
+	size_t table_size = hashBuckets(vertex_count); // power-of-two slot count, at least vertex_count
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty slot
+
+	unsigned int next_vertex = 0; // next new index to hand out
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i); // without an index buffer, vertices are visited in order
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u) // first time this vertex is referenced
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index; // first vertex with this content: record it in the table
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry]; // duplicate content: reuse the earlier vertex's new index
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+	using namespace meshopt;
+	// Fills destination[v] with a new index for every referenced vertex v; vertices that match bytewise in every stream share one new index. Returns the number of new indices handed out.
+	assert(indices || index_count == vertex_count);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int)); // ~0u marks "no new index assigned yet"
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count); // power-of-two slot count, at least vertex_count
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty slot
+
+	unsigned int next_vertex = 0; // next new index to hand out
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i); // without an index buffer, vertices are visited in order
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u) // first time this vertex is referenced
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index; // first vertex with this content: record it in the table
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry]; // duplicate content: reuse the earlier vertex's new index
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+	assert(vertex_size > 0 && vertex_size <= 256);
+	// Copies each source vertex i into slot remap[i] of destination; entries equal to ~0u are skipped.
+	meshopt_Allocator allocator;
+
+	// support in-place remap
+	if (destination == vertices)
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size); // snapshot the source so writes cannot clobber vertices not yet read
+		vertices = vertices_copy; // read from the snapshot from here on
+	}
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] != ~0u)
+		{
+			assert(remap[i] < vertex_count);
+
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
+		}
+	}
+}
+
+void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{
+	assert(index_count % 3 == 0);
+	// Writes remap[index] for every input index; when indices is null the input is taken to be 0, 1, 2, ...
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i);
+		assert(remap[index] != ~0u); // every referenced vertex must have a remap entry
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+	using namespace meshopt;
+	// Rewrites each index to the first vertex encountered (in index order) whose leading vertex_size bytes match; only destination is written.
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size <= vertex_stride);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u marks "representative not resolved yet"
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
+
+	size_t table_size = hashBuckets(vertex_count); // power-of-two slot count, at least vertex_count
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty slot
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index; // first occurrence of this content becomes the representative
+
+			remap[index] = *entry; // cache the representative for later references to this vertex
+		}
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+	using namespace meshopt;
+	// Rewrites each index to the first vertex encountered (in index order) that matches bytewise in every stream; only destination is written.
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u marks "representative not resolved yet"
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count); // power-of-two slot count, at least vertex_count
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty slot
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index; // first occurrence of this content becomes the representative
+
+			remap[index] = *entry; // cache the representative for later references to this vertex
+		}
+
+		destination[i] = remap[index];
+	}
+}
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
new file mode 100644
index 0000000000..fde00f9c82
--- /dev/null
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -0,0 +1,951 @@
+/**
+ * meshoptimizer - version 0.15
+ *
+ * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
+ *
+ * This library is distributed under the MIT License. See notice at the end of this file.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stddef.h>
+
+/* Version macro; major * 1000 + minor * 10 + patch */
+#define MESHOPTIMIZER_VERSION 150 /* 0.15 */
+
+/* If no API is defined, assume default */
+#ifndef MESHOPTIMIZER_API
+#define MESHOPTIMIZER_API
+#endif
+
+/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+
+/* C interface */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Vertex attribute stream, similar to glVertexPointer
+ * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ */
+struct meshopt_Stream
+{
+	const void* data; /* address of the first element */
+	size_t size; /* bytes taken by each element */
+	size_t stride; /* bytes between the starts of successive elements */
+};
+
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Generates a vertex remap table from multiple vertex streams and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting vertex buffer (unique_vertex_count elements, returned by meshopt_generateVertexRemap)
+ * vertex_count should be the initial vertex count and not the value returned by meshopt_generateVertexRemap
+ */
+MESHOPTIMIZER_API void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap);
+
+/**
+ * Generate index buffer from the source index buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Vertex transform cache optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for strip-like caches
+ * Produces inferior results to meshopt_optimizeVertexCache from the GPU vertex cache perspective
+ * However, the resulting index order is more optimal if the goal is to reduce the triangle strip length or improve compression efficiency
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for FIFO caches
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * cache_size should be less than the actual GPU cache size to avoid cache thrashing
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+
+/**
+ * Overdraw optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
+ */
+MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+
+/**
+ * Vertex fetch cache optimizer
+ * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
+ * indices is used both as an input and as an output index buffer
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Vertex fetch cache optimizer
+ * Generates vertex remap to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * The resulting remap table should be used to reorder vertex/index buffers using meshopt_remapVertexBuffer/meshopt_remapIndexBuffer
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Index buffer encoder
+ * Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original.
+ * Input index buffer must represent a triangle list.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first.
+ *
+ * buffer must contain enough space for the encoded index buffer (use meshopt_encodeIndexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Experimental: Set index encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeIndexVersion(int version);
+
+/**
+ * Index buffer decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Experimental: Index sequence encoder
+ * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
+ * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ *
+ * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Index sequence decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexSequence
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index sequence (index_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer encoder
+ * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
+ * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Set vertex encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeVertexVersion(int version);
+
+/**
+ * Vertex buffer decoder
+ * Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer filters
+ * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
+ * count must be aligned by 4 and stride is fixed for each function to facilitate SIMD implementation.
+ *
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ *
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * Each component is stored as a 16-bit integer; stride must be equal to 8.
+ *
+ * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
+ * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Mesh simplifier
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+// -- GODOT start --
+//MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *r_resulting_error);
+// -- GODOT end --
+
+/**
+ * Experimental: Mesh simplifier (sloppy)
+ * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
+ * The algorithm doesn't preserve mesh topology but is always able to reach target triangle count.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+
+/**
+ * Experimental: Point cloud simplifier
+ * Reduces the number of points in the cloud to reach the given target
+ * Returns the number of points after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
+
+/**
+ * Mesh stripifier
+ * Converts a previously vertex cache optimized triangle list to triangle strip, stitching strips using restart index or degenerate triangles
+ * Returns the number of indices in the resulting strip, with destination containing new index data
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * Using restart indices can result in ~10% smaller index buffers, but on some GPUs restart indices may result in decreased performance.
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_stripifyBound
+ * restart_index should be 0xffff or 0xffffffff depending on index size, or 0 to use degenerate triangles
+ */
+MESHOPTIMIZER_API size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_stripifyBound(size_t index_count);
+
+/**
+ * Mesh unstripifier
+ * Converts a triangle strip to a triangle list
+ * Returns the number of indices in the resulting list, with destination containing new index data
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_unstripifyBound
+ */
+MESHOPTIMIZER_API size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_unstripifyBound(size_t index_count);
+
+struct meshopt_VertexCacheStatistics
+{
+	unsigned int vertices_transformed;
+	unsigned int warps_executed;
+	float acmr; /* transformed vertices / triangle count; best case 0.5, worst case 3.0, optimum depends on topology */
+	float atvr; /* transformed vertices / vertex count; best case 1.0, worst case 6.0, optimum is 1.0 (each vertex is transformed once) */
+};
+
+/**
+ * Vertex transform cache analyzer
+ * Returns cache hit statistics using a simplified FIFO model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+
+struct meshopt_OverdrawStatistics
+{
+	unsigned int pixels_covered;
+	unsigned int pixels_shaded;
+	float overdraw; /* shaded pixels / covered pixels; best case 1.0 */
+};
+
+/**
+ * Overdraw analyzer
+ * Returns overdraw statistics using a software rasterizer
+ * Results may not match actual GPU performance
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+struct meshopt_VertexFetchStatistics
+{
+	unsigned int bytes_fetched;
+	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
+struct meshopt_Meshlet
+{
+	unsigned int vertices[64]; /* meshlet vertices, each referring to the original vertex buffer; array size is why max_vertices <= 64 */
+	unsigned char indices[126][3]; /* micro index buffer: three indices into vertices[] per triangle; array size is why max_triangles <= 126 */
+	unsigned char triangle_count; /* presumably the number of rows of indices[] in use - confirm against meshopt_buildMeshlets */
+	unsigned char vertex_count; /* presumably the number of entries of vertices[] in use - confirm against meshopt_buildMeshlets */
+};
+
+/**
+ * Experimental: Meshlet builder
+ * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
+ * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ *
+ * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
+
+struct meshopt_Bounds
+{
+	/* bounding sphere, useful for frustum and occlusion culling */
+	float center[3];
+	float radius;
+
+	/* normal cone, useful for backface culling */
+	float cone_apex[3];
+	float cone_axis[3];
+	float cone_cutoff; /* = cos(angle/2) */
+
+	/* normal cone axis and cutoff, stored in 8-bit SNORM format; decode using x/127.0 */
+	signed char cone_axis_s8[3];
+	signed char cone_cutoff_s8;
+};
+
+/**
+ * Experimental: Cluster bounds generator
+ * Creates bounding volumes that can be used for frustum, backface and occlusion culling.
+ *
+ * For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
+ *   dot(view, cone_axis) >= cone_cutoff
+ *
+ * For perspective projection, you can use the formula that needs the cone apex in addition to axis & cutoff:
+ *   dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
+ *
+ * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
+ *   dot(normalize(center - camera_position), cone_axis) >= cone_cutoff + radius / length(center - camera_position)
+ * or an equivalent formula that doesn't have a singularity at center = camera_position:
+ *   dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
+ *
+ * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
+ * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Generates a remap table that can be used to reorder points for spatial locality.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into commonly supported data formats */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
+
+/**
+ * Quantize a float into half-precision floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+inline unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+inline float meshopt_quantizeFloat(float v, int N);
+#endif
+
+/**
+ * C++ template interface
+ *
+ * These functions mirror the C interface the library provides, providing template-based overloads so that
+ * the caller can use an arbitrary type for the index data, both for input and output.
+ * When the supplied type is the same size as that of unsigned int, the wrappers are zero-cost; when it's not,
+ * the wrappers end up allocating memory and copying index data to convert from one type to another.
+ */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+#endif
+
+/* Inline implementation */
+#ifdef __cplusplus
+inline int meshopt_quantizeUnorm(float v, int N)
+{
+	/* clamp into [0, 1]; the negated comparison also sends NaN to 0 */
+	float clamped = !(v >= 0) ? 0.f : v;
+	if (clamped > 1)
+		clamped = 1.f;
+	/* largest N-bit code is 2^N - 1; adding 0.5 before truncation rounds to nearest */
+	return int(clamped * float((1 << N) - 1) + 0.5f);
+}
+
+inline int meshopt_quantizeSnorm(float v, int N)
+{
+	/* bias is chosen from the unclamped input so truncation rounds away from zero; NaN takes the negative branch */
+	const float bias = !(v >= 0) ? -0.5f : 0.5f;
+
+	/* clamp into [-1, 1]; the negated comparison also sends NaN to -1 */
+	float clamped = !(v >= -1) ? -1.f : v;
+	if (clamped > 1)
+		clamped = 1.f;
+	return int(clamped * float((1 << (N - 1)) - 1) + bias);
+}
+
+inline unsigned short meshopt_quantizeHalf(float v)
+{
+	union { float f; unsigned int ui; } u = {v}; /* view the float's bit pattern as an integer */
+	unsigned int ui = u.ui;
+
+	int s = (ui >> 16) & 0x8000; /* sign: bit 31 of the input moved to bit 15 of the result */
+	int em = ui & 0x7fffffff; /* exponent and mantissa with the sign bit cleared */
+
+	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15); 1 << 12 is half of the 13 low bits dropped by the shift */
+	int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+	/* underflow: flush to zero; 113 encodes exponent -14 (anything smaller overrides the value computed above) */
+	h = (em < (113 << 23)) ? 0 : h;
+
+	/* overflow: infinity; 143 encodes exponent 16 */
+	h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+	/* NaN; note that we convert all types of NaN to qNaN (em above 255 << 23 means all-ones exponent with a non-zero mantissa) */
+	h = (em > (255 << 23)) ? 0x7e00 : h;
+
+	return (unsigned short)(s | h); /* recombine the sign with the 15 exponent/mantissa bits */
+}
+
+inline float meshopt_quantizeFloat(float v, int N)
+{
+	union { float f; unsigned int ui; } u = {v}; /* view the float's bit pattern as an integer */
+	unsigned int ui = u.ui;
+
+	const int mask = (1 << (23 - N)) - 1; /* the low (23 - N) mantissa bits that are cleared from the result */
+	const int round = (1 << (23 - N)) >> 1; /* half of the cleared range; added before masking to round to nearest */
+
+	int e = ui & 0x7f800000; /* exponent field only */
+	unsigned int rui = (ui + round) & ~mask; /* rounded value; a mantissa carry moves into the exponent bits */
+
+	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
+	ui = e == 0x7f800000 ? ui : rui;
+
+	/* flush denormals to zero; the whole word is zeroed, so the sign is dropped as well */
+	ui = e == 0 ? 0 : ui;
+
+	u.ui = ui;
+	return u.f;
+}
+#endif
+
+/* Internal implementation helpers */
+#ifdef __cplusplus
+class meshopt_Allocator /* scratch allocator: records each block it hands out and frees them all on destruction */
+{
+public:
+	template <typename T>
+	struct StorageT /* a template so that the static members can be defined in this header (see the definitions after the class) */
+	{
+		static void* (*allocate)(size_t); /* allocation hook shared by every meshopt_Allocator */
+		static void (*deallocate)(void*); /* matching release hook */
+	};
+
+	typedef StorageT<void> Storage; /* the instantiation referred to as meshopt_Allocator::Storage */
+
+	meshopt_Allocator()
+		: blocks()
+		, count(0)
+	{
+	}
+	/* Releases every recorded block, most recently allocated first. */
+	~meshopt_Allocator()
+	{
+		for (size_t i = count; i > 0; --i)
+			Storage::deallocate(blocks[i - 1]);
+	}
+	/* Returns storage for `size` elements of T from Storage::allocate; the block is freed by the destructor, not by the caller. */
+	template <typename T> T* allocate(size_t size)
+	{
+		assert(count < sizeof(blocks) / sizeof(blocks[0])); /* at most 24 live blocks per allocator */
+		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T))); /* byte count saturates to size_t(-1) instead of wrapping */
+		blocks[count++] = result;
+		return result;
+	}
+
+private:
+	void* blocks[24]; /* blocks handed out so far, in allocation order */
+	size_t count; /* number of used entries in blocks */
+};
+
+// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
+template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+#endif
+
+/* Inline implementation for C++ templated wrappers */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T, bool ZeroCopy = sizeof(T) == sizeof(unsigned int)>
+struct meshopt_IndexAdapter;
+
+template <typename T>
+struct meshopt_IndexAdapter<T, false>
+{
+	T* result; /* caller's buffer that receives the converted indices on destruction, or null */
+	unsigned int* data; /* temporary unsigned int copy handed to the C API */
+	size_t count; /* number of elements in data */
+
+	/* Allocates the staging buffer and, when input is non-null, copies the caller's indices into it. */
+	meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
+	    : result(result_)
+	    , data(0)
+	    , count(count_)
+	{
+		/* saturate the byte count so an oversized request is passed on as size_t(-1) rather than a wrapped value */
+		const size_t max_count = size_t(-1) / sizeof(unsigned int);
+		const size_t bytes = count > max_count ? size_t(-1) : count * sizeof(unsigned int);
+
+		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(bytes));
+
+		/* with a null input nothing is copied and the staging buffer is left as allocated */
+		for (size_t i = 0; input && i < count; ++i)
+			data[i] = input[i];
+	}
+
+	/* Converts the staged values back to T in the caller's buffer (if one was given), then frees the staging buffer. */
+	~meshopt_IndexAdapter()
+	{
+		for (size_t i = 0; result && i < count; ++i)
+			result[i] = T(data[i]);
+
+		/* released unconditionally, including for input-only adapters */
+		meshopt_Allocator::Storage::deallocate(data);
+	}
+};
+
+template <typename T>
+struct meshopt_IndexAdapter<T, true>
+{
+	unsigned int* data; /* points straight at the caller's buffer; nothing is copied or allocated */
+	/* Zero-copy case: use the output buffer when given, otherwise view the input (constness cast away) through data. */
+	meshopt_IndexAdapter(T* result, const T* input, size_t)
+	{
+		data = reinterpret_cast<unsigned int*>(result != 0 ? result : const_cast<T*>(input));
+	}
+};
+
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+
+	return meshopt_generateVertexRemap(destination, indices ? in.data : 0, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+
+	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
+{
+	meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_remapIndexBuffer(out.data, indices ? in.data : 0, index_count, remap);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
+}
+
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> inout(indices, indices, index_count);
+
+	return meshopt_optimizeVertexFetch(destination, inout.data, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+	(void)index_size_valid;
+
+	return meshopt_decodeIndexBuffer(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+	(void)index_size_valid;
+
+	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count); /* source indices staged as unsigned int */
+	meshopt_IndexAdapter<T> out(destination, 0, index_count); /* sized for the source index count, as the C API requires */
+	float resulting_error = 0; /* -- GODOT -- sink for the out-parameter added to the C entry point; without it this call resolves back to this template and recurses forever */
+	return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, &resulting_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, target_index_count);
+
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count);
+}
+
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 5);
+
+	return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count); /* strip indices staged as unsigned int */
+	/* output is sized as (index_count - 2) * 3 list indices; clamp to 0 so a strip shorter than 2 cannot wrap around size_t */
+	meshopt_IndexAdapter<T> out(destination, 0, index_count < 2 ? 0 : (index_count - 2) * 3);
+	return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+}
+
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
+}
+
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+	meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+#endif
+
+/**
+ * Copyright (c) 2016-2020 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/thirdparty/meshoptimizer/overdrawanalyzer.cpp b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
new file mode 100644
index 0000000000..8d5859ba39
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
@@ -0,0 +1,230 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Nicolas Capens. Advanced Rasterization. 2004
+namespace meshopt
+{
+
+const int kViewport = 256; // side length of the square overdraw buffer; also the upper clipping bound used by rasterize
+
+struct OverdrawBuffer
+{
+	float z[kViewport][kViewport][2]; // depth per pixel, indexed [y][x][sign]; rasterize keeps the largest value seen
+	unsigned int overdraw[kViewport][kViewport][2]; // per-pixel count of passed depth tests, same indexing as z
+};
+
+#ifndef min // keep an existing min macro if one is already defined
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif // note: function-like macro, the selected argument is evaluated twice
+
+#ifndef max // keep an existing max macro if one is already defined
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#endif // note: function-like macro, the selected argument is evaluated twice
+
+static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
+{ // fits the plane z(x, y) through three points: writes its slopes to dzdx/dzdy and returns the determinant of the 2x2 system
+	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
+	// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
+	// (x2-x1 y2-y1)(dzdx) = (z2-z1)
+	// (x3-x1 y3-y1)(dzdy)   (z3-z1)
+	// we'll solve it with Cramer's rule
+	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
+	float invdet = (det == 0) ? 0 : 1 / det; // degenerate (zero-area in XY) triangles yield zero gradients
+	// Cramer's rule divides the *entire* numerator determinant by det, hence the outer parentheses
+	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
+	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
+
+	return det;
+}
+
+// half-space fixed point triangle rasterizer: depth-tests one triangle into buffer and bumps overdraw for every covered pixel that passes the test
+static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
+{
+	// compute depth gradients
+	float DZx, DZy;
+	float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
+	int sign = det > 0; // 1 when det is positive (handled as backfacing below), else 0; also selects the buffer slot
+
+	// flip backfacing triangles to simplify rasterization logic
+	if (sign)
+	{
+		// flipping v2 & v3 preserves depth gradients since they're based on v1
+		float t;
+		t = v2x, v2x = v3x, v3x = t;
+		t = v2y, v2y = v3y, v3y = t;
+		t = v2z, v2z = v3z, v3z = t;
+
+		// flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
+		v1z = kViewport - v1z;
+		DZx = -DZx;
+		DZy = -DZy;
+	}
+
+	// coordinates, 28.4 fixed point
+	int X1 = int(16.0f * v1x + 0.5f);
+	int X2 = int(16.0f * v2x + 0.5f);
+	int X3 = int(16.0f * v3x + 0.5f);
+
+	int Y1 = int(16.0f * v1y + 0.5f);
+	int Y2 = int(16.0f * v2y + 0.5f);
+	int Y3 = int(16.0f * v3y + 0.5f);
+
+	// bounding rectangle, clipped against viewport
+	// since we rasterize pixels with covered centers, min >0.5 should round up
+	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
+	// so max >= 0.5 should round down
+	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
+	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
+	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
+	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+
+	// deltas, 28.4 fixed point
+	int DX12 = X1 - X2;
+	int DX23 = X2 - X3;
+	int DX31 = X3 - X1;
+
+	int DY12 = Y1 - Y2;
+	int DY23 = Y2 - Y3;
+	int DY31 = Y3 - Y1;
+
+	// fill convention correction
+	int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
+	int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
+	int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);
+
+	// half edge equations, 24.8 fixed point
+	// note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
+	int FX = (minx << 4) + 8;
+	int FY = (miny << 4) + 8;
+	int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1; // TLn - 1 is -1 for edges outside the fill convention, so a value of exactly 0 on them is rejected
+	int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
+	int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
+	float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f); // depth at the first pixel center; 1/16 converts the 28.4 offsets back to pixels
+
+	for (int y = miny; y < maxy; y++)
+	{
+		int CX1 = CY1;
+		int CX2 = CY2;
+		int CX3 = CY3;
+		float ZX = ZY;
+
+		for (int x = minx; x < maxx; x++)
+		{
+			// check if all CXn are non-negative
+			if ((CX1 | CX2 | CX3) >= 0)
+			{
+				if (ZX >= buffer->z[y][x][sign]) // depth test: larger or equal depth passes and replaces the stored value
+				{
+					buffer->z[y][x][sign] = ZX;
+					buffer->overdraw[y][x][sign]++;
+				}
+			}
+
+			// signed left shift is UB for negative numbers so use unsigned-signed casts
+			CX1 -= int(unsigned(DY12) << 4); // step the edge functions by one pixel (16 fixed point units) in x
+			CX2 -= int(unsigned(DY23) << 4);
+			CX3 -= int(unsigned(DY31) << 4);
+			ZX += DZx;
+		}
+
+		// signed left shift is UB for negative numbers so use unsigned-signed casts
+		CY1 += int(unsigned(DX12) << 4); // step the edge functions by one pixel (16 fixed point units) in y
+		CY2 += int(unsigned(DX23) << 4);
+		CY3 += int(unsigned(DX31) << 4);
+		ZY += DZy;
+	}
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // rasterizes the mesh along the three coordinate axes into a kViewport x kViewport buffer and reports covered/shaded pixel totals and their ratio
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // stride expressed in floats instead of bytes
+
+	meshopt_OverdrawStatistics result = {};
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; // bounding box of all vertex_count positions, referenced or not
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions + i * vertex_stride_float;
+
+		for (int j = 0; j < 3; ++j)
+		{
+			minv[j] = min(minv[j], v[j]);
+			maxv[j] = max(maxv[j], v[j]);
+		}
+	}
+
+	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
+	float scale = kViewport / extent; // maps the largest box extent to kViewport units; NOTE(review): no guard here for extent == 0
+
+	float* triangles = allocator.allocate<float>(index_count * 3); // three floats (scaled x, y, z) per index, i.e. vertices are de-indexed
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		const float* v = vertex_positions + index * vertex_stride_float;
+
+		triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
+		triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
+		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
+	}
+
+	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		memset(buffer, 0, sizeof(OverdrawBuffer)); // depth and overdraw counters start from zero for every axis
+
+		for (size_t i = 0; i < index_count; i += 3)
+		{
+			const float* vn0 = &triangles[3 * (i + 0)];
+			const float* vn1 = &triangles[3 * (i + 1)];
+			const float* vn2 = &triangles[3 * (i + 2)];
+
+			switch (axis)
+			{
+			case 0: // screen = (z, y), depth = mesh x
+				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
+				break;
+			case 1: // screen = (x, z), depth = mesh y
+				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
+				break;
+			case 2: // screen = (y, x), depth = mesh z
+				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
+				break;
+			}
+		}
+
+		for (int y = 0; y < kViewport; ++y)
+			for (int x = 0; x < kViewport; ++x)
+				for (int s = 0; s < 2; ++s) // both buffer slots (the two values of sign used by rasterize)
+				{
+					unsigned int overdraw = buffer->overdraw[y][x][s];
+
+					result.pixels_covered += overdraw > 0; // totals accumulate over all three axes
+					result.pixels_shaded += overdraw;
+				}
+	}
+
+	result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f; // 0 when nothing was covered
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/overdrawoptimizer.cpp b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
new file mode 100644
index 0000000000..143656ed76
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
@@ -0,0 +1,333 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+{ // writes one value per cluster: dot(cluster centroid - mesh centroid, unit-length summed cluster normal)
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // stride expressed in floats instead of bytes
+
+	float mesh_centroid[3] = {};
+
+	for (size_t i = 0; i < index_count; ++i) // average over index references, so a vertex counts once per use
+	{
+		const float* p = vertex_positions + vertex_stride_float * indices[i];
+
+		mesh_centroid[0] += p[0];
+		mesh_centroid[1] += p[1];
+		mesh_centroid[2] += p[2];
+	}
+
+	mesh_centroid[0] /= index_count;
+	mesh_centroid[1] /= index_count;
+	mesh_centroid[2] /= index_count;
+
+	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
+	{
+		size_t cluster_begin = clusters[cluster] * 3; // clusters[] holds triangle numbers; * 3 turns them into index offsets
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count; // the last cluster runs to the end of the index buffer
+		assert(cluster_begin < cluster_end);
+
+		float cluster_area = 0;
+		float cluster_centroid[3] = {};
+		float cluster_normal[3] = {};
+
+		for (size_t i = cluster_begin; i < cluster_end; i += 3)
+		{
+			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
+			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
+			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];
+
+			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+			float normalx = p10[1] * p20[2] - p10[2] * p20[1]; // cross(p10, p20): unnormalized face normal
+			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+			float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); // cross product length (twice the triangle area); the factor cancels when dividing by cluster_area
+
+			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3); // triangle centroid weighted by area
+			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
+			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
+			cluster_normal[0] += normalx;
+			cluster_normal[1] += normaly;
+			cluster_normal[2] += normalz;
+			cluster_area += area;
+		}
+
+		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area; // all-degenerate clusters get a zero centroid instead of a division by zero
+
+		cluster_centroid[0] *= inv_cluster_area;
+		cluster_centroid[1] *= inv_cluster_area;
+		cluster_centroid[2] *= inv_cluster_area;
+
+		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
+		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length; // zero-length summed normal stays zero
+
+		cluster_normal[0] *= inv_cluster_normal_length;
+		cluster_normal[1] *= inv_cluster_normal_length;
+		cluster_normal[2] *= inv_cluster_normal_length;
+
+		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};
+
+		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
+	}
+}
+
+static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
+{ // quantizes sort_data into sort_bits-wide keys (scratch: sort_keys) and writes the cluster indices ordered by key into sort_order
+	// compute sort data bounds and renormalize, using fixed point snorm
+	float sort_data_max = 1e-3f; // lower bound keeps the division below away from zero
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		float dpa = fabsf(sort_data[i]);
+
+		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
+	}
+
+	const int sort_bits = 11;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		// note that we flip distribution since high dot product should come first
+		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max); // maps [-max, max] to [1, 0]
+
+		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1); // mask keeps the key inside the histogram range
+	}
+
+	// fill histogram for counting sort
+	unsigned int histogram[1 << sort_bits];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		histogram[sort_keys[i]]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+
+	for (size_t i = 0; i < 1 << sort_bits; ++i) // exclusive prefix sum: histogram[i] becomes the first output slot for key i
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum);
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == cluster_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < cluster_count; ++i) // ascending i keeps clusters with equal keys in their original relative order
+	{
+		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
+	}
+}
+
+static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
+{ // simulates fetching triangle (a, b, c): returns the number of misses (0..3) and advances timestamp once per miss
+	unsigned int cache_misses = 0;
+
+	// if vertex is not in cache, put it in cache
+	if (timestamp - cache_timestamps[a] > cache_size) // a vertex counts as cached while its stamp is at most cache_size behind timestamp
+	{
+		cache_timestamps[a] = timestamp++; // stamps are written on misses only; hits leave them untouched
+		cache_misses++;
+	}
+
+	if (timestamp - cache_timestamps[b] > cache_size)
+	{
+		cache_timestamps[b] = timestamp++;
+		cache_misses++;
+	}
+
+	if (timestamp - cache_timestamps[c] > cache_size)
+	{
+		cache_timestamps[c] = timestamp++;
+		cache_misses++;
+	}
+
+	return cache_misses;
+}
+
+static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
+{ // writes to destination the triangle numbers that start a new cluster (triangle 0 and every triangle with 3 cache misses); returns how many were written
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = cache_size + 1; // with all stamps zeroed, this start value makes every vertex an initial miss
+
+	size_t face_count = index_count / 3;
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+		// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
+		// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
+		// suggests an inefficiency in the vertex cache optimization algorithm
+		// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
+		if (i == 0 || m == 3)
+		{
+			destination[result++] = unsigned(i);
+		}
+	}
+
+	assert(result <= index_count / 3); // at most one boundary per triangle
+
+	return result;
+}
+
+static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
+{ // subdivides each input cluster at points where the running miss ratio drops to threshold * (cluster miss ratio); returns the number of boundaries written to destination
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = 0;
+
+	size_t result = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		size_t start = clusters[it]; // boundaries are triangle numbers, not index offsets
+		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
+		assert(start < end);
+
+		// reset cache
+		timestamp += cache_size + 1; // jumping ahead by cache_size + 1 makes every existing stamp count as a miss
+
+		// measure cluster ACMR
+		unsigned int cluster_misses = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			cluster_misses += m;
+		}
+
+		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start)); // target misses-per-triangle for sub-clusters
+
+		// first cluster always starts from the hard cluster boundary
+		destination[result++] = unsigned(start);
+
+		// reset cache
+		timestamp += cache_size + 1;
+
+		unsigned int running_misses = 0;
+		unsigned int running_faces = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			running_misses += m;
+			running_faces += 1;
+
+			if (float(running_misses) / float(running_faces) <= cluster_threshold)
+			{
+				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
+				// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
+				// cluster is empty; however, the 'result--' after the loop will clean it up
+				destination[result++] = unsigned(i + 1);
+
+				// reset cache
+				timestamp += cache_size + 1;
+
+				running_misses = 0;
+				running_faces = 0;
+			}
+		}
+
+		// each time we reach the target ACMR we flush the cluster
+		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
+		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
+		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
+		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
+		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
+		if (destination[result - 1] != start) // never drop the boundary at 'start' itself
+		{
+			result--;
+		}
+	}
+
+	assert(result >= cluster_count); // every input cluster contributes at least its own start
+	assert(result <= index_count / 3);
+
+	return result;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{ // splits the index buffer into clusters, sorts the clusters by calculateSortData's key and copies them in that order to destination; triangle order inside a cluster is kept
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return; // destination is left untouched in this case
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy; // read from the copy so the writes to destination below cannot clobber unread input
+	}
+
+	unsigned int cache_size = 16; // cache size handed to the boundary generators' cache simulation
+
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count); // scratch shared by both boundary passes; each pass zeroes it first
+
+	// generate hard boundaries from full-triangle cache misses
+	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
+	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);
+
+	// generate soft boundaries
+	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1); // + 1: generateSoftBoundaries may temporarily write a trailing 'end' boundary before removing it
+	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);
+
+	const unsigned int* clusters = soft_clusters;
+	size_t cluster_count = soft_cluster_count;
+
+	// fill sort data
+	float* sort_data = allocator.allocate<float>(cluster_count);
+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+
+	// sort clusters using sort data
+	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
+	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
+	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);
+
+	// fill output buffer
+	size_t offset = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		unsigned int cluster = sort_order[it];
+		assert(cluster < cluster_count);
+
+		size_t cluster_begin = clusters[cluster] * 3; // cluster boundaries are triangle numbers; * 3 gives index offsets
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+		assert(cluster_begin < cluster_end);
+
+		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
+		offset += cluster_end - cluster_begin;
+	}
+
+	assert(offset == index_count); // every index was copied exactly once
+}
diff --git a/thirdparty/meshoptimizer/patches/simplifier_get_resulting_error.patch b/thirdparty/meshoptimizer/patches/simplifier_get_resulting_error.patch
new file mode 100644
index 0000000000..1be38e45d2
--- /dev/null
+++ b/thirdparty/meshoptimizer/patches/simplifier_get_resulting_error.patch
@@ -0,0 +1,96 @@
+diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
+index a442d103c8..fde00f9c82 100644
+--- a/thirdparty/meshoptimizer/meshoptimizer.h
++++ b/thirdparty/meshoptimizer/meshoptimizer.h
+@@ -266,7 +266,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t ver
+  * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!)
+  * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+  */
+-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
++// -- GODOT start --
++//MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
++MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *r_resulting_error);
++// -- GODOT end --
+ 
+ /**
+  * Experimental: Mesh simplifier (sloppy)
+diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
+index bd523275ce..51cf634186 100644
+--- a/thirdparty/meshoptimizer/simplifier.cpp
++++ b/thirdparty/meshoptimizer/simplifier.cpp
+@@ -1143,7 +1143,10 @@ unsigned int* meshopt_simplifyDebugLoop = 0;
+ unsigned int* meshopt_simplifyDebugLoopBack = 0;
+ #endif
+ 
+-size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
++// -- GODOT start --
++//size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
++size_t meshopt_simplify(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *r_resulting_error)
++// -- GODOT end --
+ {
+ 	using namespace meshopt;
+ 
+@@ -1198,10 +1201,13 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
+ 	if (result != indices)
+ 		memcpy(result, indices, index_count * sizeof(unsigned int));
+ 
++// -- GODOT start --
+ #if TRACE
+ 	size_t pass_count = 0;
+-	float worst_error = 0;
++	//float worst_error = 0;
+ #endif
++	float worst_error = 0;
++// -- GODOT end --
+ 
+ 	Collapse* edge_collapses = allocator.allocate<Collapse>(index_count);
+ 	unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count);
+@@ -1213,6 +1219,12 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
+ 	// target_error input is linear; we need to adjust it to match quadricError units
+ 	float error_limit = target_error * target_error;
+ 
++// -- GODOT start --
++	if (r_resulting_error) {
++		*r_resulting_error = 1.0;
++	}
++// -- GODOT end --
++
+ 	while (result_count > target_index_count)
+ 	{
+ 		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop);
+@@ -1257,7 +1269,8 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
+ 		size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
+ 		assert(new_count < result_count);
+ 
+-#if TRACE
++// -- GODOT start --
++//#if TRACE
+ 		float pass_error = 0.f;
+ 		for (size_t i = 0; i < edge_collapse_count; ++i)
+ 		{
+@@ -1267,15 +1280,24 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
+ 				pass_error = c.error;
+ 		}
+ 
+-		pass_count++;
++		//pass_count++;
+ 		worst_error = (worst_error < pass_error) ? pass_error : worst_error;
+ 
++#if TRACE
++		pass_count++;
+ 		printf("pass %d: triangles: %d -> %d, collapses: %d/%d (goal: %d), error: %e (limit %e goal %e)\n", int(pass_count), int(result_count / 3), int(new_count / 3), int(collapses), int(edge_collapse_count), int(edge_collapse_goal), pass_error, error_limit, error_goal);
+ #endif
++// -- GODOT end --
+ 
+ 		result_count = new_count;
+ 	}
+ 
++// -- GODOT start --
++	if (r_resulting_error) {
++		*r_resulting_error = sqrt(worst_error);
++	}
++// -- GODOT end --
++
+ #if TRACE
+ 	printf("passes: %d, worst error: %e\n", int(pass_count), worst_error);
+ #endif
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
new file mode 100644
index 0000000000..b195a8cb5d
--- /dev/null
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -0,0 +1,1562 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <string.h>
+
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
+// Michael Garland. Quadric-based polygonal surface simplification. 1999
+// Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
+// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+namespace meshopt
+{
+
+struct EdgeAdjacency
+{
+	unsigned int* counts; // per vertex: number of directed edges leaving it
+	unsigned int* offsets; // per vertex: start of its edge list inside data
+	unsigned int* data; // edge targets, grouped by source vertex
+};
+
+static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{ // fills adjacency with, for every vertex, the targets of the directed triangle edges (a->b, b->c, c->a) that start at it
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count); // one outgoing edge per index
+
+	// fill edge counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i) // exclusive prefix sum of counts
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill edge data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = b; // offsets double as write cursors during this pass
+		adjacency.data[adjacency.offsets[b]++] = c;
+		adjacency.data[adjacency.offsets[c]++] = a;
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i]; // each cursor advanced by exactly counts[i]
+	}
+}
+
+struct PositionHasher
+{
+	const float* vertex_positions; // base of the position data
+	size_t vertex_stride_float; // distance between consecutive vertices, in floats
+
+	size_t hash(unsigned int index) const
+	{
+		const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float); // hash the raw bit patterns of the three position floats
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (key[0] * 73856093) ^ (key[1] * 19349663) ^ (key[2] * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return memcmp(vertex_positions + lhs * vertex_stride_float, vertex_positions + rhs * vertex_stride_float, sizeof(float) * 3) == 0; // bytewise comparison of the first three floats, not float ==
+	}
+};
+
+static size_t hashBuckets2(size_t count)
+{ // returns the smallest power of two that is >= count (1 for count of 0 or 1)
+	size_t buckets = 1;
+	while (buckets < count)
+		buckets *= 2;
+
+	return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup2(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{ // open-addressing lookup: returns the slot that holds an item equal to key, or the first empty slot met while probing
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0); // power of two, so '& hashmod' works as a modulo
+
+	size_t hashmod = buckets - 1;
+	size_t bucket = hash.hash(key) & hashmod;
+
+	for (size_t probe = 0; probe <= hashmod; ++probe)
+	{
+		T& item = table[bucket];
+
+		if (item == empty)
+			return &item; // caller decides whether to store the key here
+
+		if (hash.equal(item, key))
+			return &item;
+
+		// hash collision, quadratic probing
+		bucket = (bucket + probe + 1) & hashmod; // step grows by one each iteration: +1, +2, +3, ...
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return 0;
+}
+
+static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{ // fills remap (vertex -> first vertex with bit-identical position) and wedge (cyclic list linking vertices that share a position)
+	PositionHasher hasher = {vertex_positions_data, vertex_positions_stride / sizeof(float)};
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // all bytes 0xff, i.e. every slot starts as ~0u (empty)
+
+	// build forward remap: for each vertex, which other (canonical) vertex does it map to?
+	// we use position equivalence for this, and remap vertices to other existing vertices
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup2(table, table_size, hasher, index, ~0u);
+
+		if (*entry == ~0u)
+			*entry = index; // first vertex seen at this position becomes the canonical one
+
+		remap[index] = *entry;
+	}
+
+	// build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
+	// entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
+	for (size_t i = 0; i < vertex_count; ++i)
+		wedge[i] = unsigned(i);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != i)
+		{
+			unsigned int r = remap[i];
+
+			wedge[i] = wedge[r]; // splice i into the cycle right after its canonical vertex r
+			wedge[r] = unsigned(i);
+		}
+}
+
+enum VertexKind
+{
+	Kind_Manifold, // not on an attribute seam, not on any boundary
+	Kind_Border,   // not on an attribute seam, has exactly two open edges
+	Kind_Seam,     // on an attribute seam with exactly two attribute seam edges
+	Kind_Complex,  // none of the above; these vertices can move as long as all wedges move to the target vertex
+	Kind_Locked,   // none of the above; these vertices can't move
+
+	Kind_Count // number of kinds above; used as the dimension of kCanCollapse and kHasOpposite
+};
+
+// manifold vertices can collapse onto anything
+// border/seam vertices can only be collapsed onto border/seam respectively
+// complex vertices can collapse onto complex/locked
+// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
+// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
+const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
+    {1, 1, 1, 1, 1}, // Kind_Manifold: may collapse onto any kind
+    {0, 1, 0, 0, 0}, // Kind_Border: only onto Border
+    {0, 0, 1, 0, 0}, // Kind_Seam: only onto Seam
+    {0, 0, 0, 1, 1}, // Kind_Complex: onto Complex or Locked
+    {0, 0, 0, 0, 0}, // Kind_Locked: never collapses
+};
+
+// if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
+// note that for seam edges, the opposite edge isn't present in the attribute-based topology
+// but is present if you consider a position-only mesh variant
+const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
+    {1, 1, 1, 0, 1}, // Kind_Manifold row; NOTE(review): presumably indexed by the kinds of an edge's two endpoints - usage is outside this excerpt
+    {1, 0, 1, 0, 0}, // Kind_Border row
+    {1, 1, 1, 0, 1}, // Kind_Seam row
+    {0, 0, 0, 0, 0}, // Kind_Complex row
+    {1, 0, 1, 0, 0}, // Kind_Locked row
+};
+
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b)
+{ // returns true when b appears in a's edge list, i.e. the directed edge a->b exists
+	unsigned int count = adjacency.counts[a];
+	const unsigned int* data = adjacency.data + adjacency.offsets[a]; // start of a's edge targets
+
+	for (size_t i = 0; i < count; ++i) // linear scan over a's outgoing edges
+		if (data[i] == b)
+			return true;
+
+	return false;
+}
+
+static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge)
+{
+	memset(loop, -1, vertex_count * sizeof(unsigned int)); // every byte 0xff, i.e. each entry starts as ~0u ("no open edge")
+	memset(loopback, -1, vertex_count * sizeof(unsigned int));
+
+	// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
+	// note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
+	// but here it's okay to fill the data out for other types of vertices as well
+	unsigned int* openinc = loopback;
+	unsigned int* openout = loop;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int vertex = unsigned(i);
+
+		unsigned int count = adjacency.counts[vertex];
+		const unsigned int* data = adjacency.data + adjacency.offsets[vertex];
+
+		for (size_t j = 0; j < count; ++j)
+		{
+			unsigned int target = data[j];
+
+			if (!hasEdge(adjacency, target, vertex)) // vertex->target has no reverse edge, so it is an open half-edge
+			{
+				openinc[target] = (openinc[target] == ~0u) ? vertex : target; // first open edge stores the other end, a second one stores the vertex itself
+				openout[vertex] = (openout[vertex] == ~0u) ? target : vertex;
+			}
+		}
+	}
+
+#if TRACE
+	size_t lockedstats[4] = {};
+#define TRACELOCKED(i) lockedstats[i]++;
+#else
+#define TRACELOCKED(i) (void)0
+#endif
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] == i) // i is the representative of its position; classify it directly
+		{
+			if (wedge[i] == i)
+			{
+				// no attribute seam, need to check if it's manifold
+				unsigned int openi = openinc[i], openo = openout[i];
+
+				// note: we classify any vertices with no open edges as manifold
+				// this is technically incorrect - if 4 triangles share an edge, we'll classify vertices as manifold
+				// it's unclear if this is a problem in practice
+				if (openi == ~0u && openo == ~0u)
+				{
+					result[i] = Kind_Manifold;
+				}
+				else if (openi != i && openo != i) // neither direction was marked as "more than one open edge"
+				{
+					result[i] = Kind_Border;
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACELOCKED(0);
+				}
+			}
+			else if (wedge[wedge[i]] == i) // the wedge cycle has exactly two members: i and w
+			{
+				// attribute seam; need to distinguish between Seam and Locked
+				unsigned int w = wedge[i];
+				unsigned int openiv = openinc[i], openov = openout[i];
+				unsigned int openiw = openinc[w], openow = openout[w];
+
+				// seam should have one open half-edge for each vertex, and the edges need to "connect" - point to the same vertex post-remap
+				if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
+				    openiw != ~0u && openiw != w && openow != ~0u && openow != w)
+				{
+					if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
+					{
+						result[i] = Kind_Seam;
+					}
+					else
+					{
+						result[i] = Kind_Locked;
+						TRACELOCKED(1);
+					}
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACELOCKED(2);
+				}
+			}
+			else
+			{
+				// more than one vertex maps to this one; we don't have classification available
+				result[i] = Kind_Locked;
+				TRACELOCKED(3);
+			}
+		}
+		else
+		{
+			assert(remap[i] < i); // the representative must already have been classified by an earlier iteration
+
+			result[i] = result[remap[i]]; // non-representative vertices inherit the kind of their representative
+		}
+	}
+
+#if TRACE
+	printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n",
+	       int(lockedstats[0]), int(lockedstats[1]), int(lockedstats[2]), int(lockedstats[3]));
+#endif
+}
+
+struct Vector3 // plain triple of floats; used in this file for positions, edge vectors and normals
+{
+	float x, y, z;
+};
+// -- GODOT start --
+//static void rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+// -- GODOT end --
+// copies the strided positions into result, normalized by the largest bounding-box extent; returns that extent (Godot change)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // stride is given in bytes, indexing below is in floats
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		result[i].x = v[0];
+		result[i].y = v[1];
+		result[i].z = v[2];
+
+		for (int j = 0; j < 3; ++j)
+		{
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
+		}
+	}
+
+	float extent = 0.f; // becomes the largest of the three per-axis spans
+
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = extent == 0 ? 0.f : 1.f / extent; // a zero extent yields scale 0, so every output coordinate becomes 0
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		result[i].x = (result[i].x - minv[0]) * scale; // the same scale is applied to all axes
+		result[i].y = (result[i].y - minv[1]) * scale;
+		result[i].z = (result[i].z - minv[2]) * scale;
+	}
+// -- GODOT start --
+	return extent;
+// -- GODOT end --
+
+}
+
+struct Quadric // coefficients of the quadratic form evaluated by quadricError, plus an accumulated weight
+{
+	float a00, a11, a22; // coefficients of x*x, y*y, z*z
+	float a10, a20, a21; // coefficients of x*y, x*z, y*z (each doubled during evaluation)
+	float b0, b1, b2, c; // linear coefficients for x, y, z (doubled during evaluation) and the constant term
+	float w;             // accumulated weight; quadricError divides by it
+};
+
+struct Collapse // candidate edge collapse: v0 is moved onto v1
+{
+	unsigned int v0;
+	unsigned int v1;
+
+	union // the three members share storage and are used at different stages
+	{
+		unsigned int bidi;    // written by pickEdgeCollapses: nonzero if the edge may collapse in either direction
+		float error;          // written by rankEdgeCollapses, overwriting bidi
+		unsigned int errorui; // bit pattern of error, read by sortEdgeCollapses to build sort keys
+	};
+};
+
+static float normalize(Vector3& v)
+{
+	// returns the original length; v is only rescaled when that length is positive
+	const float len = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
+	if (!(len > 0))
+		return len;
+
+	v.x = v.x / len;
+	v.y = v.y / len;
+	v.z = v.z / len;
+
+	return len;
+}
+
+static void quadricAdd(Quadric& Q, const Quadric& R) // accumulates R into Q field by field, including the weight w
+{
+	Q.a00 += R.a00;
+	Q.a11 += R.a11;
+	Q.a22 += R.a22;
+	Q.a10 += R.a10;
+	Q.a20 += R.a20;
+	Q.a21 += R.a21;
+	Q.b0 += R.b0;
+	Q.b1 += R.b1;
+	Q.b2 += R.b2;
+	Q.c += R.c;
+	Q.w += R.w;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v) // evaluates |quadratic form of Q at v| divided by Q.w
+{
+	float rx = Q.b0;
+	float ry = Q.b1;
+	float rz = Q.b2;
+
+	rx += Q.a10 * v.y; // each off-diagonal term is attached to exactly one of the three partial sums
+	ry += Q.a21 * v.z;
+	rz += Q.a20 * v.x;
+
+	rx *= 2; // linear and off-diagonal terms are doubled here, so they are stored un-doubled in Quadric
+	ry *= 2;
+	rz *= 2;
+
+	rx += Q.a00 * v.x;
+	ry += Q.a11 * v.y;
+	rz += Q.a22 * v.z;
+
+	float r = Q.c;
+	r += rx * v.x;
+	r += ry * v.y;
+	r += rz * v.z;
+
+	float s = Q.w == 0.f ? 0.f : 1.f / Q.w; // a quadric with zero weight reports zero error
+
+	return fabsf(r) * s;
+}
+
+static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w) // Q = w * (a*x + b*y + c*z + d)^2 in quadricError's convention
+{
+	float aw = a * w;
+	float bw = b * w;
+	float cw = c * w;
+	float dw = d * w;
+
+	Q.a00 = a * aw;
+	Q.a11 = b * bw;
+	Q.a22 = c * cw;
+	Q.a10 = a * bw; // cross terms are stored once; quadricError multiplies them by 2
+	Q.a20 = a * cw;
+	Q.a21 = b * cw;
+	Q.b0 = a * dw; // linear terms are stored once as well
+	Q.b1 = b * dw;
+	Q.b2 = c * dw;
+	Q.c = d * dw;
+	Q.w = w;
+}
+
+static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w) // Q = w * squared distance to the point (x, y, z)
+{
+	// we need to encode (x - X) ^ 2 + (y - Y)^2 + (z - Z)^2 into the quadric
+	Q.a00 = w;
+	Q.a11 = w;
+	Q.a22 = w;
+	Q.a10 = 0.f;
+	Q.a20 = 0.f;
+	Q.a21 = 0.f;
+	Q.b0 = -x * w; // quadricError doubles the linear terms, so the factor of 2 from expanding the square must not be stored here
+	Q.b1 = -y * w; // with -2 * coordinate * w the error evaluated at the point itself would not be zero
+	Q.b2 = -z * w;
+	Q.c = (x * x + y * y + z * z) * w;
+	Q.w = w;
+}
+
+static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) // plane quadric of triangle p0 p1 p2
+{
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+	// normal = cross(p1 - p0, p2 - p0)
+	Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+	float area = normalize(normal); // length of the cross product (twice the triangle area); normal becomes unit length unless it was zero
+
+	float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+
+	// we use sqrtf(area) so that the error is scaled linearly; this tends to improve silhouettes
+	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, sqrtf(area) * weight);
+}
+
+static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) // plane through edge p0-p1, perpendicular to the triangle
+{
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	float length = normalize(p10); // p10 becomes the unit edge direction; length is the edge length
+
+	// p20p = length of projection of p2-p0 onto normalize(p1 - p0)
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+	float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z;
+
+	// normal = altitude of triangle from point p2 onto edge p1-p0
+	Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - p10.z * p20p};
+	normalize(normal);
+
+	float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+
+	// note: the weight is scaled linearly with edge length; this has to match the triangle weight
+	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+{
+	// every triangle contributes its plane quadric (weight 1) to the position-remapped slot of each corner
+	for (size_t base = 0; base < index_count; base += 3)
+	{
+		const unsigned int* tri = indices + base;
+
+		Quadric plane;
+		quadricFromTriangle(plane, vertex_positions[tri[0]], vertex_positions[tri[1]], vertex_positions[tri[2]], 1.f);
+
+		// corners are visited in index order so the accumulation order stays unchanged
+		for (int corner = 0; corner < 3; ++corner)
+			quadricAdd(vertex_quadrics[remap[tri[corner]]], plane);
+
+	}
+}
+
+static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0}; // successor of each corner within a triangle
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// check that either i0 or i1 are border/seam and are on the same edge loop
+			// note that we need to add the error even for edges that connect e.g. border & locked
+			// if we don't do that, the adjacent border->border edge won't have correct errors for corners
+			if (k0 != Kind_Border && k0 != Kind_Seam && k1 != Kind_Border && k1 != Kind_Seam)
+				continue;
+
+			if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
+				continue;
+
+			// seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			unsigned int i2 = indices[i + next[next[e]]]; // third corner of the triangle, opposite the edge
+
+			// we try hard to maintain border edge geometry; seam edges can move more freely
+			// due to topological restrictions on collapses, seam quadrics slightly improve collapse structure but aren't critical
+			const float kEdgeWeightSeam = 1.f;
+			const float kEdgeWeightBorder = 10.f;
+
+			float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
+
+			Quadric Q;
+			quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+			quadricAdd(vertex_quadrics[remap[i0]], Q); // only the two edge endpoints receive the edge quadric
+			quadricAdd(vertex_quadrics[remap[i1]], Q);
+		}
+	}
+}
+
+static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+{
+	size_t collapse_count = 0; // number of entries written to collapses; also the return value
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			// this can happen either when input has a zero-length edge, or when we perform collapses for complex
+			// topology w/seams and collapse a manifold vertex that connects to both wedges onto one of them
+			// we leave edges like this alone since they may be important for preserving mesh integrity
+			if (remap[i0] == remap[i1])
+				continue;
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// the edge has to be collapsible in at least one direction
+			if (!(kCanCollapse[k0][k1] | kCanCollapse[k1][k0]))
+				continue;
+
+			// manifold and seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			// two vertices are on a border or a seam, but there's no direct edge between them
+			// this indicates that they belong to two different edge loops and we should not collapse this edge
+			// loop[] tracks half edges so we only need to check i0->i1
+			if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			// edge can be collapsed in either direction - we will pick the one with minimum error
+			// note: we evaluate error later during collapse ranking, here we just tag the edge as bidirectional
+			if (kCanCollapse[k0][k1] & kCanCollapse[k1][k0])
+			{
+				Collapse c = {i0, i1, {/* bidi= */ 1}};
+				collapses[collapse_count++] = c;
+			}
+			else
+			{
+				// edge can only be collapsed in one direction
+				unsigned int e0 = kCanCollapse[k0][k1] ? i0 : i1; // e0 is the vertex that is allowed to move
+				unsigned int e1 = kCanCollapse[k0][k1] ? i1 : i0; // e1 is the vertex it moves onto
+
+				Collapse c = {e0, e1, {/* bidi= */ 0}};
+				collapses[collapse_count++] = c;
+			}
+		}
+	}
+
+	return collapse_count;
+}
+
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap)
+{
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		Collapse& c = collapses[i];
+
+		unsigned int i0 = c.v0;
+		unsigned int i1 = c.v1;
+
+		// most edges are bidirectional which means we need to evaluate errors for two collapses
+		// to keep this code branchless we just use the same edge for unidirectional edges
+		unsigned int j0 = c.bidi ? i1 : i0; // bidi must be read before c.error is written below (they share storage)
+		unsigned int j1 = c.bidi ? i0 : i1;
+
+		const Quadric& qi = vertex_quadrics[remap[i0]];
+		const Quadric& qj = vertex_quadrics[remap[j0]];
+
+		float ei = quadricError(qi, vertex_positions[i1]); // cost of moving i0 onto i1
+		float ej = quadricError(qj, vertex_positions[j1]); // cost of the reverse move (same as ei for unidirectional edges)
+
+		// pick edge direction with minimal error
+		c.v0 = ei <= ej ? i0 : j0;
+		c.v1 = ei <= ej ? i1 : j1;
+		c.error = ei <= ej ? ei : ej;
+	}
+}
+
+#if TRACE > 1
+static void dumpEdgeCollapses(const Collapse* collapses, size_t collapse_count, const unsigned char* vertex_kind) // TRACE > 1 only: prints per kind-pair collapse counts and minimum error
+{
+	size_t ckinds[Kind_Count][Kind_Count] = {};
+	float cerrors[Kind_Count][Kind_Count] = {};
+
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			cerrors[k0][k1] = FLT_MAX; // start above any real error so the minimum below works
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		unsigned int i0 = collapses[i].v0;
+		unsigned int i1 = collapses[i].v1;
+
+		unsigned char k0 = vertex_kind[i0];
+		unsigned char k1 = vertex_kind[i1];
+
+		ckinds[k0][k1]++;
+		cerrors[k0][k1] = (collapses[i].error < cerrors[k0][k1]) ? collapses[i].error : cerrors[k0][k1];
+	}
+
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			if (ckinds[k0][k1]) // kind pairs with no candidates are not printed
+				printf("collapses %d -> %d: %d, min error %e\n", k0, k1, int(ckinds[k0][k1]), cerrors[k0][k1]);
+}
+
+static void dumpLockedCollapses(const unsigned int* indices, size_t index_count, const unsigned char* vertex_kind) // TRACE > 1 only: counts edges that cannot collapse in either direction
+{
+	size_t locked_collapses[Kind_Count][Kind_Count] = {};
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			locked_collapses[k0][k1] += !kCanCollapse[k0][k1] && !kCanCollapse[k1][k0]; // adds 1 when neither direction is allowed
+		}
+	}
+
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			if (locked_collapses[k0][k1])
+				printf("locked collapses %d -> %d: %d\n", k0, k1, int(locked_collapses[k0][k1]));
+}
+#endif
+
+static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count) // fills sort_order with collapse indices in ascending key order
+{
+	const int sort_bits = 11; // keys are the 11 bits that follow the sign bit of the error's bit pattern, so ordering is approximate
+
+	// fill histogram for counting sort
+	unsigned int histogram[1 << sort_bits];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+
+		histogram[key]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+
+	for (size_t i = 0; i < 1 << sort_bits; ++i)
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum); // each bucket now holds the output position of its first element
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == collapse_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+
+		sort_order[histogram[key]++] = unsigned(i); // collapses sharing a key keep their original relative order
+	}
+}
+
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, size_t triangle_collapse_goal, float error_goal, float error_limit)
+{
+	size_t edge_collapses = 0;     // number of collapses applied; this is the return value
+	size_t triangle_collapses = 0; // estimated number of triangles removed so far
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		const Collapse& c = collapses[collapse_order[i]];
+
+		if (c.error > error_limit) // hard limit: stop at the first candidate above it
+			break;
+
+		if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 10) // soft limit: only enforced after 10% of the goal is reached
+			break;
+
+		if (triangle_collapses >= triangle_collapse_goal)
+			break;
+
+		unsigned int i0 = c.v0;
+		unsigned int i1 = c.v1;
+
+		unsigned int r0 = remap[i0];
+		unsigned int r1 = remap[i1];
+
+		// we don't collapse vertices that had source or target vertex involved in a collapse
+		// it's important to not move the vertices twice since it complicates the tracking/remapping logic
+		// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
+		if (collapse_locked[r0] | collapse_locked[r1])
+			continue;
+
+		assert(collapse_remap[r0] == r0);
+		assert(collapse_remap[r1] == r1);
+
+		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); // the target accumulates the quadric of the vertex collapsed onto it
+
+		if (vertex_kind[i0] == Kind_Complex)
+		{
+			unsigned int v = i0;
+
+			do // walk the whole wedge cycle of i0 and send every member to r1
+			{
+				collapse_remap[v] = r1;
+				v = wedge[v];
+			} while (v != i0);
+		}
+		else if (vertex_kind[i0] == Kind_Seam)
+		{
+			// remap v0 to v1 and seam pair of v0 to seam pair of v1
+			unsigned int s0 = wedge[i0];
+			unsigned int s1 = wedge[i1];
+
+			assert(s0 != i0 && s1 != i1);
+			assert(wedge[s0] == i0 && wedge[s1] == i1);
+
+			collapse_remap[i0] = i1;
+			collapse_remap[s0] = s1;
+		}
+		else
+		{
+			assert(wedge[i0] == i0);
+
+			collapse_remap[i0] = i1;
+		}
+
+		collapse_locked[r0] = 1; // locks are keyed by position representative, not by wedge
+		collapse_locked[r1] = 1;
+
+		// border edges collapse 1 triangle, other edges collapse 2 or more
+		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+		edge_collapses++;
+	}
+
+	return edge_collapses;
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+{
+	// compacts the index buffer in place; the return value is the number of indices kept
+	size_t kept = 0;
+
+	for (size_t read = 0; read < index_count; read += 3)
+	{
+		unsigned int tri[3];
+		for (int k = 0; k < 3; ++k)
+		{
+			tri[k] = collapse_remap[indices[read + k]];
+			// we never move the vertex twice during a single pass
+			assert(collapse_remap[tri[k]] == tri[k]);
+		}
+
+		// triangles that lost a corner to a collapse are degenerate and get dropped
+		if (tri[0] == tri[1] || tri[0] == tri[2] || tri[1] == tri[2])
+			continue;
+
+		indices[kept++] = tri[0];
+		indices[kept++] = tri[1];
+		indices[kept++] = tri[2];
+	}
+
+	return kept;
+}
+
+static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsigned int* collapse_remap)
+{
+	for (size_t v = 0; v < vertex_count; ++v)
+	{
+		const unsigned int next = loop[v];
+		if (next == ~0u)
+			continue; // no loop edge recorded for this vertex
+
+		// follow the collapse; when the neighbor collapsed onto v itself (seam edge collapsed against
+		// the loop direction) skip over it and take the neighbor's own successor instead
+		const unsigned int moved = collapse_remap[next];
+		loop[v] = (v == moved) ? loop[next] : moved;
+	}
+}
+
+struct CellHasher // hashes and compares vertex indices by the id stored for them in vertex_ids
+{
+	const unsigned int* vertex_ids; // not owned; indexed by the vertex index passed to hash/equal
+
+	size_t hash(unsigned int i) const
+	{
+		unsigned int h = vertex_ids[i];
+
+		// MurmurHash2 finalizer
+		h ^= h >> 13;
+		h *= 0x5bd1e995;
+		h ^= h >> 15;
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return vertex_ids[lhs] == vertex_ids[rhs]; // two vertices are equal when their ids match
+	}
+};
+
+struct IdHasher
+{
+	// ids are compared by value, so two keys match exactly when they are the same number
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return !(lhs != rhs);
+	}
+
+	// scrambles the id with the MurmurHash2 finalizer (xor-shift 13, multiply, xor-shift 15)
+	size_t hash(unsigned int id) const
+	{
+		const unsigned int a = id ^ (id >> 13);
+		const unsigned int b = a * 0x5bd1e995u;
+
+		// the result is widened from unsigned int to size_t on return
+		return b ^ (b >> 15);
+	}
+};
+
+struct TriangleHasher // hashes and compares triangles by their three indices, in stored order
+{
+	unsigned int* indices; // not owned; triangle i occupies indices[i * 3 .. i * 3 + 2]
+
+	size_t hash(unsigned int i) const
+	{
+		const unsigned int* tri = indices + i * 3;
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (tri[0] * 73856093) ^ (tri[1] * 19349663) ^ (tri[2] * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const unsigned int* lt = indices + lhs * 3;
+		const unsigned int* rt = indices + rhs * 3;
+
+		return lt[0] == rt[0] && lt[1] == rt[1] && lt[2] == rt[2]; // order-sensitive: rotated triangles do not compare equal
+	}
+};
+
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size) // one packed grid-cell id per vertex
+{
+	assert(grid_size >= 1 && grid_size <= 1024);
+	float cell_scale = float(grid_size - 1);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const Vector3& v = vertex_positions[i]; // assumes coordinates are already in [0, 1] (e.g. from rescalePositions) - TODO confirm at call sites
+
+		int xi = int(v.x * cell_scale + 0.5f); // +0.5 then truncation rounds to the nearest cell for non-negative input
+		int yi = int(v.y * cell_scale + 0.5f);
+		int zi = int(v.z * cell_scale + 0.5f);
+
+		vertex_ids[i] = (xi << 20) | (yi << 10) | zi; // 10 bits per axis, matching the 1024 limit asserted above
+	}
+}
+
+static size_t countTriangles(const unsigned int* vertex_ids, const unsigned int* indices, size_t index_count)
+{
+	// a triangle survives only if its three corners land on three different ids
+	size_t surviving = 0;
+
+	for (size_t base = 0; base < index_count; base += 3)
+	{
+		const unsigned int a = vertex_ids[indices[base]];
+		const unsigned int b = vertex_ids[indices[base + 1]];
+		const unsigned int c = vertex_ids[indices[base + 2]];
+		surviving += (a == b || a == c || b == c) ? 0 : 1;
+	}
+
+	return surviving;
+}
+
+static size_t fillVertexCells(unsigned int* table, size_t table_size, unsigned int* vertex_cells, const unsigned int* vertex_ids, size_t vertex_count) // assigns a cell index to every vertex; returns the number of distinct cells
+{
+	CellHasher hasher = {vertex_ids};
+
+	memset(table, -1, table_size * sizeof(unsigned int)); // all slots start as ~0u, the empty marker passed to hashLookup2
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int* entry = hashLookup2(table, table_size, hasher, unsigned(i), ~0u); // presumably the slot of a vertex with the same id, or an empty slot - defined outside this view
+
+		if (*entry == ~0u)
+		{
+			*entry = unsigned(i); // first vertex seen with this id opens a new cell
+			vertex_cells[i] = unsigned(result++);
+		}
+		else
+		{
+			vertex_cells[i] = vertex_cells[*entry]; // reuse the cell of the earlier vertex stored in the table
+		}
+	}
+
+	return result;
+}
+
+static size_t countVertexCells(unsigned int* table, size_t table_size, const unsigned int* vertex_ids, size_t vertex_count) // counts lookups that land on an empty slot, i.e. ids not seen before
+{
+	IdHasher hasher;
+
+	memset(table, -1, table_size * sizeof(unsigned int)); // all slots start as ~0u, the empty marker passed to hashLookup2
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int id = vertex_ids[i];
+		unsigned int* entry = hashLookup2(table, table_size, hasher, id, ~0u); // the table stores ids here, not vertex indices
+
+		result += (*entry == ~0u);
+		*entry = id; // written unconditionally; for an occupied slot this presumably stores the same id again
+	}
+
+	return result;
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* vertex_cells) // triangle overload: accumulates plane quadrics per cell
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		unsigned int c0 = vertex_cells[i0];
+		unsigned int c1 = vertex_cells[i1];
+		unsigned int c2 = vertex_cells[i2];
+
+		bool single_cell = (c0 == c1) & (c0 == c2); // all three corners fall into the same cell
+
+		Quadric Q;
+		quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], single_cell ? 3.f : 1.f);
+
+		if (single_cell)
+		{
+			quadricAdd(cell_quadrics[c0], Q); // added once with weight 3 instead of three times with weight 1
+		}
+		else
+		{
+			quadricAdd(cell_quadrics[c0], Q);
+			quadricAdd(cell_quadrics[c1], Q);
+			quadricAdd(cell_quadrics[c2], Q);
+		}
+	}
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* vertex_cells)
+{
+	// point-cloud overload: each vertex adds a unit-weight point quadric to the cell it belongs to
+	const Vector3* position = vertex_positions;
+	const unsigned int* cell = vertex_cells;
+
+	for (size_t left = vertex_count; left > 0; --left, ++position, ++cell)
+	{
+		Quadric point;
+		quadricFromPoint(point, position->x, position->y, position->z, 1.f);
+		quadricAdd(cell_quadrics[*cell], point);
+	}
+}
+
+static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count) // picks, per cell, the vertex with the lowest quadric error
+{
+	memset(cell_remap, -1, cell_count * sizeof(unsigned int)); // ~0u marks a cell with no vertex chosen yet
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int cell = vertex_cells[i];
+		float error = quadricError(cell_quadrics[cell], vertex_positions[i]);
+
+		if (cell_remap[cell] == ~0u || cell_errors[cell] > error) // cell_errors is only read after the cell has been assigned once
+		{
+			cell_remap[cell] = unsigned(i);
+			cell_errors[cell] = error;
+		}
+	}
+}
+
+static size_t filterTriangles(unsigned int* destination, unsigned int* tritable, size_t tritable_size, const unsigned int* indices, size_t index_count, const unsigned int* vertex_cells, const unsigned int* cell_remap)
+{
+	TriangleHasher hasher = {destination}; // hashes triangles already written to destination
+
+	memset(tritable, -1, tritable_size * sizeof(unsigned int));
+
+	size_t result = 0; // number of unique triangles written so far
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int c0 = vertex_cells[indices[i + 0]];
+		unsigned int c1 = vertex_cells[indices[i + 1]];
+		unsigned int c2 = vertex_cells[indices[i + 2]];
+
+		if (c0 != c1 && c0 != c2 && c1 != c2) // triangles with two corners in one cell are dropped
+		{
+			unsigned int a = cell_remap[c0];
+			unsigned int b = cell_remap[c1];
+			unsigned int c = cell_remap[c2];
+
+			if (b < a && b < c) // rotate so the smallest index comes first; the cyclic order is preserved
+			{
+				unsigned int t = a;
+				a = b, b = c, c = t;
+			}
+			else if (c < a && c < b)
+			{
+				unsigned int t = c;
+				c = b, b = a, a = t;
+			}
+
+			destination[result * 3 + 0] = a; // written tentatively; the slot is reused if the triangle turns out to be a duplicate
+			destination[result * 3 + 1] = b;
+			destination[result * 3 + 2] = c;
+
+			unsigned int* entry = hashLookup2(tritable, tritable_size, hasher, unsigned(result), ~0u);
+
+			if (*entry == ~0u)
+				*entry = unsigned(result++); // first occurrence: keep it and advance the write position
+		}
+	}
+
+	return result * 3; // returned as an index count, not a triangle count
+}
+
+static float interpolate(float y, float x0, float y0, float x1, float y1, float x2, float y2) // estimates the x at which the curve through the three (x, y) samples reaches y
+{
+	// three point interpolation from "revenge of interpolation search" paper
+	float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
+	float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
+	return x1 + num / den; // NOTE(review): den is not checked for zero; callers must avoid inputs that make it vanish
+}
+
+} // namespace meshopt
+
+#ifndef NDEBUG
+unsigned char* meshopt_simplifyDebugKind = 0;    // if set non-null, meshopt_simplify copies vertex_count vertex kinds here
+unsigned int* meshopt_simplifyDebugLoop = 0;     // if set non-null, receives a copy of the loop array (vertex_count entries)
+unsigned int* meshopt_simplifyDebugLoopBack = 0; // if set non-null, receives a copy of the loopback array (vertex_count entries)
+#endif
+
+// -- GODOT start --
+//size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+size_t meshopt_simplify(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *r_resulting_error)
+// -- GODOT end --
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* result = destination; // destination may alias indices; see the memcpy guard below
+
+	// build adjacency information
+	EdgeAdjacency adjacency = {};
+	buildEdgeAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// build position remap that maps each vertex to the one with identical position
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator);
+
+	// classify vertices; vertex kind determines collapse rules, see kCanCollapse
+	unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
+	unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
+	classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge);
+
+#if TRACE
+	size_t unique_positions = 0;
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_positions += remap[i] == i;
+
+	printf("position remap: %d vertices => %d positions\n", int(vertex_count), int(unique_positions));
+
+	size_t kinds[Kind_Count] = {};
+	for (size_t i = 0; i < vertex_count; ++i)
+		kinds[vertex_kind[i]] += remap[i] == i;
+
+	printf("kinds: manifold %d, border %d, seam %d, complex %d, locked %d\n",
+	       int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked]));
+#endif
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+// -- GODOT start --
+	//rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+	float extent = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+// -- GODOT end --
+
+	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
+	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
+
+	fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap);
+	fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
+
+	if (result != indices)
+		memcpy(result, indices, index_count * sizeof(unsigned int));
+
+// -- GODOT start --
+#if TRACE
+	size_t pass_count = 0;
+	//float worst_error = 0;
+#endif
+	float worst_error = 0; // largest error (quadricError units, normalized space) of any collapse applied so far
+// -- GODOT end --
+
+	Collapse* edge_collapses = allocator.allocate<Collapse>(index_count);
+	unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count);
+	unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count);
+
+	size_t result_count = index_count;
+
+	// target_error input is linear; we need to adjust it to match quadricError units
+	float error_limit = target_error * target_error;
+
+// -- GODOT start --
+	if (r_resulting_error) {
+		*r_resulting_error = 1.0; // placeholder only; always overwritten after the loop below
+	}
+// -- GODOT end --
+
+	while (result_count > target_index_count)
+	{
+		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop);
+
+		// no edges can be collapsed any more due to topology restrictions
+		if (edge_collapse_count == 0)
+			break;
+
+		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap);
+
+#if TRACE > 1
+		dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
+#endif
+
+		sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
+
+		// most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit
+		// note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses
+		size_t triangle_collapse_goal = (result_count - target_index_count) / 3;
+		size_t edge_collapse_goal = triangle_collapse_goal / 2;
+
+		// we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked
+		// as they will share vertices with other successful collapses, we need to increase the acceptable error by this factor
+		const float kPassErrorBound = 1.5f;
+
+		float error_goal = edge_collapse_goal < edge_collapse_count ? edge_collapses[collapse_order[edge_collapse_goal]].error * kPassErrorBound : FLT_MAX;
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			collapse_remap[i] = unsigned(i);
+
+		memset(collapse_locked, 0, vertex_count);
+
+		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, triangle_collapse_goal, error_goal, error_limit);
+
+		// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
+		if (collapses == 0)
+			break;
+
+		remapEdgeLoops(loop, vertex_count, collapse_remap);
+		remapEdgeLoops(loopback, vertex_count, collapse_remap);
+
+		size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
+		assert(new_count < result_count);
+
+// -- GODOT start --
+//#if TRACE
+		float pass_error = 0.f; // largest error among the collapses that were applied in this pass
+		for (size_t i = 0; i < edge_collapse_count; ++i)
+		{
+			Collapse& c = edge_collapses[collapse_order[i]];
+
+			if (collapse_remap[c.v0] == c.v1) // sort order is only bucket-accurate, so keep the maximum rather than the last match
+				pass_error = (pass_error < c.error) ? c.error : pass_error;
+		}
+
+		//pass_count++;
+		worst_error = (worst_error < pass_error) ? pass_error : worst_error;
+
+#if TRACE
+		pass_count++;
+		printf("pass %d: triangles: %d -> %d, collapses: %d/%d (goal: %d), error: %e (limit %e goal %e)\n", int(pass_count), int(result_count / 3), int(new_count / 3), int(collapses), int(edge_collapse_count), int(edge_collapse_goal), pass_error, error_limit, error_goal);
+#endif
+// -- GODOT end --
+
+		result_count = new_count;
+	}
+
+// -- GODOT start --
+	if (r_resulting_error) {
+		*r_resulting_error = sqrtf(worst_error) * extent; // back to linear units, then from normalized space to input units
+	}
+// -- GODOT end --
+
+#if TRACE
+	printf("passes: %d, worst error: %e\n", int(pass_count), worst_error);
+#endif
+
+#if TRACE > 1
+	dumpLockedCollapses(result, result_count, vertex_kind);
+#endif
+
+#ifndef NDEBUG
+	if (meshopt_simplifyDebugKind)
+		memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+
+	if (meshopt_simplifyDebugLoop)
+		memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+
+	if (meshopt_simplifyDebugLoopBack)
+		memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
+#endif
+
+	return result_count; // number of indices remaining in destination
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{
+	using namespace meshopt;
+	// Searches for a grid size whose collapsed mesh fits the triangle budget, keeps one vertex per cell and writes the filtered triangles to destination; returns the number of indices written (0 if nothing can be emitted)
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+
+	// we expect to get ~2 triangles/vertex in the output
+	size_t target_cell_count = target_index_count / 6;
+
+	if (target_cell_count == 0)
+		return 0; // fewer than 6 target indices leave no cell budget
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices, %d triangles\n", int(vertex_count), int(index_count / 3));
+	printf("target: %d cells, %d triangles\n", int(target_cell_count), int(target_index_count / 3));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	const int kInterpolationPasses = 5;
+
+	// invariant: # of triangles in min_grid <= target_count
+	int min_grid = 0;
+	int max_grid = 1025; // exclusive upper bound: grid_size is clamped to max_grid - 1 below
+	size_t min_triangles = 0;
+	size_t max_triangles = index_count / 3;
+
+	// instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass) // at most 15 grid sizes are probed
+	{
+		assert(min_triangles < target_index_count / 3);
+		assert(max_grid - min_grid > 1);
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t triangles = countTriangles(vertex_ids, indices, index_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, triangles %d, %s\n",
+		       pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+		       grid_size, int(triangles),
+		       (triangles <= target_index_count / 3) ? "under" : "over");
+#endif
+
+		float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+
+		if (triangles <= target_index_count / 3)
+		{
+			min_grid = grid_size;
+			min_triangles = triangles;
+		}
+		else
+		{
+			max_grid = grid_size;
+			max_triangles = triangles;
+		}
+
+		if (triangles == target_index_count / 3 || max_grid - min_grid <= 1)
+			break; // exact hit, or the [min_grid, max_grid] bracket cannot be narrowed further
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+
+	if (min_triangles == 0)
+		return 0; // the best under-budget grid found keeps no triangles
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// build a quadric for each target cell
+	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+	fillCellQuadrics(cell_quadrics, indices, index_count, vertex_positions, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+	// collapse triangles!
+	// note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+	size_t tritable_size = hashBuckets2(min_triangles);
+	unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
+
+	size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
+	assert(write <= target_index_count);
+
+#if TRACE
+	printf("result: %d cells, %d triangles (%d unfiltered)\n", int(cell_count), int(write / 3), int(min_triangles));
+#endif
+
+	return write;
+}
+
+size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count)
+{
+	using namespace meshopt;
+	// Searches for a grid size whose cell count fits target_vertex_count, then writes one vertex index per cell to destination; returns the number of indices written
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_vertex_count <= vertex_count);
+
+	size_t target_cell_count = target_vertex_count;
+
+	if (target_cell_count == 0)
+		return 0; // nothing was requested
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices\n", int(vertex_count));
+	printf("target: %d cells\n", int(target_cell_count));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	const int kInterpolationPasses = 5;
+
+	// invariant: # of vertices in min_grid <= target_count
+	int min_grid = 0;
+	int max_grid = 1025; // exclusive upper bound: grid_size is clamped to max_grid - 1 below
+	size_t min_vertices = 0;
+	size_t max_vertices = vertex_count;
+
+	// instead of starting in the middle, let's guess as to what the answer might be! the guess assumes the count (here: cells, not triangles) grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass) // at most 15 grid sizes are probed
+	{
+		assert(min_vertices < target_vertex_count);
+		assert(max_grid - min_grid > 1);
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, vertices %d, %s\n",
+		       pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+		       grid_size, int(vertices),
+		       (vertices <= target_vertex_count) ? "under" : "over");
+#endif
+
+		float tip = interpolate(float(target_vertex_count), float(min_grid), float(min_vertices), float(grid_size), float(vertices), float(max_grid), float(max_vertices));
+
+		if (vertices <= target_vertex_count)
+		{
+			min_grid = grid_size;
+			min_vertices = vertices;
+		}
+		else
+		{
+			max_grid = grid_size;
+			max_vertices = vertices;
+		}
+
+		if (vertices == target_vertex_count || max_grid - min_grid <= 1)
+			break; // exact hit, or the [min_grid, max_grid] bracket cannot be narrowed further
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+
+	if (min_vertices == 0)
+		return 0; // no probed grid size gave a non-empty result within the budget
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// build a quadric for each target cell
+	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+	fillCellQuadrics(cell_quadrics, vertex_positions, vertex_count, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+	// copy results to the output
+	assert(cell_count <= target_vertex_count);
+	memcpy(destination, cell_remap, sizeof(unsigned int) * cell_count);
+
+#if TRACE
+	printf("result: %d cells\n", int(cell_count));
+#endif
+
+	return cell_count;
+}
diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp
new file mode 100644
index 0000000000..b09f80ac6f
--- /dev/null
+++ b/thirdparty/meshoptimizer/spatialorder.cpp
@@ -0,0 +1,194 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Fabian Giesen. Decoding Morton codes. 2009
+namespace meshopt
+{
+
+// "Insert" two 0 bits after each of the 10 low bits of x
+inline unsigned int part1By2(unsigned int x)
+{
+	unsigned int spread = x & 0x000003ff;            // spread = ---- ---- ---- ---- ---- --98 7654 3210
+	spread = (spread | (spread << 16)) & 0xff0000ff; // spread = ---- --98 ---- ---- ---- ---- 7654 3210
+	spread = (spread | (spread << 8)) & 0x0300f00f;  // spread = ---- --98 ---- ---- 7654 ---- ---- 3210
+	spread = (spread | (spread << 4)) & 0x030c30c3;  // spread = ---- --98 ---- 76-- --54 ---- 32-- --10
+	spread = (spread | (spread << 2)) & 0x09249249;  // spread = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	return spread; // at every step the shifted copy uses bit positions the unshifted value leaves empty, so | and ^ agree
+}
+
+static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{
+	// Writes one 30-bit Morton key per vertex into result, derived from the position quantized inside the bounding box
+	const size_t stride_in_floats = vertex_positions_stride / sizeof(float);
+
+	float lo[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float hi[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	// pass 1: per-axis bounds over all positions
+	for (size_t vi = 0; vi < vertex_count; ++vi)
+	{
+		const float* pos = vertex_positions_data + vi * stride_in_floats;
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			if (pos[axis] < lo[axis])
+				lo[axis] = pos[axis];
+			if (hi[axis] < pos[axis])
+				hi[axis] = pos[axis];
+		}
+	}
+
+	// the largest axis span scales all three axes uniformly; the negated comparison treats NaN like the ternary form would
+	float extent = 0.f;
+
+	for (int axis = 0; axis < 3; ++axis)
+		if (!(hi[axis] - lo[axis] < extent))
+			extent = hi[axis] - lo[axis];
+
+	const float scale = (extent == 0) ? 0.f : 1.f / extent;
+
+	// pass 2: quantize each axis to 10 bits (rounded) and interleave the bits as x | y << 1 | z << 2
+	for (size_t vi = 0; vi < vertex_count; ++vi)
+	{
+		const float* pos = vertex_positions_data + vi * stride_in_floats;
+		int q[3];
+		for (int axis = 0; axis < 3; ++axis)
+			q[axis] = int((pos[axis] - lo[axis]) * scale * 1023.f + 0.5f);
+		result[vi] = part1By2(q[0]) | (part1By2(q[1]) << 1) | (part1By2(q[2]) << 2);
+	}
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+{
+	// Fills hist with three 1024-bucket histograms, one per 10-bit digit of the keys in data,
+	// and then turns every column into exclusive prefix sums (the first output slot of each bucket)
+	memset(hist, 0, sizeof(hist));
+
+	// tally: column `digit` counts the values of key bits [10 * digit, 10 * digit + 10)
+	for (size_t i = 0; i < count; ++i)
+	{
+		const unsigned int key = data[i];
+
+		for (int digit = 0; digit < 3; ++digit)
+			hist[(key >> (digit * 10)) & 1023][digit]++;
+	}
+
+	unsigned int running[3] = {0, 0, 0};
+
+	// exclusive scan: each bucket ends up holding the number of keys with a smaller digit value
+	for (int bucket = 0; bucket < 1024; ++bucket)
+	{
+		for (int digit = 0; digit < 3; ++digit)
+		{
+			const unsigned int occurrences = hist[bucket][digit];
+
+			hist[bucket][digit] = running[digit];
+			running[digit] += occurrences;
+		}
+	}
+
+	// every key lands in exactly one bucket per digit, so each column total must equal count
+	assert(running[0] == count && running[1] == count && running[2] == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+	// Scatters source into destination ordered by the pass-th 10-bit digit of keys[element]; consumes the offsets in hist[][pass]
+	const unsigned int* const end = source + count;
+	for (const unsigned int* it = source; it != end; ++it)
+	{
+		const unsigned int element = *it;
+		unsigned int& slot = hist[(keys[element] >> (pass * 10)) & 1023][pass];
+		destination[slot++] = element; // elements with equal digits keep their source order
+	}
+}
+
+} // namespace meshopt
+
+void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	// Fills destination so that destination[v] is the rank of vertex v when vertices are sorted by Morton key
+	unsigned int* morton_keys = allocator.allocate<unsigned int>(vertex_count);
+	computeOrder(morton_keys, vertex_positions, vertex_count, vertex_positions_stride);
+
+	unsigned int offsets[1024][3];
+	computeHistogram(offsets, morton_keys, vertex_count);
+
+	unsigned int* order = allocator.allocate<unsigned int>(vertex_count);
+	for (size_t v = 0; v != vertex_count; ++v)
+		destination[v] = unsigned(v); // identity permutation; destination doubles as a sort buffer
+
+	// 3-pass radix sort (10 key bits per pass, lowest first) ping-pongs between the buffers and ends in order
+	unsigned int* const buffers[2] = {destination, order};
+	for (int pass = 0; pass < 3; ++pass)
+		radixPass(buffers[(pass + 1) & 1], buffers[pass & 1], morton_keys, vertex_count, offsets, pass);
+
+	// order maps rank => vertex, while the remap table must map vertex => rank, so invert it
+	for (size_t rank = 0; rank != vertex_count; ++rank)
+		destination[order[rank]] = unsigned(rank);
+}
+
+void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	(void)vertex_count;
+
+	const size_t triangle_count = index_count / 3;
+	const size_t stride_in_floats = vertex_positions_stride / sizeof(float);
+
+	meshopt_Allocator allocator;
+
+	// one centroid per triangle, stored as tightly packed xyz triples
+	float* centroids = allocator.allocate<float>(triangle_count * 3);
+
+	for (size_t tri = 0; tri < triangle_count; ++tri)
+	{
+		const unsigned int* corner = indices + tri * 3;
+		assert(corner[0] < vertex_count && corner[1] < vertex_count && corner[2] < vertex_count);
+		const float* p0 = vertex_positions + corner[0] * stride_in_floats;
+		const float* p1 = vertex_positions + corner[1] * stride_in_floats;
+		const float* p2 = vertex_positions + corner[2] * stride_in_floats;
+
+		for (int axis = 0; axis < 3; ++axis)
+			centroids[tri * 3 + axis] = (p0[axis] + p1[axis] + p2[axis]) / 3.f;
+	}
+
+	// triangles are ordered by running the vertex remap on their centroids
+	unsigned int* triangle_remap = allocator.allocate<unsigned int>(triangle_count);
+	meshopt_spatialSortRemap(triangle_remap, centroids, triangle_count, sizeof(float) * 3);
+
+	// support in-place remap: read from a snapshot when the output aliases the input
+	if (destination == indices)
+	{
+		unsigned int* snapshot = allocator.allocate<unsigned int>(index_count);
+		memcpy(snapshot, indices, index_count * sizeof(unsigned int));
+		indices = snapshot;
+	}
+
+	// move every triangle to the slot assigned by the remap
+	for (size_t tri = 0; tri < triangle_count; ++tri)
+	{
+		const unsigned int* corner = indices + tri * 3;
+		const unsigned int a = corner[0], b = corner[1], c = corner[2];
+		unsigned int* slot = destination + triangle_remap[tri] * 3;
+		slot[0] = a;
+		slot[1] = b;
+		slot[2] = c;
+	}
+}
diff --git a/thirdparty/meshoptimizer/stripifier.cpp b/thirdparty/meshoptimizer/stripifier.cpp
new file mode 100644
index 0000000000..8ce17ef3dc
--- /dev/null
+++ b/thirdparty/meshoptimizer/stripifier.cpp
@@ -0,0 +1,295 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
+
+// This work is based on:
+// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
+namespace meshopt
+{
+
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+{
+	// Picks the buffered triangle that owns the vertex with the smallest valence; ties go to the earliest triangle
+	unsigned int best_triangle = 0;
+	unsigned int best_valence = ~0u;
+
+	for (unsigned int tri = 0; tri < buffer_size; ++tri)
+	{
+		unsigned int lowest = valence[buffer[tri][0]];
+		for (int corner = 1; corner < 3; ++corner)
+			lowest = (valence[buffer[tri][corner]] < lowest) ? valence[buffer[tri][corner]] : lowest;
+		if (lowest >= best_valence)
+			continue;
+		best_triangle = tri;
+		best_valence = lowest;
+	}
+
+	return best_triangle;
+}
+
+static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
+{
+	// Finds the first buffered triangle with directed edge e0->e1; returns (triangle << 2) | index of the third corner, or -1
+	for (unsigned int tri = 0; tri < buffer_size; ++tri)
+	{
+		const unsigned int* corners = buffer[tri];
+		// edges are tested in the order (0,1), (1,2), (2,0); the encoded corner is the one not on the edge
+		for (int edge = 0; edge < 3; ++edge)
+		{
+			if (corners[edge] == e0 && corners[(edge + 1) % 3] == e1)
+				return (int(tri) << 2) | ((edge + 2) % 3);
+		}
+	}
+
+	return -1;
+}
+
+} // namespace meshopt
+
+size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index)
+{
+	assert(destination != indices);
+	assert(index_count % 3 == 0);
+
+	using namespace meshopt;
+	// Converts the triangle list in indices into strip form in destination; strips are separated by restart_index when it is non-zero and joined by degenerate triangles otherwise. Returns the number of indices written.
+	meshopt_Allocator allocator;
+
+	const size_t buffer_capacity = 8; // number of pending triangles searched when extending or starting a strip
+
+	unsigned int buffer[buffer_capacity][3] = {};
+	unsigned int buffer_size = 0;
+
+	size_t index_offset = 0; // next unread position in indices
+
+	unsigned int strip[2] = {}; // the last two vertices emitted to the current strip
+	unsigned int parity = 0; // selects the edge direction passed to findStripNext for the next triangle
+
+	size_t strip_size = 0; // number of indices written to destination so far
+
+	// compute vertex valence; this is used to prioritize starting triangle for strips
+	unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
+	memset(valence, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		valence[index]++;
+	}
+
+	int next = -1; // findStripNext result, (buffer triangle << 2) | corner, for continuing the strip; -1 starts a new strip
+
+	while (buffer_size > 0 || index_offset < index_count)
+	{
+		assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
+
+		// fill triangle buffer
+		while (buffer_size < buffer_capacity && index_offset < index_count)
+		{
+			buffer[buffer_size][0] = indices[index_offset + 0];
+			buffer[buffer_size][1] = indices[index_offset + 1];
+			buffer[buffer_size][2] = indices[index_offset + 2];
+
+			buffer_size++;
+			index_offset += 3;
+		}
+
+		assert(buffer_size > 0);
+
+		if (next >= 0)
+		{
+			unsigned int i = next >> 2;
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+			unsigned int v = buffer[i][next & 3];
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// find next triangle (note that edge order flips on every iteration)
+			// in some cases we need to perform a swap to pick a different outgoing triangle edge
+			// for [a b c], the default strip edge is [b c], but we might want to use [a c]
+			int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
+			int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1;
+
+			if (cont < 0 && swap >= 0)
+			{
+				// [a b c] => [a b a c]
+				destination[strip_size++] = strip[0];
+				destination[strip_size++] = v;
+
+				// next strip has same winding
+				// ? a b => b a v
+				strip[1] = v;
+
+				next = swap;
+			}
+			else
+			{
+				// emit the next vertex in the strip
+				destination[strip_size++] = v;
+
+				// next strip has flipped winding
+				strip[0] = strip[1];
+				strip[1] = v;
+				parity ^= 1;
+
+				next = cont;
+			}
+		}
+		else
+		{
+			// if we didn't find anything, we need to find the next new triangle
+			// we use a heuristic to maximize the strip length
+			unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
+			int ea = findStripNext(buffer, buffer_size, c, b);
+			int eb = findStripNext(buffer, buffer_size, a, c);
+			int ec = findStripNext(buffer, buffer_size, b, a);
+
+			// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
+			// triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
+			// reasons - slightly improves the stripification efficiency
+			int mine = INT_MAX;
+			mine = (ea >= 0 && mine > ea) ? ea : mine;
+			mine = (eb >= 0 && mine > eb) ? eb : mine;
+			mine = (ec >= 0 && mine > ec) ? ec : mine;
+
+			if (ea == mine)
+			{
+				// keep abc
+				next = ea;
+			}
+			else if (eb == mine)
+			{
+				// abc -> bca
+				unsigned int t = a;
+				a = b, b = c, c = t;
+
+				next = eb;
+			}
+			else if (ec == mine)
+			{
+				// abc -> cab
+				unsigned int t = c;
+				c = b, b = a, a = t;
+
+				next = ec;
+			}
+
+			if (restart_index) // a restart_index of 0 selects degenerate-triangle joins instead of restarts
+			{
+				if (strip_size)
+					destination[strip_size++] = restart_index;
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = b;
+				destination[strip_size++] = c;
+
+				// new strip always starts with the same edge winding
+				strip[0] = b;
+				strip[1] = c;
+				parity = 1;
+			}
+			else
+			{
+				if (strip_size)
+				{
+					// connect last strip using degenerate triangles
+					destination[strip_size++] = strip[1];
+					destination[strip_size++] = a;
+				}
+
+				// note that we may need to flip the emitted triangle based on parity
+				// we always end up with outgoing edge "cb" in the end
+				unsigned int e0 = parity ? c : b;
+				unsigned int e1 = parity ? b : c;
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = e0;
+				destination[strip_size++] = e1;
+
+				strip[0] = e0;
+				strip[1] = e1;
+				parity ^= 1;
+			}
+		}
+	}
+
+	return strip_size;
+}
+
+size_t meshopt_stripifyBound(size_t index_count)
+{
+	// Upper bound on the number of indices a strip can need: 5 per input triangle. Without restarts a triangle costs
+	// at most 2 degenerate indices plus 3 of its own; with restarts it costs at most 1 restart index plus 3 of its own.
+	assert(index_count % 3 == 0);
+	const size_t triangle_count = index_count / 3;
+	return triangle_count * 5;
+}
+
+size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index)
+{
+	// Expands the strip in indices into a triangle list in destination; returns the number of indices written
+	assert(destination != indices);
+
+	size_t written = 0;
+	size_t position = 0; // how many indices of the current strip precede the one being visited
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		if (restart_index && indices[i] == restart_index)
+		{
+			position = 0;
+			continue;
+		}
+		const size_t at = position++;
+		if (at < 2)
+			continue;
+
+		// flip winding for odd triangles by swapping which of the two previous indices comes first
+		const bool odd = (at & 1) != 0;
+		const unsigned int a = indices[odd ? i - 1 : i - 2];
+		const unsigned int b = indices[odd ? i - 2 : i - 1];
+		const unsigned int c = indices[i];
+
+		// although we use restart indices, strip swaps still produce degenerate triangles, so skip them
+		if (a == b || a == c || b == c)
+			continue;
+
+		destination[written + 0] = a;
+		destination[written + 1] = b;
+		destination[written + 2] = c;
+		written += 3;
+	}
+
+	return written;
+}
+
+size_t meshopt_unstripifyBound(size_t index_count)
+{
+	// A strip of N >= 3 indices expands to at most N - 2 triangles, i.e. (N - 2) * 3 list indices; an empty strip needs none
+	assert(index_count == 0 || index_count >= 3);
+	return (index_count ? index_count - 2 : 0) * 3;
+}
diff --git a/thirdparty/meshoptimizer/vcacheanalyzer.cpp b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
new file mode 100644
index 0000000000..3682743820
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
@@ -0,0 +1,73 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
+{
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+	assert(warp_size == 0 || warp_size >= 3);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexCacheStatistics stats = {};
+
+	// a vertex counts as cached while clock - stamp <= cache_size; every miss stores the clock and advances it
+	unsigned int* stamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(stamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int clock = cache_size + 1;
+	unsigned int warp_fill = 0;
+	unsigned int primgroup_fill = 0;
+
+	for (size_t base = 0; base < index_count; base += 3)
+	{
+		const unsigned int* corner = indices + base;
+		assert(corner[0] < vertex_count && corner[1] < vertex_count && corner[2] < vertex_count);
+
+		// number of corners that would miss the cache in its current state
+		unsigned int misses = 0;
+		for (int k = 0; k < 3; ++k)
+			misses += (clock - stamps[corner[k]]) > cache_size;
+		const bool primgroup_full = primgroup_size && primgroup_fill == primgroup_size;
+		const bool warp_overflow = warp_size && warp_fill + misses > warp_size;
+
+		// flush cache if triangle doesn't fit into warp or into the primitive buffer
+		if (primgroup_full || warp_overflow)
+		{
+			if (warp_fill > 0)
+				stats.warps_executed++;
+			warp_fill = 0;
+			primgroup_fill = 0;
+			clock += cache_size + 1; // reset cache: every stored stamp is now more than cache_size behind
+		}
+
+		// update cache and add vertices to warp
+		for (int k = 0; k < 3; ++k)
+		{
+			unsigned int& stamp = stamps[corner[k]];
+			if (clock - stamp <= cache_size)
+				continue;
+			stamp = clock++;
+			stats.vertices_transformed++;
+			warp_fill++;
+		}
+
+		primgroup_fill++;
+	}
+
+	if (warp_fill > 0)
+		stats.warps_executed++;
+
+	// the clock starts at cache_size + 1, so a non-zero stamp marks a vertex that was transformed at least once
+	size_t touched = 0;
+	for (size_t v = 0; v < vertex_count; ++v)
+		touched += stamps[v] > 0;
+
+	stats.acmr = index_count == 0 ? 0 : float(stats.vertices_transformed) / float(index_count / 3);
+	stats.atvr = touched == 0 ? 0 : float(stats.vertices_transformed) / float(touched);
+
+	return stats;
+}
diff --git a/thirdparty/meshoptimizer/vcacheoptimizer.cpp b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
new file mode 100644
index 0000000000..fb8ade4b77
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
@@ -0,0 +1,473 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// This work is based on:
+// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+const size_t kCacheSizeMax = 16; // largest simulated cache size the score table covers
+const size_t kValenceMax = 8; // live triangle counts are clamped to this value in vertexScore
+
+struct VertexScoreTable // per-vertex score lookup tables consumed by vertexScore
+{
+	float cache[1 + kCacheSizeMax]; // indexed by 1 + cache_position, so slot 0 serves cache_position == -1
+	float live[1 + kValenceMax]; // indexed by the live triangle count clamped to kValenceMax
+};
+
+// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
+static const VertexScoreTable kVertexScoreTable = {
+    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f}, // cache[]: first entry is the score of a vertex that is not in the cache
+    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f}, // live[]: first entry is the score of a vertex with no live triangles
+};
+
+// Tuned to minimize the encoded index buffer size
+static const VertexScoreTable kVertexScoreTableStrip = {
+    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f}, // cache[]: first entry is the score of a vertex that is not in the cache
+    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f}, // live[]: first entry is the score of a vertex with no live triangles
+};
+
+struct TriangleAdjacency // vertex -> incident triangle lists; filled by buildTriangleAdjacency
+{
+	unsigned int* counts; // counts[v]: number of entries in the list of vertex v
+	unsigned int* offsets; // offsets[v]: index in data where the list of vertex v starts
+	unsigned int* data; // triangle indices of all lists, stored back to back
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{ // fills adjacency so that each vertex maps to the list of triangles referencing it; arrays come from allocator
+	size_t face_count = index_count / 3;
+
+	// allocate arrays: one count and one offset per vertex, one data slot per index
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts: counts[v] = number of index slots that reference v
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table with the running (exclusive) sum of counts
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data; offsets[v] is temporarily used as the write cursor of v's list
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass (each was advanced by counts[i])
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
+static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
+{ // picks a restart vertex that still has live triangles; dead_end_top and input_cursor are updated in place
+	// check dead-end stack first; popped entries without live triangles are discarded
+	while (dead_end_top)
+	{
+		unsigned int vertex = dead_end[--dead_end_top];
+
+		if (live_triangles[vertex] > 0)
+			return vertex;
+	}
+
+	// fall back to input order; the cursor is left on the returned vertex
+	while (input_cursor < vertex_count)
+	{
+		if (live_triangles[input_cursor] > 0)
+			return input_cursor;
+
+		++input_cursor;
+	}
+
+	return ~0u; // no vertex with live triangles remains
+}
+
+static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+{ // returns the candidate with the highest priority among those with live triangles, or ~0u if there is none
+	unsigned int best_candidate = ~0u;
+	int best_priority = -1; // below the minimum priority of 0, so the first live candidate is always accepted
+
+	for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
+	{
+		unsigned int vertex = *next_candidate;
+
+		// candidates without live triangles are skipped
+		if (live_triangles[vertex] > 0)
+		{
+			int priority = 0;
+
+			// will it be in cache after fanning?
+			if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
+			{
+				priority = timestamp - cache_timestamps[vertex]; // position in cache
+			}
+
+			if (priority > best_priority) // strict comparison: on ties the earlier candidate wins
+			{
+				best_candidate = vertex;
+				best_priority = priority;
+			}
+		}
+	}
+
+	return best_candidate;
+}
+
+static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
+{ // score of a vertex = cache-position term + live-triangle term, both looked up in table
+	assert(cache_position >= -1 && cache_position < int(kCacheSizeMax)); // callers pass -1 for a vertex that is not in the cache
+
+	unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
+
+	return table->cache[1 + cache_position] + table->live[live_triangles_clamped]; // the +1 maps cache_position -1 to slot 0
+}
+
+static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
+{ // returns the first triangle at or after input_cursor that is not flagged as emitted; input_cursor is updated in place
+	// input order
+	while (input_cursor < face_count)
+	{
+		if (!emitted_flags[input_cursor])
+			return input_cursor;
+
+		++input_cursor;
+	}
+
+	return ~0u; // every triangle has been emitted
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
+{ // writes the triangles of indices to destination in a new order, repeatedly emitting the highest-scoring triangle according to table
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes; destination is left untouched
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization: read from a private copy when destination aliases indices
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	unsigned int cache_size = 16; // size of the simulated cache; must not exceed what the score table covers
+	assert(cache_size <= kCacheSizeMax);
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts: per vertex, the number of referencing triangles not yet emitted
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	// emitted flags: one byte per triangle, nonzero once the triangle has been output
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	// compute initial vertex scores; nothing is cached yet, hence cache position -1
+	float* vertex_scores = allocator.allocate<float>(vertex_count);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
+
+	// compute triangle scores as the sum of the scores of the three corners
+	float* triangle_scores = allocator.allocate<float>(face_count);
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0];
+		unsigned int b = indices[i * 3 + 1];
+		unsigned int c = indices[i * 3 + 2];
+
+		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
+	}
+
+	unsigned int cache_holder[2 * (kCacheSizeMax + 3)]; // two buffers; each holds up to cache_size old entries plus the 3 new corners
+	unsigned int* cache = cache_holder;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	size_t cache_count = 0;
+
+	unsigned int current_triangle = 0; // the first input triangle is emitted first
+	unsigned int input_cursor = 1; // dead-end scan starts after triangle 0, which is emitted first
+
+	unsigned int output_triangle = 0;
+
+	while (current_triangle != ~0u)
+	{
+		assert(output_triangle < face_count);
+
+		unsigned int a = indices[current_triangle * 3 + 0];
+		unsigned int b = indices[current_triangle * 3 + 1];
+		unsigned int c = indices[current_triangle * 3 + 2];
+
+		// output indices
+		destination[output_triangle * 3 + 0] = a;
+		destination[output_triangle * 3 + 1] = b;
+		destination[output_triangle * 3 + 2] = c;
+		output_triangle++;
+
+		// update emitted flags
+		emitted_flags[current_triangle] = true;
+		triangle_scores[current_triangle] = 0;
+
+		// new triangle: its corners go to the front of the new cache contents
+		size_t cache_write = 0;
+		cache_new[cache_write++] = a;
+		cache_new[cache_write++] = b;
+		cache_new[cache_write++] = c;
+
+		// old triangles: append previous cache entries that are not corners of the new triangle, keeping their order
+		for (size_t i = 0; i < cache_count; ++i)
+		{
+			unsigned int index = cache[i];
+
+			if (index != a && index != b && index != c)
+			{
+				cache_new[cache_write++] = index;
+			}
+		}
+
+		unsigned int* cache_temp = cache;
+		cache = cache_new, cache_new = cache_temp;
+		cache_count = cache_write > cache_size ? cache_size : cache_write; // entries past cache_size are dropped for the next iteration
+
+		// update live triangle counts
+		live_triangles[a]--;
+		live_triangles[b]--;
+		live_triangles[c]--;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		for (size_t k = 0; k < 3; ++k)
+		{
+			unsigned int index = indices[current_triangle * 3 + k];
+
+			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbours_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbours_size; ++i)
+			{
+				unsigned int tri = neighbours[i];
+
+				if (tri == current_triangle)
+				{
+					neighbours[i] = neighbours[neighbours_size - 1]; // overwrite with the last entry; list order is not preserved
+					adjacency.counts[index]--;
+					break;
+				}
+			}
+		}
+
+		unsigned int best_triangle = ~0u;
+		float best_score = 0;
+
+		// update cache positions, vertex scores and triangle scores, and find next best triangle
+		for (size_t i = 0; i < cache_write; ++i) // covers the dropped entries too, so their scores are refreshed
+		{
+			unsigned int index = cache[i];
+
+			int cache_position = i >= cache_size ? -1 : int(i); // entries past cache_size are scored as not cached
+
+			// update vertex score
+			float score = vertexScore(table, cache_position, live_triangles[index]);
+			float score_diff = score - vertex_scores[index];
+
+			vertex_scores[index] = score;
+
+			// update scores of vertex triangles
+			const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
+			const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+
+			for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+			{
+				unsigned int tri = *it;
+				assert(!emitted_flags[tri]);
+
+				float tri_score = triangle_scores[tri] + score_diff;
+				assert(tri_score > 0);
+
+				if (best_score < tri_score)
+				{
+					best_triangle = tri;
+					best_score = tri_score;
+				}
+
+				triangle_scores[tri] = tri_score;
+			}
+		}
+
+		// step through input triangles in order if we hit a dead-end
+		current_triangle = best_triangle;
+
+		if (current_triangle == ~0u)
+		{
+			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
+		}
+	}
+
+	assert(input_cursor == face_count);
+	assert(output_triangle == face_count);
+}
+
+void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{ // forwards to meshopt_optimizeVertexCacheTable with the ACMR-tuned score table
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
+}
+
+void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{ // forwards to meshopt_optimizeVertexCacheTable with the score table tuned for encoded index buffer size
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
+}
+
+void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{ // writes the triangles of indices to destination in a new order, emitting all remaining triangles around one vertex at a time
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes; destination is left untouched
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization: read from a private copy when destination aliases indices
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts: per vertex, the number of referencing triangles not yet emitted
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	// cache time stamps: value of timestamp when the vertex last entered the cache (0 = never)
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	// dead-end stack; each emitted triangle pushes 3 entries, so index_count slots are enough
+	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
+	unsigned int dead_end_top = 0;
+
+	// emitted flags: one byte per triangle, nonzero once the triangle has been output
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	unsigned int current_vertex = 0;
+
+	unsigned int timestamp = cache_size + 1; // starts past cache_size so that zeroed time stamps test as "not in cache"
+	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
+
+	unsigned int output_triangle = 0;
+
+	while (current_vertex != ~0u)
+	{
+		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top; // entries pushed below start here
+
+		// emit all vertex neighbours
+		const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+		const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+
+		for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+		{
+			unsigned int triangle = *it;
+
+			if (!emitted_flags[triangle])
+			{
+				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+				// output indices
+				destination[output_triangle * 3 + 0] = a;
+				destination[output_triangle * 3 + 1] = b;
+				destination[output_triangle * 3 + 2] = c;
+				output_triangle++;
+
+				// update dead-end stack
+				dead_end[dead_end_top + 0] = a;
+				dead_end[dead_end_top + 1] = b;
+				dead_end[dead_end_top + 2] = c;
+				dead_end_top += 3;
+
+				// update live triangle counts
+				live_triangles[a]--;
+				live_triangles[b]--;
+				live_triangles[c]--;
+
+				// update cache info
+				// if vertex is not in cache, put it in cache (a newly cached vertex advances timestamp by one)
+				if (timestamp - cache_timestamps[a] > cache_size)
+					cache_timestamps[a] = timestamp++;
+
+				if (timestamp - cache_timestamps[b] > cache_size)
+					cache_timestamps[b] = timestamp++;
+
+				if (timestamp - cache_timestamps[c] > cache_size)
+					cache_timestamps[c] = timestamp++;
+
+				// update emitted flags
+				emitted_flags[triangle] = true;
+			}
+		}
+
+		// next candidates are the ones we pushed to dead-end stack just now
+		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
+
+		// get next vertex
+		current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+
+		if (current_vertex == ~0u) // no usable candidate among the new entries: fall back to the dead-end stack, then input order
+		{
+			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
+		}
+	}
+
+	assert(output_triangle == face_count);
+}
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
new file mode 100644
index 0000000000..784c9a13db
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -0,0 +1,1265 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
+#if defined(__AVX__) || defined(__SSSE3__)
+#define SIMD_SSE
+#endif
+
+// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
+#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
+#undef SIMD_SSE
+#define SIMD_AVX
+#endif
+
+// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#endif
+
+// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#define SIMD_TARGET __attribute__((target("ssse3")))
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#ifndef SIMD_TARGET
+#define SIMD_TARGET
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <tmmintrin.h>
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+#ifdef _MSC_VER
+#include <intrin.h> // __cpuid
+#else
+#include <cpuid.h> // __cpuid
+#endif
+#endif
+
+#ifdef SIMD_AVX
+#include <immintrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
+#endif
+
+namespace meshopt
+{
+
+const unsigned char kVertexHeader = 0xa0;
+
+static int gEncodeVertexVersion = 0;
+
+const size_t kVertexBlockSizeBytes = 8192; // byte budget of one vertex block; also the size of the decoder's transposed scratch buffer
+const size_t kVertexBlockMaxSize = 256; // upper bound on the number of vertices in one block
+const size_t kByteGroupSize = 16; // number of byte values encoded together as one group
+const size_t kByteGroupDecodeLimit = 24; // bytes that must remain in the stream before a group is encoded or decoded
+const size_t kTailMaxSize = 32;
+
+static size_t getVertexBlockSize(size_t vertex_size)
+{ // returns the number of vertices per block for the given vertex size in bytes
+	// make sure the entire block fits into the scratch buffer
+	size_t result = kVertexBlockSizeBytes / vertex_size;
+
+	// align to byte group size; we encode each byte as a byte group
+	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
+	result &= ~(kByteGroupSize - 1); // rounds down to a multiple of kByteGroupSize
+
+	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
+}
+
+inline unsigned char zigzag8(unsigned char v)
+{ // zigzag-encodes v read as a signed byte: 0 -> 0, 0xff (-1) -> 1, 1 -> 2, ...
+	return ((signed char)(v) >> 7) ^ (v << 1); // the shifted sign is XORed over the doubled value; the result is truncated to 8 bits
+}
+
+inline unsigned char unzigzag8(unsigned char v)
+{ // inverse of zigzag8: 0 -> 0, 1 -> 0xff (-1), 2 -> 1, ...
+	return -(v & 1) ^ (v >> 1); // bit 0 selects whether the halved value is bitwise inverted
+}
+
+#if TRACE
+struct Stats // encoder statistics collected only in TRACE builds
+{
+	size_t size; // bytes written for this vertex byte offset (accumulated in encodeVertexBlock)
+	size_t header; // bytes spent on group headers (accumulated in encodeBytes when TRACE > 1)
+	size_t bitg[4]; // number of groups per bitslog2 value (TRACE > 1)
+	size_t bitb[4]; // encoded group bytes per bitslog2 value (TRACE > 1)
+};
+
+Stats* bytestats; // slot that encodeBytes updates; pointed at vertexstats[k] by encodeVertexBlock
+Stats vertexstats[256]; // one slot per byte offset within a vertex
+#endif
+
+static bool encodeBytesGroupZero(const unsigned char* buffer)
+{ // returns true when all kByteGroupSize bytes of the group are zero
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+		if (buffer[i])
+			return false;
+
+	return true;
+}
+
+static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
+{ // returns the number of bytes encodeBytesGroup would write for this group at the given bit width
+	assert(bits >= 1 && bits <= 8);
+
+	if (bits == 1)
+		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1); // only an all-zero group is encodable; size_t(-1) never wins a size comparison
+
+	if (bits == 8)
+		return kByteGroupSize; // raw copy of the group
+
+	size_t result = kByteGroupSize * bits / 8; // fixed portion: bits per value, packed into bytes
+
+	unsigned char sentinel = (1 << bits) - 1;
+
+	for (size_t i = 0; i < kByteGroupSize; ++i)
+		result += buffer[i] >= sentinel; // variable portion: one extra byte per value that does not fit below the sentinel
+
+	return result;
+}
+
+static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
+{ // writes one group of kByteGroupSize values to data at the given bit width; returns the new write position
+	assert(bits >= 1 && bits <= 8);
+
+	if (bits == 1)
+		return data; // nothing is written for this width
+
+	if (bits == 8)
+	{
+		memcpy(data, buffer, kByteGroupSize);
+		return data + kByteGroupSize;
+	}
+
+	size_t byte_size = 8 / bits; // number of values packed into one output byte
+	assert(kByteGroupSize % byte_size == 0);
+
+	// fixed portion: `bits` bits for each value
+	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
+	unsigned char sentinel = (1 << bits) - 1;
+
+	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
+	{
+		unsigned char byte = 0;
+
+		for (size_t k = 0; k < byte_size; ++k)
+		{
+			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
+
+			byte <<= bits; // earlier values end up in the higher bits of the byte
+			byte |= enc;
+		}
+
+		*data++ = byte;
+	}
+
+	for (size_t i = 0; i < kByteGroupSize; ++i) // variable portion, in value order
+	{
+		if (buffer[i] >= sentinel)
+		{
+			*data++ = buffer[i];
+		}
+	}
+
+	return data;
+}
+
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+{ // encodes buffer as a header of 2-bit width codes followed by the groups; returns the new write position, or 0 if the output is too small
+	assert(buffer_size % kByteGroupSize == 0);
+
+	unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	memset(header, 0, header_size); // header bits are ORed in below
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit) // require the full limit even if the group ends up smaller
+			return 0;
+
+		int best_bits = 8;
+		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+
+		for (int bits = 1; bits < 8; bits *= 2) // tries widths 1, 2 and 4; a width replaces the current best only if strictly smaller
+		{
+			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+
+			if (size < best_size)
+			{
+				best_bits = bits;
+				best_size = size;
+			}
+		}
+
+		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
+		assert((1 << bitslog2) == best_bits);
+
+		size_t header_offset = i / kByteGroupSize; // index of this group
+
+		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2); // four 2-bit codes per header byte, lowest bits first
+
+		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
+
+		assert(data + best_size == next);
+		data = next;
+
+#if TRACE > 1
+		bytestats->bitg[bitslog2]++;
+		bytestats->bitb[bitslog2] += best_size;
+#endif
+	}
+
+#if TRACE > 1
+	bytestats->header += header_size;
+#endif
+
+	return data;
+}
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{ // encodes one block of vertices, one byte offset at a time; returns the new write position, or 0 if the output is too small
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize];
+	assert(sizeof(buffer) % kByteGroupSize == 0);
+
+	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+	memset(buffer, 0, sizeof(buffer));
+
+	for (size_t k = 0; k < vertex_size; ++k) // k is the byte offset within a vertex
+	{
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k]; // previous value of this byte; seeded from last_vertex
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			buffer[i] = zigzag8(vertex_data[vertex_offset] - p); // zigzag-coded delta to the same byte of the previous vertex
+
+			p = vertex_data[vertex_offset];
+
+			vertex_offset += vertex_size;
+		}
+
+#if TRACE
+		const unsigned char* olddata = data;
+		bytestats = &vertexstats[k];
+#endif
+
+		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); // size is vertex_count rounded up to a whole group
+		if (!data)
+			return 0;
+
+#if TRACE
+		bytestats = 0;
+		vertexstats[k].size += data - olddata;
+#endif
+	}
+
+	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size); // the last vertex of this block becomes the new last_vertex
+
+	return data;
+}
+
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // scalar decoder for one group: writes kByteGroupSize bytes to buffer and returns the new read position
+#define READ() byte = *data++
+#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
+
+	unsigned char byte, enc, encv; // byte: current packed byte; enc: code taken from its top bits; encv: next byte of the variable portion
+	const unsigned char* data_var; // read cursor into the variable portion; NEXT reads it every time but advances it only on a sentinel code
+
+	switch (bitslog2)
+	{
+	case 0:
+		memset(buffer, 0, kByteGroupSize); // all-zero group; no input is consumed
+		return data;
+	case 1:
+		data_var = data + 4; // variable portion follows the 4 packed bytes
+
+		// 4 groups with 4 2-bit values in each byte
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+
+		return data_var;
+	case 2:
+		data_var = data + 8; // variable portion follows the 8 packed bytes
+
+		// 8 groups with 2 4-bit values in each byte
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+
+		return data_var;
+	case 3:
+		memcpy(buffer, data, kByteGroupSize); // raw group
+		return data + kByteGroupSize;
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+
+#undef READ
+#undef NEXT
+}
+
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{ // decodes buffer_size bytes (header plus groups) into buffer; returns the new read position, or 0 if the input is too short
+	assert(buffer_size % kByteGroupSize == 0);
+
+	const unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit) // checked before every group, whatever its actual size
+			return 0;
+
+		size_t header_offset = i / kByteGroupSize; // index of this group
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; // four 2-bit codes per header byte, lowest bits first
+
+		data = decodeBytesGroup(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{ // decodes one block of vertices into vertex_data; returns the new read position, or 0 if the input is too short
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize]; // decoded deltas of one byte offset
+	unsigned char transposed[kVertexBlockSizeBytes]; // decoded vertices in interleaved layout, copied out at the end
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); // vertex_count rounded up to a whole group
+
+	for (size_t k = 0; k < vertex_size; ++k) // k is the byte offset within a vertex
+	{
+		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
+		if (!data)
+			return 0;
+
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k]; // previous value of this byte; seeded from last_vertex
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			unsigned char v = unzigzag8(buffer[i]) + p; // undo the zigzag coding and add the delta to the previous value
+
+			transposed[vertex_offset] = v;
+			p = v;
+
+			vertex_offset += vertex_size;
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size); // the last decoded vertex becomes the new last_vertex
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+static unsigned char kDecodeBytesGroupShuffle[256][8]; // per 8-bit mask: for each lane, index of its byte in the packed stream, or 0x80
+static unsigned char kDecodeBytesGroupCount[256]; // per 8-bit mask: number of set bits
+
+#ifdef __wasm__
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
+#endif
+static bool
+decodeBytesGroupBuildTables()
+{ // fills both tables for every possible 8-bit mask; always returns true
+	for (int mask = 0; mask < 256; ++mask)
+	{
+		unsigned char shuffle[8];
+		unsigned char count = 0;
+
+		for (int i = 0; i < 8; ++i)
+		{
+			int maski = (mask >> i) & 1;
+			shuffle[i] = maski ? count : 0x80; // set lanes take consecutive stream bytes; clear lanes get 0x80
+			count += (unsigned char)(maski);
+		}
+
+		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
+		kDecodeBytesGroupCount[mask] = count;
+	}
+
+	return true;
+}
+
+static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); // the initializer call fills the tables during static initialization
+#endif
+
+#ifdef SIMD_SSE
+SIMD_TARGET
+static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{ // builds a 16-lane shuffle control from the table rows of the two 8-bit masks
+	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
+	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
+	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]); // number of stream bytes consumed by the low 8 lanes
+
+	__m128i sm1r = _mm_add_epi8(sm1, sm1off); // high-half indices continue after the low half; 0x80 entries keep their top bit since the count is at most 8
+
+	return _mm_unpacklo_epi64(sm0, sm1r);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // SSE decoder for one group: stores 16 bytes to buffer and returns the new read position
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128(); // all-zero group; no input is consumed
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	{
+#ifdef __GNUC__
+		typedef int __attribute__((aligned(1))) unaligned_int;
+#else
+		typedef int unaligned_int;
+#endif
+
+		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data)); // 4 packed bytes holding sixteen 2-bit codes
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4)); // always loads 16 bytes after the packed bytes
+
+		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
+		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
+		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3)); // one 2-bit code per lane
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3)); // lanes holding the sentinel code
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); // sentinel lanes take bytes from rest, other lanes keep their code
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per sentinel lane
+	}
+
+	case 2:
+	{
+		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data)); // 8 packed bytes holding sixteen 4-bit codes
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8)); // always loads 16 bytes, i.e. up to data + 24 (kByteGroupDecodeLimit)
+
+		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
+		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15)); // one 4-bit code per lane
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15)); // lanes holding the sentinel code
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); // sentinel lanes take bytes from rest, other lanes keep their code
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per sentinel lane
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); // raw group
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_AVX
+static const __m128i decodeBytesGroupConfig[] = {
+    _mm_set1_epi8(3), // [0]: sentinel/value mask for bitslog2 == 1
+    _mm_set1_epi8(15), // [1]: sentinel/value mask for bitslog2 == 2
+    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), // [2]: multishift control for bitslog2 == 1
+    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), // [3]: multishift control for bitslog2 == 2
+};
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // AVX-512 decoder for one group: stores 16 bytes to buffer and returns the new read position
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128(); // all-zero group; no input is consumed
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	case 2:
+	{
+		const unsigned char* skip = data + (bitslog2 << 2); // start of the variable portion: 4 or 8 packed bytes in
+
+		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip)); // always loads 16 bytes after the packed bytes
+
+		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; // sentinel value, also used as the value mask
+		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; // bit offsets for the multishift below
+
+		__m128i selw = _mm_shuffle_epi32(selb, 0x44); // repeat the low 64 bits in both halves
+		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); // one code per lane
+		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ); // lanes holding the sentinel code
+
+		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest); // sentinel lanes take consecutive bytes from rest, other lanes keep their code
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return skip + _mm_popcnt_u32(mask16); // one variable-portion byte per sentinel lane
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); // raw group
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_NEON
+static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+{ // looks up each 8-lane half in its own source vector using the shuffle table rows of mask0 and mask1
+	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
+	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
+
+	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
+	uint8x8_t r1 = vtbl1_u8(rest1, sm1); // indices are relative to rest1, so no offset is added here
+
+	return vcombine_u8(r0, r1);
+}
+
+static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+{ // reduces a 16-lane byte mask to two 8-bit values: mask0 from lanes 0..7 and mask1 from lanes 8..15
+	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; // one distinct bit per lane within each half
+
+	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
+	uint8x16_t masked = vandq_u8(mask, byte_mask); // each lane keeps only its own bit of the incoming mask
+
+#ifdef __aarch64__
+	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
+	mask0 = vaddv_u8(vget_low_u8(masked));
+	mask1 = vaddv_u8(vget_high_u8(masked));
+#else
+	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
+	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked)); // lanes 0..3 come from the low half, lanes 4..7 from the high half
+	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
+	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+
+	mask0 = vget_lane_u8(sum3, 0); // total of the low half
+	mask1 = vget_lane_u8(sum3, 1); // total of the high half
+#endif
+}
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // NEON path: decodes one group into 16 bytes stored at buffer; bitslog2 (0..3) selects the encoding; returns data advanced past the consumed input
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		uint8x16_t result = vdupq_n_u8(0); // the whole group is zero
+
+		vst1q_u8(buffer, result);
+
+		return data; // no input bytes are consumed
+	}
+
+	case 1:
+	{
+		uint8x8_t sel2 = vld1_u8(data); // 8-byte load; only the first 4 bytes reach val[0] of the zip below
+		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0]; // interleave each byte's high nibble with the byte itself
+		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22); // second interleave splits into 2-bit steps
+		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3)); // keep the low 2 bits: one selector per output byte
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3)); // lanes whose selector equals the sentinel 3
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 4); // bytes following the 4 selector bytes
+		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]); // high half starts kDecodeBytesGroupCount[mask0] bytes later (table defined outside this view)
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel); // sentinel lanes take shuffled rest bytes; the others keep sel
+
+		vst1q_u8(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		uint8x8_t sel4 = vld1_u8(data); // 8 selector bytes
+		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15))); // interleave high and low nibbles of each byte
+		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]); // one 4-bit selector per output byte
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15)); // lanes whose selector equals the sentinel 15
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 8); // bytes following the 8 selector bytes
+		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]); // high half starts kDecodeBytesGroupCount[mask0] bytes later
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel); // sentinel lanes take shuffled rest bytes; the others keep sel
+
+		vst1q_u8(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		uint8x16_t result = vld1q_u8(data); // group is stored as 16 raw bytes
+
+		vst1q_u8(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{ // builds the 16-byte swizzle index vector used by the WASM decodeBytesGroupSimd from the two per-half masks
+	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); // 16-byte load starting at the table row for mask0
+	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); // 16-byte load starting at the table row for mask1
+
+	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); // 16-byte load starting at the count entry for mask0
+	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // broadcast byte 0 (the count for mask0) to every lane
+
+	v128_t sm1r = wasm_i8x16_add(sm1, sm1off); // offset the second row's indices by the count for mask0
+
+	return wasmx_unpacklo_v64x2(sm0, sm1r); // NOTE(review): macro defined outside this view; presumably joins the low 64 bits of each operand
+}
+
+SIMD_TARGET
+static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+{ // reduces a 16-lane byte mask to two 8-bit values: mask0 for lanes 0..7 and mask1 for lanes 8..15
+	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3); // reorder 32-bit lanes so each 64-bit lane pairs one word from each half of the mask
+
+	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull; // original words 0 and 2: keep bits 1, 2, 4, 8 (one per byte)
+	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull; // original words 1 and 3: keep bits 0x10..0x80 (one per byte)
+
+	// TODO: This can use v8x16_bitmask in the future
+	uint64_t mask_2 = mask_1a | mask_1b; // low 32 bits now describe lanes 0..7, high 32 bits lanes 8..15
+	uint64_t mask_4 = mask_2 | (mask_2 >> 16); // fold the upper 16 bits of each half onto the lower 16
+	uint64_t mask_8 = mask_4 | (mask_4 >> 8); // fold again so the lowest byte of each 32-bit half holds all 8 bits
+
+	mask0 = uint8_t(mask_8);
+	mask1 = uint8_t(mask_8 >> 32);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // WASM path: decodes one group into 16 bytes stored at buffer; bitslog2 (0..3) selects the encoding; returns data advanced past the consumed input
+	unsigned char byte, enc, encv; // NOTE(review): these three locals are never referenced in this function
+	const unsigned char* data_var; // NOTE(review): never referenced in this function
+
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		v128_t result = wasm_i8x16_splat(0); // the whole group is zero
+
+		wasm_v128_store(buffer, result);
+
+		return data; // no input bytes are consumed
+	}
+
+	case 1:
+	{
+		v128_t sel2 = wasm_v128_load(data); // 16-byte load starting at the selector bytes
+		v128_t rest = wasm_v128_load(data + 4); // 16-byte load of the bytes following the 4 selector bytes
+
+		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3)); // keep the low 2 bits of every byte: one selector per output byte
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3)); // lanes whose selector equals the sentinel 3
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1); // swizzle indices into rest
+
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); // sentinel lanes take swizzled rest bytes; the others keep sel
+
+		wasm_v128_store(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 2:
+	{
+		v128_t sel4 = wasm_v128_load(data); // 16-byte load starting at the selector bytes
+		v128_t rest = wasm_v128_load(data + 8); // 16-byte load of the bytes following the 8 selector bytes
+
+		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15)); // keep the low 4 bits of every byte: one selector per output byte
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15)); // lanes whose selector equals the sentinel 15
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1); // swizzle indices into rest
+
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); // sentinel lanes take swizzled rest bytes; the others keep sel
+
+		wasm_v128_store(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
+	case 3:
+	{
+		v128_t result = wasm_v128_load(data); // group is stored as 16 raw bytes
+
+		wasm_v128_store(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+SIMD_TARGET
+static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+{ // in-place byte transpose: afterwards each 32-bit lane holds the bytes found at one index of the original x0, x1, x2, x3
+	__m128i t0 = _mm_unpacklo_epi8(x0, x1); // interleave bytes 0..7 of x0 and x1
+	__m128i t1 = _mm_unpackhi_epi8(x0, x1); // interleave bytes 8..15 of x0 and x1
+	__m128i t2 = _mm_unpacklo_epi8(x2, x3); // interleave bytes 0..7 of x2 and x3
+	__m128i t3 = _mm_unpackhi_epi8(x2, x3); // interleave bytes 8..15 of x2 and x3
+
+	x0 = _mm_unpacklo_epi16(t0, t2); // source indices 0..3
+	x1 = _mm_unpackhi_epi16(t0, t2); // source indices 4..7
+	x2 = _mm_unpacklo_epi16(t1, t3); // source indices 8..11
+	x3 = _mm_unpackhi_epi16(t1, t3); // source indices 12..15
+}
+
+SIMD_TARGET
+static __m128i unzigzag8(__m128i v)
+{ // per byte: returns (v >> 1) ^ -(v & 1), so 0, 1, 2, 3 map to 0, -1, 1, -2
+	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); // 0x00 or 0xff depending on the low bit
+	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); // 16-bit shift, then clear the bit that leaked in from the neighbouring byte
+
+	return _mm_xor_si128(xl, xr);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+{ // in-place byte transpose: afterwards each 32-bit lane holds the bytes found at one index of the original x0, x1, x2, x3
+	uint8x16x2_t t01 = vzipq_u8(x0, x1); // byte-interleave x0 with x1
+	uint8x16x2_t t23 = vzipq_u8(x2, x3); // byte-interleave x2 with x3
+
+	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0])); // 16-bit interleave of the low halves
+	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1])); // 16-bit interleave of the high halves
+
+	x0 = vreinterpretq_u8_u16(x01.val[0]); // source indices 0..3
+	x1 = vreinterpretq_u8_u16(x01.val[1]); // source indices 4..7
+	x2 = vreinterpretq_u8_u16(x23.val[0]); // source indices 8..11
+	x3 = vreinterpretq_u8_u16(x23.val[1]); // source indices 12..15
+}
+
+static uint8x16_t unzigzag8(uint8x16_t v)
+{ // per byte: returns (v >> 1) ^ -(v & 1), so 0, 1, 2, 3 map to 0, -1, 1, -2
+	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); // 0x00 or 0xff depending on the low bit
+	uint8x16_t xr = vshrq_n_u8(v, 1); // per-byte logical shift right by one
+
+	return veorq_u8(xl, xr);
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+{ // in-place byte transpose of x0..x3 built from two rounds of unpack macros (8-bit, then 16-bit)
+	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); // NOTE(review): the v8x16 unpack macros are defined outside this view; presumably byte interleaves of the low/high halves
+	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
+	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
+	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
+
+	x0 = wasmx_unpacklo_v16x8(t0, t2); // 16-bit interleave of the low halves of t0 and t2
+	x1 = wasmx_unpackhi_v16x8(t0, t2); // 16-bit interleave of the high halves of t0 and t2
+	x2 = wasmx_unpacklo_v16x8(t1, t3); // 16-bit interleave of the low halves of t1 and t3
+	x3 = wasmx_unpackhi_v16x8(t1, t3); // 16-bit interleave of the high halves of t1 and t3
+}
+
+SIMD_TARGET
+static v128_t unzigzag8(v128_t v)
+{ // per byte: returns (v >> 1) ^ -(v & 1), so 0, 1, 2, 3 map to 0, -1, 1, -2
+	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1))); // 0x00 or 0xff depending on the low bit
+	v128_t xr = wasm_u8x16_shr(v, 1); // per-byte unsigned shift right by one
+
+	return wasm_v128_xor(xl, xr);
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+SIMD_TARGET
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{ // decodes buffer_size bytes (a multiple of the 16-byte group size) into buffer; returns the advanced input pointer, or 0 when the input is too short
+	assert(buffer_size % kByteGroupSize == 0);
+	assert(kByteGroupSize == 16);
+
+	const unsigned char* header = data; // the header holds one 2-bit code per group, four groups per byte
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0; // not enough input for the header
+
+	data += header_size; // group payloads start right after the header
+
+	size_t i = 0;
+
+	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
+	{
+		size_t header_offset = i / kByteGroupSize; // index of the first group in this batch
+		unsigned char header_byte = header[header_offset / 4]; // one header byte carries the codes of all four groups
+
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+	}
+
+	// slow-path: process remaining groups
+	for (; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return 0; // per-group bounds check failed
+
+		size_t header_offset = i / kByteGroupSize; // index of this group
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; // extract this group's 2-bit code
+
+		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+SIMD_TARGET
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{ // decodes one block of vertex_count vertices into vertex_data and updates last_vertex; returns the advanced input pointer, or 0 when a byte stream fails to decode
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize * 4]; // scratch for four decoded byte streams
+	unsigned char transposed[kVertexBlockSizeBytes]; // decoded vertices in interleaved layout, copied out at the end
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); // round up to a multiple of the group size
+
+	for (size_t k = 0; k < vertex_size; k += 4) // handle four bytes of every vertex per iteration
+	{
+		for (size_t j = 0; j < 4; ++j) // one byte stream per byte position, each decoded into its own slice of buffer
+		{
+			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
+			if (!data)
+				return 0;
+		}
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
+#endif
+
+		PREP(); // pi: running value initialised from last_vertex + k
+
+		unsigned char* savep = transposed + k; // output cursor; every SAVE advances it by vertex_size
+
+		for (size_t j = 0; j < vertex_count_aligned; j += 16) // 16 vertices per iteration
+		{
+			LOAD(0); // r0..r3: 16 bytes from each of the four decoded streams
+			LOAD(1);
+			LOAD(2);
+			LOAD(3);
+
+			r0 = unzigzag8(r0);
+			r1 = unzigzag8(r1);
+			r2 = unzigzag8(r2);
+			r3 = unzigzag8(r3);
+
+			transpose8(r0, r1, r2, r3); // regroup so the four bytes belonging to one vertex share a 32-bit lane
+
+			TEMP t0, t1, t2, t3;
+
+			GRP4(0); // spread r0 into four per-vertex values t0..t3
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3); // byte-wise add of each value to the running pi (pi is updated as well)
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3); // store 4 bytes per vertex
+
+			GRP4(1);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(2);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(3);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size); // copy out only vertex_count vertices; the loop above ran over the aligned count
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size); // the final decoded vertex becomes the new last_vertex
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+static unsigned int getCpuFeatures()
+{ // returns the third register (ECX) reported by CPUID leaf 1
+	int cpuinfo[4] = {};
+#ifdef _MSC_VER
+	__cpuid(cpuinfo, 1);
+#else
+	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+#endif
+	return cpuinfo[2];
+}
+
+unsigned int cpuid = getCpuFeatures(); // evaluated once during static initialization; meshopt_decodeVertexBuffer tests bit 9 of this value
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{ // encodes vertex_count vertices of vertex_size bytes into buffer; returns the number of bytes written, or 0 on failure
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+#if TRACE
+	memset(vertexstats, 0, sizeof(vertexstats));
+#endif
+
+	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
+
+	unsigned char* data = buffer;
+	unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size)
+		return 0; // no room for the header byte plus one vertex
+
+	int version = gEncodeVertexVersion;
+
+	*data++ = (unsigned char)(kVertexHeader | version); // header byte: kVertexHeader combined with the version
+
+	unsigned char first_vertex[256] = {}; // stays all zero when vertex_count == 0
+	if (vertex_count > 0)
+		memcpy(first_vertex, vertex_data, vertex_size);
+
+	unsigned char last_vertex[256] = {};
+	memcpy(last_vertex, first_vertex, vertex_size); // last_vertex starts as a copy of the first vertex
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; // the final block may be shorter
+
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return 0; // encodeVertexBlock returned null (defined outside this view)
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; // tail is max(kTailMaxSize, vertex_size) bytes
+
+	if (size_t(data_end - data) < tail_size)
+		return 0; // no room left for the tail
+
+	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
+	if (vertex_size < kTailMaxSize)
+	{
+		memset(data, 0, kTailMaxSize - vertex_size); // zero padding goes before the vertex bytes
+		data += kTailMaxSize - vertex_size;
+	}
+
+	memcpy(data, first_vertex, vertex_size); // the first vertex occupies the final vertex_size bytes of the stream
+	data += vertex_size;
+
+	assert(data >= buffer + tail_size);
+	assert(data <= buffer + buffer_size);
+
+#if TRACE
+	size_t total_size = data - buffer;
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		const Stats& vsk = vertexstats[k];
+
+		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+#if TRACE > 1
+		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
+		       int(vsk.header),
+		       int(vsk.bitg[0]), int(vsk.bitb[0]),
+		       int(vsk.bitg[1]), int(vsk.bitb[1]),
+		       int(vsk.bitg[2]), int(vsk.bitb[2]),
+		       int(vsk.bitg[3]), int(vsk.bitb[3]));
+#endif
+
+		printf("\n");
+	}
+#endif
+
+	return data - buffer; // total encoded size in bytes
+}
+
+size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
+{ // returns a buffer size computed from the block count, per-block header and data sizes, one header byte and the tail
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size; // number of blocks, rounded up
+
+	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4; // 2 header bits per group, rounded up to whole bytes
+	size_t vertex_block_data_size = vertex_block_size; // one byte per vertex in the block
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; // same tail size as meshopt_encodeVertexBuffer
+
+	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size; // 1 accounts for the stream header byte
+}
+
+void meshopt_encodeVertexVersion(int version)
+{ // sets the version that meshopt_encodeVertexBuffer writes into the stream header
+	assert(unsigned(version) <= 0); // only version 0 passes; the unsigned cast also rejects negative values
+
+	meshopt::gEncodeVertexVersion = version;
+}
+
+int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
+{ // decodes an encoded vertex stream into destination; returns 0 on success, -1 for a bad header/version, -2 for truncated or undecodable data, -3 for a wrong trailing size
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0; // block decoder chosen below
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock; // runtime choice: bit 9 of CPUID.1:ECX (SSSE3 per the x86 CPUID documentation)
+#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decode = decodeVertexBlockSimd;
+#else
+	decode = decodeVertexBlock;
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	assert(gDecodeBytesGroupInitialized);
+	(void)gDecodeBytesGroupInitialized; // keeps the variable referenced when assert compiles to nothing
+#endif
+
+	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
+
+	const unsigned char* data = buffer;
+	const unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size)
+		return -2; // too short to hold the header byte plus one vertex
+
+	unsigned char data_header = *data++;
+
+	if ((data_header & 0xf0) != kVertexHeader)
+		return -1; // high nibble does not match kVertexHeader
+
+	int version = data_header & 0x0f; // low nibble carries the version
+	if (version > 0)
+		return -1; // only version 0 is accepted
+
+	unsigned char last_vertex[256];
+	memcpy(last_vertex, data_end - vertex_size, vertex_size); // seed from the final vertex_size bytes of the stream, where the encoder stores the first vertex
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; // the final block may be shorter
+
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return -2; // the block decoder returned null
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; // same tail size the encoder writes
+
+	if (size_t(data_end - data) != tail_size)
+		return -3; // the bytes left over must be exactly the tail
+
+	return 0;
+}
+
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#undef SIMD_WASM
+#undef SIMD_FALLBACK
+#undef SIMD_TARGET
diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp
new file mode 100644
index 0000000000..e7ad2c9d39
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexfilter.cpp
@@ -0,0 +1,825 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <math.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
+#if defined(__SSE2__)
+#define SIMD_SSE
+#endif
+
+// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
+#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <emmintrin.h>
+#include <stdint.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
+#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
+#endif
+
+namespace meshopt
+{
+
+#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
+template <typename T>
+static void decodeFilterOct(T* data, size_t count)
+{ // scalar fallback: rewrites components 0..2 of each 4-component element in place; component 3 is left untouched
+	const float max = float((1 << (sizeof(T) * 8 - 1)) - 1); // 2^(bits - 1) - 1, e.g. 127 for a 1-byte T and 32767 for a 2-byte T
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float x = float(data[i * 4 + 0]);
+		float y = float(data[i * 4 + 1]);
+		float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y);
+
+		// fixup octahedral coordinates for z<0
+		float t = (z >= 0.f) ? 0.f : z; // t = min(z, 0)
+
+		x += (x >= 0.f) ? t : -t; // moves x towards zero by |t|
+		y += (y >= 0.f) ? t : -t; // moves y towards zero by |t|
+
+		// compute normal length & scale
+		float l = sqrtf(x * x + y * y + z * z);
+		float s = max / l; // scales the vector to length max
+
+		// rounded signed float->int
+		int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f));
+
+		data[i * 4 + 0] = T(xf);
+		data[i * 4 + 1] = T(yf);
+		data[i * 4 + 2] = T(zf);
+	}
+}
+
+static void decodeFilterQuat(short* data, size_t count)
+{ // scalar fallback: rewrites each 4-short element in place; the low 2 bits of the 4th input short decide where the reconstructed component is stored
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from the high byte of the component
+		int sf = data[i * 4 + 3] | 3; // force the low 2 bits to 1
+		float ss = scale / float(sf);
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float x = float(data[i * 4 + 0]) * ss;
+		float y = float(data[i * 4 + 1]) * ss;
+		float z = float(data[i * 4 + 2]) * ss;
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float ww = 1.f - x * x - y * y - z * z;
+		float w = sqrtf(ww >= 0.f ? ww : 0.f);
+
+		// rounded signed float->int
+		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
+		int wf = int(w * 32767.f + 0.5f); // w is never negative, so a single rounding offset suffices
+
+		int qc = data[i * 4 + 3] & 3; // slot index, read before the stores below overwrite the element
+
+		// output order is dictated by input index
+		data[i * 4 + ((qc + 1) & 3)] = short(xf);
+		data[i * 4 + ((qc + 2) & 3)] = short(yf);
+		data[i * 4 + ((qc + 3) & 3)] = short(zf);
+		data[i * 4 + ((qc + 0) & 3)] = short(wf);
+	}
+}
+
+static void decodeFilterExp(unsigned int* data, size_t count)
+{ // scalar fallback: replaces each 32-bit word (8-bit exponent on top, 24-bit mantissa below) with the bit pattern of the float m * 2^e
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int v = data[i];
+
+		// decode mantissa and exponent
+		int m = int(v << 8) >> 8; // sign-extended low 24 bits
+		int e = int(v) >> 24; // sign-extended top 8 bits
+
+		union
+		{
+			float f;
+			unsigned int ui;
+		} u; // used to move bits between float and unsigned int
+
+		// optimized version of ldexp(float(m), e)
+		u.ui = unsigned(e + 127) << 23; // biased exponent placed in the float exponent field, i.e. 2^e
+		u.f = u.f * float(m);
+
+		data[i] = u.ui; // store the float's bit pattern back in place
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+inline uint64_t rotateleft64(uint64_t v, int x)
+{ // rotates the 64-bit value v left by x bits, using a compiler intrinsic where one is available
+#if defined(_MSC_VER) && !defined(__clang__)
+	return _rotl64(v, x);
+// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
+// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
+#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
+	return __builtin_rotateleft64(v, x);
+#else
+	return (v << (x & 63)) | (v >> ((64 - x) & 63)); // both shift counts are masked to 0..63, so x == 0 never shifts by 64
+#endif
+}
+#endif
+
+#ifdef SIMD_SSE
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{ // SSE2 version for 8-bit components: each iteration loads and stores 16 bytes, i.e. four 4-byte elements
+	const __m128 sign = _mm_set1_ps(-0.f); // only the float sign bit is set in each lane
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): whole groups of 4 are touched; presumably callers pass count as a multiple of 4
+	{
+		__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
+		__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y))); // andnot with sign clears the sign bit, giving |x| and |y|
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign))); // flips the sign of t when x is negative before adding
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign))); // flips the sign of t when y is negative before adding
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll)); // uses the approximate reciprocal square root
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// combine xr/yr/zr into final value
+		__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000)); // keep the original 4th byte of every element
+		res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{ // SSE2 version for 16-bit components: each iteration loads and stores 32 bytes, i.e. four 4-short elements
+	const __m128 sign = _mm_set1_ps(-0.f); // only the float sign bit is set in each lane
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): whole groups of 4 are touched; presumably callers pass count as a multiple of 4
+	{
+		__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4])); // elements i and i+1
+		__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4])); // elements i+2 and i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
+		__m128i yf = _mm_srai_epi32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1))); // the z/w 32-bit pair of each element
+		__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff)); // keep the low 15 bits
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y))); // andnot with sign clears the sign bit, giving |x| and |y|
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign))); // flips the sign of t when x is negative before adding
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign))); // flips the sign of t when y is negative before adding
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll)); // uses a full sqrt and divide here, unlike the rsqrt in the 8-bit overload
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
+		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
+
+		// patch in .w
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000))); // copy the top 16 bits of each 64-bit element from the input
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{ // SSE2 version of decodeFilterQuat: each iteration rewrites four 4-short elements in place
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): whole groups of 4 are touched; presumably callers pass count as a multiple of 4
+	{
+		__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4])); // elements i and i+1
+		__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4])); // elements i+2 and i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1))); // the z/c 32-bit pair of each element
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
+		__m128i yf = _mm_srai_epi32(q4_xy, 16);
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
+		__m128i cf = _mm_srai_epi32(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
+		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
+		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
+
+		__m128 s = _mm_set1_ps(32767.f); // output scale for all four components
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
+		__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
+
+		// store results to stack so that we can rotate using scalar instructions
+		uint64_t res[4];
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]); // each element is written back as one 64-bit word
+
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4); // rotate count is 16 times the element's 4th input short, read before out[0] overwrites it
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{ // SSE2 version of decodeFilterExp: each iteration converts four 32-bit words in place to floats m * 2^e
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): whole groups of 4 are touched; presumably callers pass count as a multiple of 4
+	{
+		__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		__m128i ef = _mm_srai_epi32(v, 24); // sign-extended top 8 bits
+		__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23); // biased exponent placed in the float exponent field
+
+		// decode 24-bit mantissa into floating-point value
+		__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8); // sign-extended low 24 bits
+		__m128 m = _mm_cvtepi32_ps(mf);
+
+		__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m); // 2^e * m
+
+		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+inline float32x4_t vsqrtq_f32(float32x4_t x)
+{
+	float32x4_t r = vrsqrteq_f32(x); // start from the reciprocal square root estimate of x
+	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate (a single refinement step)
+	return vmulq_f32(r, x); // sqrt(x) = x * rsqrt(x)
+}
+
+inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
+{
+	float32x4_t r = vrecpeq_f32(y); // start from the reciprocal estimate of y
+	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate (a single refinement step)
+	return vmulq_f32(x, r); // x / y = x * (1 / y)
+}
+#endif
+
+#ifdef SIMD_NEON
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000); // mask that isolates the sign bit of a 32-bit float
+
+	for (size_t i = 0; i < count; i += 4) // four 4-byte vertices (one 128-bit register) per iteration
+	{
+		int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
+		int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z as zf - |x| - |y|; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0: t = min(z, 0), then x += (sign bit of x set ? -t : t), same for y
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale to the signed 8-bit range (127); the vrsqrteq_f32 estimate is used here without a refinement step
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// combine xr/yr/zr into final value; the top byte of every input lane is carried over unchanged
+		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
+		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const int32x4_t sign = vdupq_n_s32(0x80000000); // mask that isolates the sign bit of a 32-bit float
+
+	for (size_t i = 0; i < count; i += 4) // four 8-byte vertices (two 128-bit registers) per iteration
+	{
+		int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both x/y 16-bit pairs in each 32-bit lane (even 32-bit lanes of the two loads)
+		int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
+		int32x4_t yf = vshrq_n_s32(n4, 16);
+
+		// unpack z from the odd 32-bit lanes; note that z is unsigned so we don't need to sign extend it, masking to 15 bits is enough
+		int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
+		int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
+
+		// convert x and y to floats and reconstruct z as zf - |x| - |y|; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0: t = min(z, 0), then x += (sign bit of x set ? -t : t), same for y
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale to the signed 16-bit range (32767)
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
+		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
+
+		// patch in .w: take the top 16 bits of every 64-bit element from the original input, everything else from the result
+		res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
+		res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4) // four 8-byte quaternions (two 128-bit registers) per iteration
+	{
+		int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather the x/y 16-bit pairs (even 32-bit lanes) and the z/c pairs (odd 32-bit lanes) of the four quaternions
+		int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
+		int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
+
+		// sign-extends x,y from [x y] and z,c from [z c] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
+		int32x4_t yf = vshrq_n_s32(q4_xy, 16);
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
+		int32x4_t cf = vshrq_n_s32(q4_zc, 16);
+
+		// get a floating-point scale from c with its bottom 2 bits set to 1 (that integer represents 1.f); ss = (1/sqrt(2)) / sf
+		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
+		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
+		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+
+		// reconstruct w as a square root of 1 - x^2 - y^2 - z^2; we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
+
+		float32x4_t s = vdupq_n_f32(32767.f); // output scale: signed 16-bit range
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// rotate each 64-bit quaternion left by (c << 4) bits and store; NOTE(review): rotateleft64 is defined outside this view, presumably it reduces the amount modulo 64
+		uint64_t* out = (uint64_t*)&data[i * 4];
+
+		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
+		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
+		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
+		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4) // NEON path: decodes four packed 32-bit values per iteration, in place
+	{
+		int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
+
+		// decode exponent into 2^x directly: sign-extend the top 8 bits, add the float exponent bias (127) and move the sum into the exponent field
+		int32x4_t ef = vshrq_n_s32(v, 24);
+		int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value: sign-extend the low 24 bits, then convert int -> float
+		int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
+		float32x4_t m = vcvtq_f32_s32(mf);
+
+		float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m); // result = 2^exponent * mantissa
+
+		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f); // -0.f has only the sign bit set, so this is a sign-bit mask
+
+	for (size_t i = 0; i < count; i += 4) // four 4-byte vertices (one 128-bit vector) per iteration
+	{
+		v128_t n4 = wasm_v128_load(&data[i * 4]);
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
+		v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z as zf - |x| - |y|; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0: t = min(z, 0), then x += (sign bit of x set ? -t : t), same for y
+		// note: i32x4_min with 0 is equivalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale to the signed 8-bit range (127)
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// combine xr/yr/zr into final value; the top byte of every input lane is carried over unchanged
+		v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
+		res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f); // -0.f has only the sign bit set, so this is a sign-bit mask
+	const v128_t zmask = wasm_i32x4_splat(0x7fff); // keeps the low 15 bits of every 32-bit lane
+
+	for (size_t i = 0; i < count; i += 4) // four 8-byte vertices (two 128-bit vectors) per iteration
+	{
+		v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
+		v128_t yf = wasm_i32x4_shr(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it, masking with zmask is enough
+		v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
+		v128_t zf = wasm_v128_and(z4, zmask);
+
+		// convert x and y to floats and reconstruct z as zf - |x| - |y|; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0: t = min(z, 0), then x += (sign bit of x set ? -t : t), same for y
+		// note: i32x4_min with 0 is equivalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale to the signed 16-bit range (32767)
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
+		v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
+
+		// patch in .w: OR in the top 16 bits of every 64-bit element of the original input (those bits are 0 in res_0/res_1)
+		res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
+		res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0);
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4) // four 8-byte quaternions (two 128-bit vectors) per iteration
+	{
+		v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather the x/y 16-bit pairs and the z/c 16-bit pairs of the four quaternions into separate vectors
+		v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
+		v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);
+
+		// sign-extends x,y from [x y] and z,c from [z c] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
+		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
+		v128_t cf = wasm_i32x4_shr(q4_zc, 16);
+
+		// get a floating-point scale from c with its bottom 2 bits set to 1 (that integer represents 1.f); ss = (1/sqrt(2)) / sf
+		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
+		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
+		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+
+		// reconstruct w as a square root of 1 - x^2 - y^2 - z^2; we clamp to 0.f to avoid NaN due to precision errors
+		// note: i32x4_max with 0 is equivalent to f32x4_max
+		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
+		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
+
+		v128_t s = wasm_f32x4_splat(32767.f); // output scale: signed 16-bit range
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
+		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
+
+		// compute component index shifted left by 4 (and moved into i32x4 slot)
+		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
+		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+
+		// rotate each 64-bit quaternion left by its cm lane and store; NOTE(review): rotateleft64 is defined outside this view, presumably it reduces the amount modulo 64
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
+		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
+		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
+		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4) // WASM path: decodes four packed 32-bit values per iteration, in place
+	{
+		v128_t v = wasm_v128_load(&data[i]);
+
+		// decode exponent into 2^x directly: sign-extend the top 8 bits, add the float exponent bias (127) and move the sum into the exponent field
+		v128_t ef = wasm_i32x4_shr(v, 24);
+		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value: sign-extend the low 24 bits, then convert int -> float
+		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
+		v128_t m = wasm_f32x4_convert_i32x4(mf);
+
+		v128_t r = wasm_f32x4_mul(es, m); // es is used directly as float bits here: result = 2^exponent * mantissa
+
+		wasm_v128_store(&data[i], r);
+	}
+}
+#endif
+
+} // namespace meshopt
+
+void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size)
+{ // public entry point: runs the octahedral decode filter over 'buffer', choosing the kernel by component width
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0); // the SIMD kernels consume four vertices per loop iteration
+	assert(vertex_size == 4 || vertex_size == 8); // 4 bytes -> signed char components, 8 bytes -> short components
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (vertex_size == 4)
+		decodeFilterOctSimd(static_cast<signed char*>(buffer), vertex_count);
+	else
+		decodeFilterOctSimd(static_cast<short*>(buffer), vertex_count);
+#else
+	if (vertex_size == 4) // no SIMD backend selected: use the decodeFilterOct overloads instead
+		decodeFilterOct(static_cast<signed char*>(buffer), vertex_count);
+	else
+		decodeFilterOct(static_cast<short*>(buffer), vertex_count);
+#endif
+}
+
+void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size)
+{ // public entry point: runs the quaternion decode filter over 'buffer', treated as an array of shorts
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0); // the SIMD kernels consume four quaternions per loop iteration
+	assert(vertex_size == 8); // only 8-byte elements (four shorts) are accepted
+	(void)vertex_size; // vertex_size is otherwise only read by the assert above
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decodeFilterQuatSimd(static_cast<short*>(buffer), vertex_count);
+#else
+	decodeFilterQuat(static_cast<short*>(buffer), vertex_count); // no SIMD backend selected
+#endif
+}
+
+void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size)
+{ // public entry point: runs the exponent decode filter over every 32-bit word of 'buffer'
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0); // keeps the total word count a multiple of 4, which the SIMD kernels step by
+	assert(vertex_size % 4 == 0); // each vertex must consist of whole 32-bit words
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); // count = total number of 32-bit words
+#else
+	decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); // no SIMD backend selected; same word count
+#endif
+}
+
+#undef SIMD_SSE
+#undef SIMD_NEON
+#undef SIMD_WASM
diff --git a/thirdparty/meshoptimizer/vfetchanalyzer.cpp b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
new file mode 100644
index 0000000000..51dca873f8
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
@@ -0,0 +1,58 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{ // replays the index buffer against a simulated cache and returns bytes_fetched plus the overfetch ratio
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexFetchStatistics result = {};
+
+	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count); // one flag per vertex: 1 once any index references it
+	memset(vertex_visited, 0, vertex_count);
+
+	const size_t kCacheLine = 64;
+	const size_t kCacheSize = 128 * 1024;
+
+	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+	size_t cache[kCacheSize / kCacheLine] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		vertex_visited[index] = 1;
+
+		size_t start_address = index * vertex_size; // byte range [start_address, end_address) occupied by this vertex
+		size_t end_address = start_address + vertex_size;
+
+		size_t start_tag = start_address / kCacheLine;
+		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; // rounded up so a partially covered last line is included
+
+		assert(start_tag < end_tag);
+
+		for (size_t tag = start_tag; tag < end_tag; ++tag)
+		{
+			size_t line = tag % (sizeof(cache) / sizeof(cache[0])); // direct mapping: the slot is the tag modulo the number of lines
+
+			// we store +1 since cache is filled with 0 by default, so an empty slot never matches a real tag
+			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine; // a miss is charged one whole cache line
+			cache[line] = tag + 1;
+		}
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += vertex_visited[i];
+
+	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); // fetched bytes relative to the size of the referenced vertices; 0 when nothing is referenced
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/vfetchoptimizer.cpp b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
new file mode 100644
index 0000000000..465d6df5ca
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
@@ -0,0 +1,74 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{ // fills destination[] with new vertex numbers assigned in order of first use by 'indices'; returns the number of vertices used
+	assert(index_count % 3 == 0);
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int)); // all bytes 0xff => every entry is ~0u, meaning "not assigned yet"
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u) // first reference to this vertex: give it the next free slot
+		{
+			destination[index] = next_vertex++;
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex; // vertices never referenced by 'indices' keep the ~0u marker in destination[]
+}
+
+size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{ // copies vertices into 'destination' in order of first use, rewrites 'indices' to match, and returns the number of vertices written
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	// support in-place optimization: when source and destination alias, read from a private copy of the source
+	if (destination == vertices)
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+		vertices = vertices_copy;
+	}
+
+	// build vertex remap table; all bytes 0xff => every entry starts as ~0u ("not yet placed")
+	unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(vertex_remap, -1, vertex_count * sizeof(unsigned int));
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		unsigned int& remap = vertex_remap[index];
+
+		if (remap == ~0u) // vertex was not added to destination VB
+		{
+			// add vertex: copy its vertex_size bytes into the next free destination slot
+			memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size);
+
+			remap = next_vertex++;
+		}
+
+		// modify indices in place so they refer to the reordered vertices
+		indices[i] = remap;
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
diff --git a/thirdparty/rvo2/src/API.h b/thirdparty/rvo2/API.h
index c64efb452c..c64efb452c 100644
--- a/thirdparty/rvo2/src/API.h
+++ b/thirdparty/rvo2/API.h
diff --git a/thirdparty/rvo2/src/Agent.cpp b/thirdparty/rvo2/Agent.cpp
index 851d780758..851d780758 100644
--- a/thirdparty/rvo2/src/Agent.cpp
+++ b/thirdparty/rvo2/Agent.cpp
diff --git a/thirdparty/rvo2/src/Agent.h b/thirdparty/rvo2/Agent.h
index 16f75a08f6..16f75a08f6 100644
--- a/thirdparty/rvo2/src/Agent.h
+++ b/thirdparty/rvo2/Agent.h
diff --git a/thirdparty/rvo2/src/Definitions.h b/thirdparty/rvo2/Definitions.h
index a73aca9908..a73aca9908 100644
--- a/thirdparty/rvo2/src/Definitions.h
+++ b/thirdparty/rvo2/Definitions.h
diff --git a/thirdparty/rvo2/src/KdTree.cpp b/thirdparty/rvo2/KdTree.cpp
index bc224614f0..bc224614f0 100644
--- a/thirdparty/rvo2/src/KdTree.cpp
+++ b/thirdparty/rvo2/KdTree.cpp
diff --git a/thirdparty/rvo2/src/KdTree.h b/thirdparty/rvo2/KdTree.h
index 1dbad00ea4..1dbad00ea4 100644
--- a/thirdparty/rvo2/src/KdTree.h
+++ b/thirdparty/rvo2/KdTree.h
diff --git a/thirdparty/rvo2/src/Vector3.h b/thirdparty/rvo2/Vector3.h
index 8c8835c865..8c8835c865 100644
--- a/thirdparty/rvo2/src/Vector3.h
+++ b/thirdparty/rvo2/Vector3.h