diff --git a/include/bounce/dynamics/cloth/spring_cloth.h b/include/bounce/dynamics/cloth/spring_cloth.h
index 66a3aaf..5fa0736 100644
--- a/include/bounce/dynamics/cloth/spring_cloth.h
+++ b/include/bounce/dynamics/cloth/spring_cloth.h
@@ -20,13 +20,14 @@
 #define B3_SPRING_CLOTH_H
 
 #include <bounce/common/math/mat33.h>
-#include <bounce/collision/shapes/sphere.h>
 
-#define B3_CLOTH_SPHERE_CAPACITY 32
+#define B3_CLOTH_SHAPE_CAPACITY 32
 
 class b3StackAllocator;
 class b3Draw;
 
+class b3Shape;
+
 struct b3Mesh;
 
 struct b3SpringClothDef
@@ -94,13 +95,13 @@ enum b3MassType
 	e_dynamicMass
 };
 
-// This structure represents an acceleration constraint.
-struct b3MassCollision
+// 
+struct b3MassContact
 {
+	b3Vec3 n, t1, t2;
+	float32 Fn, Ft1, Ft2;
 	u32 j;
-	float32 s;
-	b3Vec3 n;
-	bool active;
+	bool lockOnSurface, slideOnSurface;
 };
 
 // Time step statistics
@@ -136,7 +137,13 @@ public:
 	b3MassType GetType(u32 i) const;
 
 	//
-	b3Sphere* CreateSphere(const b3Vec3& center, float32 radius);
+	void AddShape(b3Shape* shape);
+
+	// 
+	u32 GetShapeCount() const;
+
+	// 
+	b3Shape** GetShapes();
 
 	// 
 	const b3SpringClothStep& GetStep() const;
@@ -150,11 +157,16 @@ public:
 	//
 	void Draw(b3Draw* draw) const;
 protected:
-	void UpdateCollisions() const;
+	friend class b3SpringSolver;
+	
+	// Update contacts. 
+	// This is where some contacts might be initiated or terminated.
+	void UpdateContacts();
 	
 	b3StackAllocator* m_allocator;
 
 	b3Mesh* m_mesh;
+	float32 m_r;
 
 	b3Vec3 m_gravity;
 
@@ -163,17 +175,16 @@ protected:
 	b3Vec3* m_f;
 	float32* m_inv_m;
 	b3Vec3* m_y;
-	b3MassType* m_massTypes;
-	b3MassCollision* m_collisions;
+	b3MassType* m_types;
 	u32 m_massCount;
 
+	b3MassContact* m_contacts;
+	
 	b3Spring* m_springs;
 	u32 m_springCount;
 
-	float32 m_r;
-
-	b3Sphere m_spheres[B3_CLOTH_SPHERE_CAPACITY];
-	u32 m_sphereCount;
+	b3Shape* m_shapes[B3_CLOTH_SHAPE_CAPACITY];
+	u32 m_shapeCount;
 
 	b3SpringClothStep m_step;
 };
@@ -191,13 +202,23 @@ inline void b3SpringCloth::SetGravity(const b3Vec3& gravity)
 inline b3MassType b3SpringCloth::GetType(u32 i) const
 {
 	B3_ASSERT(i < m_massCount);
-	return m_massTypes[i];
+	return m_types[i];
 }
 
 inline void b3SpringCloth::SetType(u32 i, b3MassType type)
 {
 	B3_ASSERT(i < m_massCount);
-	m_massTypes[i] = type;
+	m_types[i] = type;
+}
+
+inline u32 b3SpringCloth::GetShapeCount() const
+{
+	return m_shapeCount;
+}
+
+inline b3Shape** b3SpringCloth::GetShapes() 
+{
+	return m_shapes;
 }
 
 inline const b3SpringClothStep& b3SpringCloth::GetStep() const
diff --git a/include/bounce/dynamics/cloth/spring_solver.h b/include/bounce/dynamics/cloth/spring_solver.h
new file mode 100644
index 0000000..f521790
--- /dev/null
+++ b/include/bounce/dynamics/cloth/spring_solver.h
@@ -0,0 +1,93 @@
+/*
+* Copyright (c) 2016-2016 Irlan Robson http://www.irlan.net
+*
+* This software is provided 'as-is', without any express or implied
+* warranty.  In no event will the authors be held liable for any damages
+* arising from the use of this software.
+* Permission is granted to anyone to use this software for any purpose,
+* including commercial applications, and to alter it and redistribute it
+* freely, subject to the following restrictions:
+* 1. The origin of this software must not be misrepresented; you must not
+* claim that you wrote the original software. If you use this software
+* in a product, an acknowledgment in the product documentation would be
+* appreciated but is not required.
+* 2. Altered source versions must be plainly marked as such, and must not be
+* misrepresented as being the original software.
+* 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_SPRING_SOLVER_H
+#define B3_SPRING_SOLVER_H
+
+#include <bounce/common/math/vec3.h>
+#include <bounce/common/math/mat33.h>
+
+class b3SpringCloth;
+class b3StackAllocator;
+
+struct b3MassContact;
+struct b3Spring;
+
+enum b3MassType;
+
+struct b3SpringSolverDef
+{
+	b3SpringCloth* cloth;
+	float32 dt;
+};
+
+class b3SpringSolver
+{
+public:
+	b3SpringSolver(const b3SpringSolverDef& def);
+
+	~b3SpringSolver();
+
+	void Solve(b3Vec3* constraintForces);
+
+	u32 GetIterations() const;
+private:
+	// Apply internal forces and store their unique derivatives.
+	void InitializeSpringForces();
+
+	// Initialize b, from Ax = b
+	void Compute_b(b3Vec3* b) const;
+
+	// Solve Ax = b using the Modified Conjugate Gradient (MCG). 
+	// Output x and the residual error f.
+	void Solve_MCG(b3Vec3* x, b3Vec3* f, u32& iterations, const b3Vec3* b) const;
+
+	// Solve Ax = b using MCG with Jacobi preconditioning.
+	// Output x and the residual error f.
+	// This method is slower than MCG because we have to compute the preconditioning 
+	// matrix P, but it can improve convergence.
+	void Solve_MPCG(b3Vec3* x, b3Vec3* f, u32& iterations, const b3Vec3* b) const;
+
+	b3SpringCloth * m_cloth;
+	float32 m_h;
+	b3Mat33* m_Jx;
+	b3Mat33* m_Jv;
+	u32 m_iterations;
+
+	b3StackAllocator* m_allocator;
+
+	b3Vec3* m_x;
+	b3Vec3* m_v;
+	b3Vec3* m_f;
+	float32* m_inv_m;
+	b3Vec3* m_y;
+	b3MassType* m_types;
+	u32 m_massCount;
+
+	b3MassContact* m_contacts;
+	
+	b3Spring* m_springs;
+	u32 m_springCount;
+};
+
+inline u32 b3SpringSolver::GetIterations() const
+{
+	return m_iterations;
+}
+
+#endif
\ No newline at end of file
diff --git a/src/bounce/dynamics/cloth/spring_cloth.cpp b/src/bounce/dynamics/cloth/spring_cloth.cpp
index 1ea3b9f..04cedff 100644
--- a/src/bounce/dynamics/cloth/spring_cloth.cpp
+++ b/src/bounce/dynamics/cloth/spring_cloth.cpp
@@ -17,11 +17,12 @@
 */
 
 #include <bounce/dynamics/cloth/spring_cloth.h>
+#include <bounce/dynamics/cloth/spring_solver.h>
+#include <bounce/dynamics/shapes/shape.h>
 #include <bounce/collision/shapes/mesh.h>
 #include <bounce/common/memory/stack_allocator.h>
 
-// Here, we solve Ax = b using the Modified Conjugate Gradient method.
-// This work is based on the paper "Large Steps in Cloth Simulation - David Baraff, Andrew Witkin".
+#define B3_FORCE_THRESHOLD (0.1f)
 
 b3SpringCloth::b3SpringCloth()
 {
@@ -36,16 +37,16 @@ b3SpringCloth::b3SpringCloth()
 	m_f = nullptr;
 	m_y = nullptr;
 	m_inv_m = nullptr;
-	m_massTypes = nullptr;
-	m_collisions = nullptr;
+	m_types = nullptr;
+	m_contacts = nullptr;
 	m_massCount = 0;
-	
+
 	m_springs = nullptr;
-	m_springCount = 0; 
-	
+	m_springCount = 0;
+
 	m_r = 0.0f;
 
-	m_sphereCount = 0;
+	m_shapeCount = 0;
 
 	m_step.iterations = 0;
 }
@@ -57,8 +58,8 @@ b3SpringCloth::~b3SpringCloth()
 	b3Free(m_f);
 	b3Free(m_inv_m);
 	b3Free(m_y);
-	b3Free(m_massTypes);
-	b3Free(m_collisions);
+	b3Free(m_types);
+	b3Free(m_contacts);
 	b3Free(m_springs);
 }
 
@@ -72,7 +73,7 @@ void b3SpringCloth::Initialize(const b3SpringClothDef& def)
 	m_gravity = def.gravity;
 
 	m_r = def.r;
-	
+
 	const b3Mesh* m = m_mesh;
 
 	m_massCount = m->vertexCount;
@@ -81,18 +82,23 @@ void b3SpringCloth::Initialize(const b3SpringClothDef& def)
 	m_f = (b3Vec3*)b3Alloc(m_massCount * sizeof(b3Vec3));
 	m_inv_m = (float32*)b3Alloc(m_massCount * sizeof(float32));
 	m_y = (b3Vec3*)b3Alloc(m_massCount * sizeof(b3Vec3));
-	m_massTypes = (b3MassType*)b3Alloc(m_massCount * sizeof(b3MassType));
-	m_collisions = (b3MassCollision*)b3Alloc(m_massCount * sizeof(b3MassCollision));
+	m_types = (b3MassType*)b3Alloc(m_massCount * sizeof(b3MassType));
+	m_contacts = (b3MassContact*)b3Alloc(m_massCount * sizeof(b3MassContact));
 
 	for (u32 i = 0; i < m->vertexCount; ++i)
 	{
-		m_collisions[i].active = false;
+		m_contacts[i].Fn = 0.0f;
+		m_contacts[i].Ft1 = 0.0f;
+		m_contacts[i].Ft2 = 0.0f;
+		m_contacts[i].lockOnSurface = false;
+		m_contacts[i].slideOnSurface = false;
+
 		m_x[i] = m->vertices[i];
 		m_v[i].SetZero();
 		m_f[i].SetZero();
 		m_inv_m[i] = 0.0f;
 		m_y[i].SetZero();
-		m_massTypes[i] = e_staticMass;
+		m_types[i] = e_staticMass;
 	}
 
 	// Initialize mass
@@ -113,25 +119,25 @@ void b3SpringCloth::Initialize(const b3SpringClothDef& def)
 		m_inv_m[t->v2] += inv3 * mass;
 		m_inv_m[t->v3] += inv3 * mass;
 	}
-	
+
 	// Invert
 	for (u32 i = 0; i < m_massCount; ++i)
 	{
 		if (m_inv_m[i] > 0.0f)
 		{
 			m_inv_m[i] = 1.0f / m_inv_m[i];
-			m_massTypes[i] = e_dynamicMass;
+			m_types[i] = e_dynamicMass;
 		}
 	}
 
 	// Initialize springs
 	m_springs = (b3Spring*)b3Alloc(3 * m->triangleCount * sizeof(b3Spring));
-	
+
 	// Streching
 	for (u32 i = 0; i < m->triangleCount; ++i)
 	{
 		b3Triangle* t = m->triangles + i;
-		
+
 		u32 is[3] = { t->v1, t->v2, t->v3 };
 		for (u32 j = 0; j < 3; ++j)
 		{
@@ -168,624 +174,185 @@ void b3SpringCloth::Initialize(const b3SpringClothDef& def)
 	}
 }
 
-b3Sphere* b3SpringCloth::CreateSphere(const b3Vec3& center, float32 radius)
+void b3SpringCloth::AddShape(b3Shape* shape)
 {
-	B3_ASSERT(m_sphereCount < B3_CLOTH_SPHERE_CAPACITY);
-	
-	if (m_sphereCount == B3_CLOTH_SPHERE_CAPACITY)
-	{
-		return nullptr;
-	}
+	B3_ASSERT(m_shapeCount < B3_CLOTH_SHAPE_CAPACITY);
 
-	b3Sphere* sphere = m_spheres + m_sphereCount;
-	sphere->vertex = center;
-	sphere->radius = radius;
-	++m_sphereCount;
-	return sphere;
-}
-
-static B3_FORCE_INLINE void b3Make_z(b3Vec3* out, u32 size,
-	const b3MassType* types, const b3MassCollision* collisions)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		switch (types[i])
-		{
-		case e_staticMass:
-		{
-			out[i].SetZero();
-			break;
-		}
-		case e_dynamicMass:
-		{
-			if (collisions[i].active)
-			{
-				out[i].SetZero();
-				break;
-			}
-
-			out[i].SetZero();
-			break;
-		}
-		default:
-		{
-			B3_ASSERT(false);
-			break;
-		}
-		}
-	}
-}
-
-static B3_FORCE_INLINE void b3Filter(b3Vec3* out, const b3Vec3* v, u32 size, 
-	const b3MassType* types, const b3MassCollision* collisions)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		switch (types[i])
-		{
-		case e_staticMass:
-		{
-			out[i].SetZero();
-			break;
-		}
-		case e_dynamicMass:
-		{
-			if (collisions[i].active)
-			{
-				b3Vec3 n = collisions[i].n;
-
-				b3Mat33 S = b3Mat33_identity - b3Outer(n, n);
-
-				out[i] = S * v[i];
-				break;
-			}
-
-			out[i] = v[i];
-			break;
-		}
-		default:
-		{
-			B3_ASSERT(false);
-			break;
-		}
-		}
-	}
-}
-
-static B3_FORCE_INLINE void b3SetZero(b3Vec3* out, u32 size)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		out[i].SetZero();
-	}
-}
-
-static B3_FORCE_INLINE void b3Copy(b3Vec3* out, const b3Vec3* v, u32 size)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		out[i] = v[i];
-	}
-}
-
-static B3_FORCE_INLINE void b3Add(b3Vec3* out, const b3Vec3* a, const b3Vec3* b, u32 size)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		out[i] = a[i] + b[i];
-	}
-}
-
-static B3_FORCE_INLINE void b3Sub(b3Vec3* out, const b3Vec3* a, const b3Vec3* b, u32 size)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		out[i] = a[i] - b[i];
-	}
-}
-
-static B3_FORCE_INLINE float32 b3Dot(const b3Vec3* a, const b3Vec3* b, u32 size)
-{
-	float32 result = 0.0f;
-	for (u32 i = 0; i < size; ++i)
-	{
-		result += b3Dot(a[i], b[i]);
-	}
-	return result;
-}
-
-#define B3_INDEX(i, j, size) (i + j * size)
-
-static B3_FORCE_INLINE void b3SetZero(b3Mat33* out, u32 size)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		for (u32 j = 0; j < size; ++j)
-		{
-			out[B3_INDEX(i, j, size)].SetZero();
-		}
-	}
-}
-
-static B3_FORCE_INLINE void b3Mul(b3Vec3* out, const b3Mat33* M, const b3Vec3* v, u32 size)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		out[i].SetZero();
-
-		for (u32 j = 0; j < size; ++j)
-		{
-			out[i] += M[ B3_INDEX(i, j, size) ] * v[j];
-		}
-	}
-}
-
-// J = dfdx or dvdx
-static B3_FORCE_INLINE void b3Mul_Jacobian(b3Vec3* out, const b3Vec3* v, u32 mass_size, 
-	const b3Mat33* J_ii, const b3Spring* springs, u32 spring_size)
-{
-	b3SetZero(out, mass_size);
-
-	for (u32 i = 0; i < spring_size; ++i)
-	{
-		const b3Spring* S = springs + i;
-		u32 i1 = S->i1;
-		u32 i2 = S->i2;
-
-		b3Mat33 J_11 = J_ii[i];
-		b3Mat33 J_12 = -J_11;
-		b3Mat33 J_21 = J_12;
-		b3Mat33 J_22 = J_11;
-
-		out[i1] += J_11 * v[i1] + J_12 * v[i2];
-		out[i2] += J_21 * v[i1] + J_22 * v[i2];
-	}
-}
-
-static B3_FORCE_INLINE void b3SetZero_Jacobian(b3Mat33* out, u32 size)
-{
-	for (u32 i = 0; i < size; ++i)
-	{
-		out[i].SetZero();
-	}
-}
-
-// A = M - h * dfdv - h * h * dfdx
-// A * v = (M - h * dfdv - h * h * dfdx) * v = M * v + (-h * dfdv * v) + (-h * h * dfdx * v) 
-static B3_FORCE_INLINE void b3Mul_A(b3Vec3* out, const b3Vec3* v, u32 mass_size, 
-	b3StackAllocator* allocator, 
-	const float32* inv_m, float32 h, const b3Mat33* Jx, const b3Mat33* Jv, const b3Spring* springs, u32 spring_size)
-{
-	// v1 = M * v
-	b3Vec3* v1 = (b3Vec3*)allocator->Allocate(mass_size * sizeof(b3Vec3));
-	for (u32 i = 0; i < mass_size; ++i)
-	{
-		float32 m = inv_m[i] != 0.0f ? 1.0f / inv_m[i] : 0.0f;
-		v1[i] = m * v[i];
-	}
-
-	// v2 = (-h * dfdv * v)
-	b3Vec3* v2 = (b3Vec3*)allocator->Allocate(mass_size * sizeof(b3Vec3));
-	b3Mul_Jacobian(v2, v, mass_size, Jv, springs, spring_size);
-	for (u32 i = 0; i < mass_size; ++i)
-	{
-		v2[i] *= -h;
-	}
-
-	// v3 = (-h * h * dfdx * v)
-	b3Vec3* v3 = (b3Vec3*)allocator->Allocate(mass_size * sizeof(b3Vec3));
-	b3Mul_Jacobian(v3, v, mass_size, Jx, springs, spring_size);
-	for (u32 i = 0; i < mass_size; ++i)
-	{
-		v3[i] *= -h * h;
-	}
-
-	// v = v1 + v2 + v3
-	for (u32 i = 0; i < mass_size; ++i)
-	{
-		out[i] = v1[i] + v2[i] + v3[i];
-	}
-
-	allocator->Free(v3);
-	allocator->Free(v2);
-	allocator->Free(v1);
-}
-
-void b3SpringCloth::UpdateCollisions() const
-{
-	// Compute cloth-solid collision position alteration
-	for (u32 i = 0; i < m_massCount; ++i)
-	{
-		// Clear flag
-		m_collisions[i].active = false;
-
-		b3Vec3 c1 = m_x[i];
-		float32 r1 = m_r;
-
-		// Only solve the deepest penetrations
-		float32 bestSeparation = B3_MAX_FLOAT;
-		u32 bestIndex = ~0;
-
-		for (u32 j = 0; j < m_sphereCount; ++j)
-		{
-			const b3Sphere* sphere = m_spheres + j;
-			b3Vec3 c2 = sphere->vertex;
-			float32 r2 = sphere->radius;
-
-			b3Vec3 d = c2 - c1;
-			float32 dd = b3Dot(d, d);
-			float32 totalRadius = r1 + r2;
-			if (dd > totalRadius * totalRadius)
-			{
-				continue;
-			}
-
-			float32 distance = b3Length(d);
-			float32 separation = distance - totalRadius;
-
-			if (separation < bestSeparation)
-			{
-				bestSeparation = separation;
-				bestIndex = j;
-			}
-		}
-
-		if (bestIndex != ~0)
-		{
-			const b3Sphere* sphere = m_spheres + bestIndex;
-			b3Vec3 c2 = sphere->vertex;
-			float32 r2 = sphere->radius;
-
-			float32 totalRadius = r1 + r2;
-
-			b3Vec3 d = c2 - c1;
-			float32 distance = b3Length(d);
-			float32 separation = distance - totalRadius;
-
-			b3Vec3 n(0.0f, 1.0f, 0.0f);
-			if (distance > B3_EPSILON)
-			{
-				n = d / distance;
-			}
-
-			// Avoid large corrections
-			const float32 kMaxCorrection = 0.75f;
-			separation = b3Clamp(separation, -kMaxCorrection, 0.0f);
-
-			b3Vec3 dx1 = separation * n;
-
-			// Add position alteration
-			m_y[i] += dx1;
-
-			m_collisions[i].active = true;
-			m_collisions[i].j = bestIndex;
-			m_collisions[i].s = separation;
-			m_collisions[i].n = n;
-		}
-	}
-}
-
-void b3SpringCloth::Step(float32 h)
-{
-	if (h == 0.0f)
+	if (m_shapeCount == B3_CLOTH_SHAPE_CAPACITY)
 	{
 		return;
 	}
 
-	// Detect and store collisions
-	UpdateCollisions();
+	m_shapes[m_shapeCount++] = shape;
+}
 
-	u32 size = m_massCount;
-	b3MassType* types = m_massTypes;
-	u32 spring_size = m_springCount;
-
-	// Add gravity
-	for (u32 i = 0; i < size; ++i)
+static B3_FORCE_INLINE void b3MakeTangents(b3Vec3& t1, b3Vec3& t2, const b3Vec3& dv, const b3Vec3& n)
+{
+	t1 = dv - b3Dot(dv, n) * n;
+	if (b3Dot(t1, t1) > B3_EPSILON * B3_EPSILON)
 	{
-		if (types[i] == e_dynamicMass)
+		t1.Normalize();
+		t2 = b3Cross(t1, n);
+	}
+	else
+	{
+		t1 = b3Perp(n);
+		t2 = b3Cross(t1, n);
+	}
+}
+
+void b3SpringCloth::UpdateContacts()
+{
+	for (u32 i = 0; i < m_massCount; ++i)
+	{
+		b3MassContact* c = m_contacts + i;
+
+		bool wasLocked = c->lockOnSurface;
+		bool wasSliding = c->slideOnSurface;
+
+		b3Sphere s1;
+		s1.vertex = m_x[i];
+		s1.radius = m_r;
+
+		// Solve the deepest penetration
+		float32 bestSeparation = 0.0f;
+		b3Vec3 bestNormal(0.0f, 0.0f, 0.0f);
+		u32 bestIndex = ~0;
+
+		for (u32 j = 0; j < m_shapeCount; ++j)
+		{
+			b3Shape* shape = m_shapes[j];
+
+			b3Transform xf2;
+			xf2.SetIdentity();
+
+			b3TestSphereOutput output;
+			if (shape->TestSphere(&output, s1, xf2) == false)
+			{
+				continue;
+			}
+
+			if (output.separation < bestSeparation)
+			{
+				bestSeparation = output.separation;
+				bestNormal = output.normal;
+				bestIndex = j;
+			}
+		}
+
+		if (bestIndex == ~0)
+		{
+			c->Fn = 0.0f;
+			c->Ft1 = 0.0f;
+			c->Ft2 = 0.0f;
+			c->lockOnSurface = false;
+			c->slideOnSurface = false;
+			continue;
+		}
+
+		B3_ASSERT(bestSeparation <= 0.0f);
+
+		const b3Shape* shape = m_shapes[bestIndex];
+		float32 s = bestSeparation;
+		
+		// Ensure the normal points to the shape
+		b3Vec3 n = -bestNormal;
+
+		// Apply position correction
+		b3Vec3 dx = s * n;
+		
+		m_y[i] += dx;
+
+		// Update contact state
+		if (wasLocked)
+		{
+			// Was the contact force attractive?
+			if (c->Fn > -B3_FORCE_THRESHOLD)
+			{
+				// Terminate the contact.
+				c->lockOnSurface = false;
+				continue;
+			}
+
+			// Since the contact force was repulsive 
+			// maintain the acceleration constraint.
+			c->n = n;
+			c->j = bestIndex;
+			c->lockOnSurface = true;
+		}
+		else
+		{
+			// The contact has began.
+			c->n = n;
+			c->Fn = 0.0f;
+			c->Ft1 = 0.0f;
+			c->Ft2 = 0.0f;
+			c->j = bestIndex;
+			c->lockOnSurface = true;
+
+			// Relative velocity
+			b3Vec3 dv = m_v[i];
+			
+			b3MakeTangents(c->t1, c->t2, dv, n);
+			c->slideOnSurface = false;
+
+			continue;
+		}
+	}
+}
+
+void b3SpringCloth::Step(float32 dt)
+{
+	if (dt == 0.0f)
+	{
+		return;
+	}
+
+	// Update contacts
+	UpdateContacts();
+
+	// Apply gravity
+	for (u32 i = 0; i < m_massCount; ++i)
+	{
+		if (m_types[i] == e_dynamicMass)
 		{
 			m_f[i] += m_gravity;
 		}
 	}
 
-	// Compute non-zero Jacobians Jx, Jv
-	b3Mat33* Jx = (b3Mat33*)m_allocator->Allocate(spring_size * sizeof(b3Mat33));
-	b3SetZero_Jacobian(Jx, spring_size);
+	// Solve springs, constraints, and integrate
+	b3SpringSolverDef solverDef;
+	solverDef.cloth = this;
+	solverDef.dt = dt;
 
-	b3Mat33* Jv = (b3Mat33*)m_allocator->Allocate(spring_size * sizeof(b3Mat33));
-	b3SetZero_Jacobian(Jv, spring_size);
+	b3SpringSolver solver(solverDef);
 
-	// Compute forces and Jacobians
-	for (u32 i = 0; i < m_springCount; ++i)
-	{
-		b3Spring* S = m_springs + i;
+	// Constraint forces that are required to satisfy the constraints
+	b3Vec3* forces = (b3Vec3*)m_allocator->Allocate(m_massCount * sizeof(b3Vec3));
 
-		b3Vec3 x1 = m_x[S->i1];
-		b3Vec3 v1 = m_v[S->i1];
+	solver.Solve(forces);
 
-		b3Vec3 x2 = m_x[S->i2];
-		b3Vec3 v2 = m_v[S->i2];
-
-		// Strech
-		b3Vec3 dx = x2 - x1;
-		float32 L = b3Length(dx);
-		b3Vec3 n = dx;
-		if (L > 0.0f)
-		{
-			n /= L;
-		}
-
-		float32 C = L - S->L0;
-		
-		// Compute streching forces
-		b3Vec3 sf1 = -S->ks * C * -n;
-		b3Vec3 sf2 = -sf1;
-
-		m_f[S->i1] += sf1;
-		m_f[S->i2] += sf2;
-
-		// Compute damping forces
-		b3Vec3 dv = v2 - v1;
-
-		b3Vec3 df1 = -S->kd * -dv;
-		b3Vec3 df2 = -df1;
-
-		m_f[S->i1] += df1;
-		m_f[S->i2] += df2;
-
-		b3Mat33 I = b3Mat33_identity;
-		
-		// Compute Jx11	
-		float32 inv_L = L > 0.0f ? 1.0f / L : 0.0f;
-		
-		float32 L2 = L * L;
-		float32 inv_L2 = L2 > 0.0f ? 1.0f / L2 : 0.0f;
-
-		// Hessian
-		// del^2_C / del_x
-		b3Mat33 H_11 = inv_L * I + inv_L2 * b3Outer(dx, -n);
-
-		// del_C / del_x * del_C / del_x^T
-		b3Mat33 JJ_11 = b3Outer(-n, -n);
-
-		b3Mat33 Jx11 = -S->ks * (C * H_11 + JJ_11);
-		Jx[i] = Jx11;
-
-		// Compute Jv11
-		b3Mat33 Jv11 = -S->kd * I;
-		Jv[i] = Jv11;
-	}
-	
-	// Solve Ax = b
-
-	// Compute b
-
-	// b = h * (f0 + h * dfdx * v0 + dfdx * y) ) 
-	b3Vec3* b = (b3Vec3*) m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	// Jx_v = dfdx * v
-	b3Vec3* Jx_v = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-	// b3Mul(Jx_v, dfdx, v, size);
-	b3Mul_Jacobian(Jx_v, m_v, size, Jx, m_springs, m_springCount);
-
-	// Jx_v0y = dfdx * y
-	b3Vec3* Jx_y = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-	// b3Mul(Jx_y, dfdx, y, size);
-	b3Mul_Jacobian(Jx_y, m_y, size, Jx, m_springs, m_springCount);
-
-	// b = h * (f0 + h * Jx_v + Jx_y )
-	for (u32 i = 0; i < size; ++i)
-	{
-		b[i] = h * (m_f[i] + h * Jx_v[i] + Jx_y[i]);
-	}
-
-	m_allocator->Free(Jx_y);
-	m_allocator->Free(Jx_v);
-
-	// Solve Ax = b
-	b3Vec3* z = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	b3Vec3* dv = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	b3Vec3* Adv = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	b3Vec3* r = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	b3Vec3* c = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	b3Vec3* q = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-	
-	b3Vec3* s = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	b3Vec3* P = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	b3Vec3* inv_P = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-
-	// Compute z
-	b3Make_z(z, size, types, m_collisions);
-
-	// dv = z
-	b3Copy(dv, z, size);
-
-	// Adv = A * dv 
-	// b3Mul(Adv, A, dv, size);
-	b3Mul_A(Adv, dv, size, m_allocator, m_inv_m, h, Jx, Jv, m_springs, m_springCount);
-
-	// Compute P, P^-1
-	// We compute A because P = diag(A)^-1
-	// Note this is not necessary, and should be optimized as soon 
-	// as possible.
-
-	// A = M - h * dfdv - h * h * dfdx
-	b3Mat33* A = (b3Mat33*)m_allocator->Allocate(size * size * sizeof(b3Mat33));
-
-	// A = 0
-	b3SetZero(A, size);
-
-	// Compute dfdx, dfdv
-	b3Mat33* dfdx = (b3Mat33*)m_allocator->Allocate(size * size * sizeof(b3Mat33));
-	b3SetZero(dfdx, size);
-
-	b3Mat33* dfdv = (b3Mat33*)m_allocator->Allocate(size * size * sizeof(b3Mat33));
-	b3SetZero(dfdv, size);
-
-	for (u32 i = 0; i < m_springCount; ++i)
-	{
-		b3Spring* S = m_springs + i;
-
-		b3Mat33 Jx11 = Jx[i];
-		b3Mat33 Jx12 = -Jx11;
-		b3Mat33 Jx21 = Jx12;
-		b3Mat33 Jx22 = Jx11;
-
-		dfdx[B3_INDEX(S->i1, S->i1, size)] += Jx11;
-		dfdx[B3_INDEX(S->i1, S->i2, size)] += Jx12;
-		dfdx[B3_INDEX(S->i2, S->i1, size)] += Jx21;
-		dfdx[B3_INDEX(S->i2, S->i2, size)] += Jx22;
-
-		b3Mat33 Jv11 = Jv[i];
-		b3Mat33 Jv12 = -Jv11;
-		b3Mat33 Jv21 = Jv12;
-		b3Mat33 Jv22 = Jv11;
-
-		dfdv[B3_INDEX(S->i1, S->i1, size)] += Jv11;
-		dfdv[B3_INDEX(S->i1, S->i2, size)] += Jv12;
-		dfdv[B3_INDEX(S->i2, S->i1, size)] += Jv21;
-		dfdv[B3_INDEX(S->i2, S->i2, size)] += Jv22;
-	}
-
-	// A += M
-	for (u32 i = 0; i < size; ++i)
-	{
-		float32 m = 1.0f / m_inv_m[i];
-
-		A[B3_INDEX(i, i, size)] += b3Diagonal(m);
-	}
-
-	// A += - h * dfdv - h * h * dfdx
-	for (u32 i = 0; i < size; ++i)
-	{
-		for (u32 j = 0; j < size; ++j)
-		{
-			A[B3_INDEX(i, j, size)] += (-h * dfdv[B3_INDEX(i, j, size)]) + (-h * h * dfdx[B3_INDEX(i, j, size)]);
-		}
-	}
-
-	for (u32 i = 0; i < size; ++i)
-	{
-		b3Mat33 D = A[B3_INDEX(i, i, size)];
-
-		B3_ASSERT(D[0][0] != 0.0f);
-		B3_ASSERT(D[1][1] != 0.0f);
-		B3_ASSERT(D[2][2] != 0.0f);
-
-		P[i] = b3Vec3(1.0f / D[0][0], 1.0f / D[1][1], 1.0f / D[2][2]);
-		inv_P[i] = b3Vec3(D[0][0], D[1][1], D[2][2]);
-	}
-
-	m_allocator->Free(dfdv);
-	m_allocator->Free(dfdx);
-	m_allocator->Free(A);
-
-	// eps0 = dot( filter(b), P * filter(b) )
-	b3Vec3* filter_b = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-	b3Filter(filter_b, b, size, types, m_collisions);
-
-	b3Vec3* P_filter_b = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
-	for (u32 i = 0; i < size; ++i)
-	{
-		P_filter_b[i][0] = P[i][0] * filter_b[i][0];
-		P_filter_b[i][1] = P[i][1] * filter_b[i][1];
-		P_filter_b[i][2] = P[i][2] * filter_b[i][2];
-	}
-
-	float32 eps0 = b3Dot(filter_b, P_filter_b, size);
-
-	m_allocator->Free(P_filter_b);
-	m_allocator->Free(filter_b);
-
-	// r = filter(b - Adv)
-	b3Sub(r, b, Adv, size);
-	b3Filter(r, r, size, types, m_collisions);
-
-	// c = filter(P^-1 * r)
+	// Store constraint forces for physics logic
 	for (u32 i = 0; i < m_massCount; ++i)
 	{
-		c[i][0] = inv_P[i][0] * r[i][0];
-		c[i][1] = inv_P[i][1] * r[i][1];
-		c[i][2] = inv_P[i][2] * r[i][2];
-	}
-	b3Filter(c, c, size, types, m_collisions);
+		b3Vec3 force = forces[i];
 
-	// epsNew = dot(r, c)
-	float32 epsNew = b3Dot(r, c, size);
+		b3MassContact* contact = m_contacts + i;
 
-	// This is in [0, 1]
-	// Making it smaller can increase accuracy, but it might increase the number 
-	// of iterations to be taken by the solver.
-	const float32 kTol = 0.25f;
+		// Signed normal force magnitude
+		contact->Fn = b3Dot(force, contact->n);
 
-	// Limit number of iterations to prevent cycling.
-	const u32 kMaxIters = 200;
-	
-	// Main iteration loop.
-	u32 iter = 0;
-	while (iter < kMaxIters && epsNew > kTol * kTol * eps0)
-	{
-		// q = filter(A * c) 
-		// b3Mul(q, A, c, size);
-		b3Mul_A(q, c, size, m_allocator, m_inv_m, h, Jx, Jv, m_springs, m_springCount);
-		b3Filter(q, q, size, types, m_collisions);
-
-		// alpha = epsNew / dot(c, q)
-		float32 alpha = epsNew / b3Dot(c, q, size);
-
-		// x = x + alpha * c
-		for (u32 i = 0; i < m_massCount; ++i)
-		{
-			dv[i] = dv[i] + alpha * c[i];
-		}
-
-		// r = r - alpha * q 
-		for (u32 i = 0; i < m_massCount; ++i)
-		{
-			r[i] = r[i] - alpha * q[i];
-		}
-
-		// s = inv_P * r
-		for (u32 i = 0; i < m_massCount; ++i)
-		{
-			s[i][0] = inv_P[i][0] * r[i][0];
-			s[i][1] = inv_P[i][1] * r[i][1];
-			s[i][2] = inv_P[i][2] * r[i][2];
-		}
-
-		// epsOld = epsNew
-		float32 epsOld = epsNew;
-		
-		// epsNew = dot(r, s) 
-		epsNew = b3Dot(r, s, size);
-
-		// beta = epsNew / epsOld
-		float32 beta = epsNew / epsOld;
-
-		// c = filter(s + beta * c) 
-		for (u32 i = 0; i < m_massCount; ++i)
-		{
-			c[i] = s[i] + beta * c[i];
-		}
-		b3Filter(c, c, size, types, m_collisions);
-
-		++iter;
+		// Signed tangent forces magnitude
+		contact->Ft1 = b3Dot(force, contact->t1);
+		contact->Ft2 = b3Dot(force, contact->t2);
 	}
 
-	m_step.iterations = iter;
+	m_allocator->Free(forces);
 
-	// Update state
+	// Clear position correction
 	for (u32 i = 0; i < m_massCount; ++i)
 	{
-		m_v[i] += dv[i];
-		m_x[i] += h * m_v[i] + m_y[i];
+		m_y[i].SetZero();
 	}
 
 	// Clear forces
@@ -793,25 +360,6 @@ void b3SpringCloth::Step(float32 h)
 	{
 		m_f[i].SetZero();
 	}
-
-	// Clear position alteration
-	for (u32 i = 0; i < m_massCount; ++i)
-	{
-		m_y[i].SetZero();
-	}
-	
-	m_allocator->Free(inv_P);
-	m_allocator->Free(P);
-	m_allocator->Free(s);
-	m_allocator->Free(q);
-	m_allocator->Free(c);
-	m_allocator->Free(r);
-	m_allocator->Free(Adv);
-	m_allocator->Free(dv);
-	m_allocator->Free(z);
-	m_allocator->Free(b);
-	m_allocator->Free(Jv);
-	m_allocator->Free(Jx);
 }
 
 void b3SpringCloth::Apply() const
@@ -824,16 +372,25 @@ void b3SpringCloth::Apply() const
 
 void b3SpringCloth::Draw(b3Draw* draw) const
 {
-	for (u32 i = 0; i < m_sphereCount; ++i)
-	{
-		draw->DrawSolidSphere(m_spheres[i].vertex, m_spheres[i].radius, b3Color_white);
-	}
-
 	const b3Mesh* m = m_mesh;
 
 	for (u32 i = 0; i < m->vertexCount; ++i)
 	{
-		draw->DrawPoint(m_x[i], 2.0f, b3Color_green);
+		if (m_contacts[i].lockOnSurface)
+		{
+			if (m_contacts[i].Fn > -B3_FORCE_THRESHOLD)
+			{
+				draw->DrawPoint(m_x[i], 6.0f, b3Color_yellow);
+			}
+			else
+			{
+				draw->DrawPoint(m_x[i], 6.0f, b3Color_red);
+			}
+		}
+		else
+		{
+			draw->DrawPoint(m_x[i], 6.0f, b3Color_green);
+		}
 	}
 
 	for (u32 i = 0; i < m->triangleCount; ++i)
diff --git a/src/bounce/dynamics/cloth/spring_solver.cpp b/src/bounce/dynamics/cloth/spring_solver.cpp
new file mode 100644
index 0000000..2ca00f6
--- /dev/null
+++ b/src/bounce/dynamics/cloth/spring_solver.cpp
@@ -0,0 +1,722 @@
+/*
+* Copyright (c) 2016-2016 Irlan Robson http://www.irlan.net
+*
+* This software is provided 'as-is', without any express or implied
+* warranty.  In no event will the authors be held liable for any damages
+* arising from the use of this software.
+* Permission is granted to anyone to use this software for any purpose,
+* including commercial applications, and to alter it and redistribute it
+* freely, subject to the following restrictions:
+* 1. The origin of this software must not be misrepresented; you must not
+* claim that you wrote the original software. If you use this software
+* in a product, an acknowledgment in the product documentation would be
+* appreciated but is not required.
+* 2. Altered source versions must be plainly marked as such, and must not be
+* misrepresented as being the original software.
+* 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <bounce/dynamics/cloth/spring_solver.h>
+#include <bounce/dynamics/cloth/spring_cloth.h>
+#include <bounce/common/memory/stack_allocator.h>
+
+// Here, we solve Ax = b using the Modified Conjugate Gradient method.
+// This work is based on the paper "Large Steps in Cloth Simulation - David Baraff, Andrew Witkin".
+
+// Enable preconditioning. It can be slow, depending on 
+// how the preconditioning matrix is computed, but it can help
+// to increase convergence.
+bool b3_enablePrecontitioning = false;
+
+b3SpringSolver::b3SpringSolver(const b3SpringSolverDef& def)
+{
+	m_cloth = def.cloth;
+	m_h = def.dt;
+	m_iterations = 0;
+	m_Jx = nullptr;
+	m_Jv = nullptr;
+
+	m_allocator = m_cloth->m_allocator;
+
+	m_x = m_cloth->m_x;
+	m_v = m_cloth->m_v;
+	m_f = m_cloth->m_f;
+	m_inv_m = m_cloth->m_inv_m;
+	m_y = m_cloth->m_y;
+	m_types = m_cloth->m_types;
+	m_massCount = m_cloth->m_massCount;
+
+	m_contacts = m_cloth->m_contacts;
+
+	m_springs = m_cloth->m_springs;
+	m_springCount = m_cloth->m_springCount;
+}
+
+b3SpringSolver::~b3SpringSolver()
+{
+
+}
+
+void b3SpringSolver::Solve(b3Vec3* f)
+{
+	u32 size = m_massCount;
+	b3MassType* types = m_types;
+	u32 spring_size = m_springCount;
+
+	m_Jx = (b3Mat33*)m_allocator->Allocate(spring_size * sizeof(b3Mat33));
+	m_Jv = (b3Mat33*)m_allocator->Allocate(spring_size * sizeof(b3Mat33));
+
+	// Compute and apply spring forces, store their unique derivatives.
+	InitializeSpringForces();
+
+	// Integrate
+
+	// Solve Ax = b, where
+	// A = M - h * dfdv - h * h * dfdx
+	// b = h * (f0 + h * dfdx * v0 + dfdx * y) 
+	
+	//
+	b3Vec3* b = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+	Compute_b(b);
+	
+	//
+	b3Vec3* x = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+
+	// Solve Ax = b
+	if (b3_enablePrecontitioning)
+	{
+		Solve_MPCG(x, f, m_iterations, b);
+	}
+	else
+	{
+		Solve_MCG(x, f, m_iterations, b);
+	}
+
+	// Update state
+	for (u32 i = 0; i < m_massCount; ++i)
+	{
+		m_v[i] += x[i];
+
+		// dx = h * (v0 + dv) + y = h * v1 + y
+		m_x[i] += m_h * m_v[i] + m_y[i];
+	}
+
+	m_allocator->Free(x);
+	m_allocator->Free(b);
+
+	m_allocator->Free(m_Jv);
+	m_allocator->Free(m_Jx);
+	m_Jv = nullptr;
+	m_Jx = nullptr;
+}
+
+// This outputs the desired acceleration of the masses in the constrained 
+// directions.
+static B3_FORCE_INLINE void b3Compute_z(b3Vec3* out, 
+	u32 size, const b3MassType* types, const b3MassContact* contacts)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		switch (types[i])
+		{
+		case e_staticMass:
+		{
+			out[i].SetZero();
+			break;
+		}
+		case e_dynamicMass:
+		{
+			if (contacts[i].lockOnSurface)
+			{
+				out[i].SetZero();
+				break;
+			}
+
+			out[i].SetZero();
+			break;
+		}
+		default:
+		{
+			B3_ASSERT(false);
+			break;
+		}
+		}
+	}
+}
+
+static B3_FORCE_INLINE void b3Filter(b3Vec3* out, 
+	const b3Vec3* v, u32 size, const b3MassType* types, const b3MassContact* contacts)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		switch (types[i])
+		{
+		case e_staticMass:
+		{
+			out[i].SetZero();
+			break;
+		}
+		case e_dynamicMass:
+		{
+			if (contacts[i].lockOnSurface)
+			{				
+				b3Vec3 n = contacts[i].n;
+
+				b3Mat33 S = b3Mat33_identity - b3Outer(n, n);
+
+				out[i] = S * v[i];
+				break;
+			}
+
+			out[i] = v[i];
+			break;
+		}
+		default:
+		{
+			B3_ASSERT(false);
+			break;
+		}
+		}
+	}
+}
+
+static B3_FORCE_INLINE void b3SetZero(b3Vec3* out, u32 size)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		out[i].SetZero();
+	}
+}
+
+static B3_FORCE_INLINE void b3Copy(b3Vec3* out, const b3Vec3* v, u32 size)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		out[i] = v[i];
+	}
+}
+
+static B3_FORCE_INLINE void b3Add(b3Vec3* out, const b3Vec3* a, const b3Vec3* b, u32 size)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		out[i] = a[i] + b[i];
+	}
+}
+
+static B3_FORCE_INLINE void b3Sub(b3Vec3* out, const b3Vec3* a, const b3Vec3* b, u32 size)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		out[i] = a[i] - b[i];
+	}
+}
+
+static B3_FORCE_INLINE float32 b3Dot(const b3Vec3* a, const b3Vec3* b, u32 size)
+{
+	float32 result = 0.0f;
+	for (u32 i = 0; i < size; ++i)
+	{
+		result += b3Dot(a[i], b[i]);
+	}
+	return result;
+}
+
+#define B3_INDEX(i, j, size) (i + j * size)
+
+static B3_FORCE_INLINE void b3SetZero(b3Mat33* out, u32 size)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		for (u32 j = 0; j < size; ++j)
+		{
+			out[B3_INDEX(i, j, size)].SetZero();
+		}
+	}
+}
+
+static B3_FORCE_INLINE void b3Mul(b3Vec3* out, const b3Mat33* M, const b3Vec3* v, u32 size)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		out[i].SetZero();
+
+		for (u32 j = 0; j < size; ++j)
+		{
+			out[i] += M[B3_INDEX(i, j, size)] * v[j];
+		}
+	}
+}
+
+// J = dfdx or dvdx
+static B3_FORCE_INLINE void b3Mul_Jacobian(b3Vec3* out, const b3Vec3* v, u32 mass_size,
+	const b3Mat33* J_ii, const b3Spring* springs, u32 spring_size)
+{
+	b3SetZero(out, mass_size);
+
+	for (u32 i = 0; i < spring_size; ++i)
+	{
+		const b3Spring* S = springs + i;
+		u32 i1 = S->i1;
+		u32 i2 = S->i2;
+
+		b3Mat33 J_11 = J_ii[i];
+		b3Mat33 J_12 = -J_11;
+		b3Mat33 J_21 = J_12;
+		b3Mat33 J_22 = J_11;
+
+		out[i1] += J_11 * v[i1] + J_12 * v[i2];
+		out[i2] += J_21 * v[i1] + J_22 * v[i2];
+	}
+}
+
+static B3_FORCE_INLINE void b3SetZero_Jacobian(b3Mat33* out, u32 size)
+{
+	for (u32 i = 0; i < size; ++i)
+	{
+		out[i].SetZero();
+	}
+}
+
+// A = M - h * dfdv - h * h * dfdx
+// A * v = (M - h * dfdv - h * h * dfdx) * v = M * v + (-h * dfdv * v) + (-h * h * dfdx * v) 
+static B3_FORCE_INLINE void b3Mul_A(b3Vec3* out, const b3Vec3* v, u32 mass_size,
+	b3StackAllocator* allocator,
+	const float32* inv_m, float32 h, const b3Mat33* Jx, const b3Mat33* Jv, const b3Spring* springs, u32 spring_size)
+{
+	// v1 = M * v
+	b3Vec3* v1 = (b3Vec3*)allocator->Allocate(mass_size * sizeof(b3Vec3));
+	for (u32 i = 0; i < mass_size; ++i)
+	{
+		float32 m = inv_m[i] != 0.0f ? 1.0f / inv_m[i] : 0.0f;
+		v1[i] = m * v[i];
+	}
+
+	// v2 = (-h * dfdv * v)
+	b3Vec3* v2 = (b3Vec3*)allocator->Allocate(mass_size * sizeof(b3Vec3));
+	b3Mul_Jacobian(v2, v, mass_size, Jv, springs, spring_size);
+	for (u32 i = 0; i < mass_size; ++i)
+	{
+		v2[i] *= -h;
+	}
+
+	// v3 = (-h * h * dfdx * v)
+	b3Vec3* v3 = (b3Vec3*)allocator->Allocate(mass_size * sizeof(b3Vec3));
+	b3Mul_Jacobian(v3, v, mass_size, Jx, springs, spring_size);
+	for (u32 i = 0; i < mass_size; ++i)
+	{
+		v3[i] *= -h * h;
+	}
+
+	// v = v1 + v2 + v3
+	for (u32 i = 0; i < mass_size; ++i)
+	{
+		out[i] = v1[i] + v2[i] + v3[i];
+	}
+
+	allocator->Free(v3);
+	allocator->Free(v2);
+	allocator->Free(v1);
+}
+
+void b3SpringSolver::InitializeSpringForces()
+{
+	u32 size = m_massCount;
+	u32 spring_size = m_springCount;
+
+	// Zero Jacobians
+	b3SetZero_Jacobian(m_Jx, spring_size);
+	b3SetZero_Jacobian(m_Jv, spring_size);
+
+	// Compute forces and Jacobians
+	for (u32 i = 0; i < spring_size; ++i)
+	{
+		b3Spring* S = m_springs + i;
+
+		b3Vec3 x1 = m_x[S->i1];
+		b3Vec3 v1 = m_v[S->i1];
+
+		b3Vec3 x2 = m_x[S->i2];
+		b3Vec3 v2 = m_v[S->i2];
+
+		// Strech
+		b3Vec3 dx = x2 - x1;
+		float32 L = b3Length(dx);
+		b3Vec3 n = dx;
+		if (L > 0.0f)
+		{
+			n /= L;
+		}
+
+		float32 C = L - S->L0;
+
+		// Compute streching forces
+		b3Vec3 sf1 = -S->ks * C * -n;
+		b3Vec3 sf2 = -sf1;
+
+		m_f[S->i1] += sf1;
+		m_f[S->i2] += sf2;
+
+		// Compute damping forces
+		b3Vec3 dv = v2 - v1;
+
+		b3Vec3 df1 = -S->kd * -dv;
+		b3Vec3 df2 = -df1;
+
+		m_f[S->i1] += df1;
+		m_f[S->i2] += df2;
+
+		b3Mat33 I = b3Mat33_identity;
+
+		// Compute Jx11	
+		float32 inv_L = L > 0.0f ? 1.0f / L : 0.0f;
+
+		float32 L2 = L * L;
+		float32 inv_L2 = L2 > 0.0f ? 1.0f / L2 : 0.0f;
+
+		// Hessian
+		// del^2_C / del_x
+		b3Mat33 H_11 = inv_L * I + inv_L2 * b3Outer(dx, -n);
+
+		// del_C / del_x * del_C / del_x^T
+		b3Mat33 JJ_11 = b3Outer(-n, -n);
+
+		b3Mat33 Jx11 = -S->ks * (C * H_11 + JJ_11);
+		m_Jx[i] = Jx11;
+
+		// Compute Jv11
+		b3Mat33 Jv11 = -S->kd * I;
+		m_Jv[i] = Jv11;
+	}
+}
+
+void b3SpringSolver::Compute_b(b3Vec3* b) const
+{
+	float32 h = m_h;
+	u32 size = m_massCount;
+
+	// Jx_v = dfdx * v
+	b3Vec3* Jx_v = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+	b3Mul_Jacobian(Jx_v, m_v, size, m_Jx, m_springs, m_springCount);
+
+	// Jx_y = dfdx * y
+	b3Vec3* Jx_y = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+	b3Mul_Jacobian(Jx_y, m_y, size, m_Jx, m_springs, m_springCount);
+
+	// b = h * (f0 + h * Jx_v + Jx_y )
+	for (u32 i = 0; i < size; ++i)
+	{
+		b[i] = h * (m_f[i] + h * Jx_v[i] + Jx_y[i]);
+	}
+
+	m_allocator->Free(Jx_y);
+	m_allocator->Free(Jx_v);
+}
+
+void b3SpringSolver::Solve_MCG(b3Vec3* dv, b3Vec3* e, u32& iterations, const b3Vec3* b) const
+{
+	// dv = z
+	b3Compute_z(dv, m_massCount, m_types, m_contacts);
+
+	// r = filter(b - Adv)
+	b3Vec3* r = (b3Vec3*)m_allocator->Allocate(m_massCount * sizeof(b3Vec3));
+
+	// Adv = A * dv 
+	b3Vec3* Adv = (b3Vec3*)m_allocator->Allocate(m_massCount * sizeof(b3Vec3));
+
+	b3Mul_A(Adv, dv, m_massCount, m_allocator, m_inv_m, m_h, m_Jx, m_Jv, m_springs, m_springCount);
+	b3Sub(r, b, Adv, m_massCount);
+	b3Filter(r, r, m_massCount, m_types, m_contacts);
+
+	m_allocator->Free(Adv);
+
+	// c = r
+	b3Vec3* c = (b3Vec3*)m_allocator->Allocate(m_massCount * sizeof(b3Vec3));
+	b3Copy(c, r, m_massCount);
+
+	// eps0 = dot(f, f)
+	float32 eps0 = b3Dot(r, r, m_massCount);
+
+	// epsNew = dot(r, r)
+	float32 epsNew = eps0;
+
+	// [0, 1]
+	const float32 kTol = 0.25f;
+
+	// Limit number of iterations to prevent cycling.
+	const u32 kMaxIters = 100;
+
+	// Main iteration loop.
+	u32 iter = 0;
+	while (iter < kMaxIters && epsNew > kTol * kTol * eps0)
+	{
+		// q = filter(A * c) 
+		b3Vec3* q = (b3Vec3*)m_allocator->Allocate(m_massCount * sizeof(b3Vec3));
+		b3Mul_A(q, c, m_massCount, m_allocator, m_inv_m, m_h, m_Jx, m_Jv, m_springs, m_springCount);
+		b3Filter(q, q, m_massCount, m_types, m_contacts);
+
+		// alpha = epsNew / dot(c, q)
+		float32 alpha_den = b3Dot(c, q, m_massCount);
+		float32 alpha = epsNew / alpha_den;
+
+		// dv = dv + alpha * c
+		for (u32 i = 0; i < m_massCount; ++i)
+		{
+			dv[i] = dv[i] + alpha * c[i];
+		}
+
+		// r = r - alpha * q 
+		for (u32 i = 0; i < m_massCount; ++i)
+		{
+			r[i] = r[i] - alpha * q[i];
+		}
+
+		m_allocator->Free(q);
+
+		// epsOld = epsNew
+		float32 epsOld = epsNew;
+
+		// epsNew = dot(r, r) 
+		epsNew = b3Dot(r, r, m_massCount);
+
+		float32 beta = epsNew / epsOld;
+
+		// c = filter(r + beta * c) 
+		for (u32 i = 0; i < m_massCount; ++i)
+		{
+			c[i] = r[i] + beta * c[i];
+		}
+		b3Filter(c, c, m_massCount, m_types, m_contacts);
+
+		++iter;
+	}
+
+	m_allocator->Free(c);
+	m_allocator->Free(r);
+
+	iterations = iter;
+
+	// f = A * dv - b
+	b3Mul_A(e, dv, m_massCount, m_allocator, m_inv_m, m_h, m_Jx, m_Jv, m_springs, m_springCount);
+	b3Sub(e, e, b, m_massCount);
+}
+
+static void B3_FORCE_INLINE b3Make_A(b3Mat33* A,
+	const b3Mat33* Jx, const b3Mat33* Jv, u32 size,
+	b3StackAllocator* allocator, float32 h, float32* inv_m,
+	const b3Spring* springs, u32 spring_size)
+{
+	// A = M - h * dfdv - h * h * dfdx
+
+	// A = 0
+	b3SetZero(A, size);
+
+	// Compute dfdx, dfdv
+	b3Mat33* dfdx = (b3Mat33*)allocator->Allocate(size * size * sizeof(b3Mat33));
+	b3SetZero(dfdx, size);
+
+	b3Mat33* dfdv = (b3Mat33*)allocator->Allocate(size * size * sizeof(b3Mat33));
+	b3SetZero(dfdv, size);
+
+	for (u32 i = 0; i < spring_size; ++i)
+	{
+		const b3Spring* S = springs + i;
+
+		b3Mat33 Jx11 = Jx[i];
+		b3Mat33 Jx12 = -Jx11;
+		b3Mat33 Jx21 = Jx12;
+		b3Mat33 Jx22 = Jx11;
+
+		dfdx[B3_INDEX(S->i1, S->i1, size)] += Jx11;
+		dfdx[B3_INDEX(S->i1, S->i2, size)] += Jx12;
+		dfdx[B3_INDEX(S->i2, S->i1, size)] += Jx21;
+		dfdx[B3_INDEX(S->i2, S->i2, size)] += Jx22;
+
+		b3Mat33 Jv11 = Jv[i];
+		b3Mat33 Jv12 = -Jv11;
+		b3Mat33 Jv21 = Jv12;
+		b3Mat33 Jv22 = Jv11;
+
+		dfdv[B3_INDEX(S->i1, S->i1, size)] += Jv11;
+		dfdv[B3_INDEX(S->i1, S->i2, size)] += Jv12;
+		dfdv[B3_INDEX(S->i2, S->i1, size)] += Jv21;
+		dfdv[B3_INDEX(S->i2, S->i2, size)] += Jv22;
+	}
+
+	// A += M
+	for (u32 i = 0; i < size; ++i)
+	{
+		B3_ASSERT(inv_m[i] != 0.0f);
+
+		float32 m = 1.0f / inv_m[i];
+
+		A[B3_INDEX(i, i, size)] += b3Diagonal(m);
+	}
+
+	// A += - h * dfdv - h * h * dfdx
+	for (u32 i = 0; i < size; ++i)
+	{
+		for (u32 j = 0; j < size; ++j)
+		{
+			A[B3_INDEX(i, j, size)] += (-h * dfdv[B3_INDEX(i, j, size)]) + (-h * h * dfdx[B3_INDEX(i, j, size)]);
+		}
+	}
+
+	allocator->Free(dfdv);
+	allocator->Free(dfdx);
+}
+
+void b3SpringSolver::Solve_MPCG(b3Vec3* dv, b3Vec3* e, u32& iterations, const b3Vec3* b) const
+{
+	u32 size = m_massCount;
+	b3MassType* types = m_types;
+	u32 spring_size = m_springCount;
+
+	b3Vec3* r = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+
+	b3Vec3* c = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+
+	b3Vec3* s = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+
+	b3Vec3* inv_P = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+
+	// dv = z
+	b3Compute_z(dv, size, types, m_contacts);
+
+	// P = diag(A)^-1
+	b3Vec3* P = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+
+	// A = M - h * dfdv - h * h * dfdx
+	b3Mat33* A = (b3Mat33*)m_allocator->Allocate(size * size * sizeof(b3Mat33));
+	b3Make_A(A, m_Jx, m_Jv, size, m_allocator, m_h, m_inv_m, m_springs, m_springCount);
+
+	// Compute P, P^-1
+	// @todo Optimize so we don't need to compute A.
+	for (u32 i = 0; i < size; ++i)
+	{
+		b3Mat33 D = A[B3_INDEX(i, i, size)];
+
+		B3_ASSERT(D[0][0] != 0.0f);
+		B3_ASSERT(D[1][1] != 0.0f);
+		B3_ASSERT(D[2][2] != 0.0f);
+
+		P[i] = b3Vec3(1.0f / D[0][0], 1.0f / D[1][1], 1.0f / D[2][2]);
+		inv_P[i] = b3Vec3(D[0][0], D[1][1], D[2][2]);
+	}
+
+	m_allocator->Free(A);
+
+	// eps0 = dot( filter(b), P * filter(b) )
+	b3Vec3* filter_b = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+	b3Filter(filter_b, b, size, types, m_contacts);
+
+	b3Vec3* P_filter_b = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+	for (u32 i = 0; i < size; ++i)
+	{
+		P_filter_b[i][0] = P[i][0] * filter_b[i][0];
+		P_filter_b[i][1] = P[i][1] * filter_b[i][1];
+		P_filter_b[i][2] = P[i][2] * filter_b[i][2];
+	}
+
+	float32 eps0 = b3Dot(filter_b, P_filter_b, size);
+
+	m_allocator->Free(P_filter_b);
+	m_allocator->Free(filter_b);
+	m_allocator->Free(P);
+
+	// r = filter(b - Adv)
+
+	// Adv = A * dv 
+	b3Vec3* Adv = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+	b3Mul_A(Adv, dv, size, m_allocator, m_inv_m, m_h, m_Jx, m_Jv, m_springs, m_springCount);
+
+	b3Sub(r, b, Adv, size);
+
+	m_allocator->Free(Adv);
+
+	b3Filter(r, r, size, types, m_contacts);
+
+	// c = filter(P^-1 * r)
+	for (u32 i = 0; i < m_massCount; ++i)
+	{
+		c[i][0] = inv_P[i][0] * r[i][0];
+		c[i][1] = inv_P[i][1] * r[i][1];
+		c[i][2] = inv_P[i][2] * r[i][2];
+	}
+	b3Filter(c, c, size, types, m_contacts);
+
+	// epsNew = dot(r, c)
+	float32 epsNew = b3Dot(r, c, size);
+
+	// [0, 1]
+	const float32 kTol = 0.25f;
+
+	// Limit number of iterations to prevent cycling.
+	const u32 kMaxIters = 100;
+
+	// Main iteration loop.
+	u32 iter = 0;
+	while (iter < kMaxIters && epsNew > kTol * kTol * eps0)
+	{
+		// q = filter(A * c) 
+		b3Vec3* q = (b3Vec3*)m_allocator->Allocate(size * sizeof(b3Vec3));
+
+		b3Mul_A(q, c, size, m_allocator, m_inv_m, m_h, m_Jx, m_Jv, m_springs, m_springCount);
+		b3Filter(q, q, size, types, m_contacts);
+
+		// alpha = epsNew / dot(c, q)
+		float32 alpha = epsNew / b3Dot(c, q, size);
+
+		// x = x + alpha * c
+		for (u32 i = 0; i < m_massCount; ++i)
+		{
+			dv[i] = dv[i] + alpha * c[i];
+		}
+
+		// r = r - alpha * q 
+		for (u32 i = 0; i < m_massCount; ++i)
+		{
+			r[i] = r[i] - alpha * q[i];
+		}
+
+		m_allocator->Free(q);
+
+		// s = inv_P * r
+		for (u32 i = 0; i < m_massCount; ++i)
+		{
+			s[i][0] = inv_P[i][0] * r[i][0];
+			s[i][1] = inv_P[i][1] * r[i][1];
+			s[i][2] = inv_P[i][2] * r[i][2];
+		}
+
+		// epsOld = epsNew
+		float32 epsOld = epsNew;
+
+		// epsNew = dot(r, s) 
+		epsNew = b3Dot(r, s, size);
+
+		// beta = epsNew / epsOld
+		float32 beta = epsNew / epsOld;
+
+		// c = filter(s + beta * c) 
+		for (u32 i = 0; i < m_massCount; ++i)
+		{
+			c[i] = s[i] + beta * c[i];
+		}
+		b3Filter(c, c, size, types, m_contacts);
+
+		++iter;
+	}
+
+	m_allocator->Free(inv_P);
+	m_allocator->Free(s);
+	m_allocator->Free(c);
+	m_allocator->Free(r);
+
+	iterations = iter;
+
+	// Residual error
+	// f = A * x - b
+	b3Mul_A(e, dv, size, m_allocator, m_inv_m, m_h, m_Jx, m_Jv, m_springs, spring_size);
+	b3Sub(e, e, b, size);
+}
\ No newline at end of file