From 76128c4357bc36acd54575153516c6d337fe4263 Mon Sep 17 00:00:00 2001
From: Graham Linden <graham@lindenlab.com>
Date: Mon, 5 Aug 2019 12:04:29 -0700
Subject: SL-10566 Use vector for some high-traffic, low-item count containers
 instead of list.

Provide a method of storing joint indices separately from weight data for faster runtime processing.
---
 indra/llmath/llvolume.cpp | 54 +++++++++++++++++++++++++++++++++++++++++------
 indra/llmath/llvolume.h   |  2 ++
 2 files changed, 49 insertions(+), 7 deletions(-)

(limited to 'indra/llmath')

diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index e32625796c..9d0cf1e119 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -2526,6 +2526,7 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 			if (mdl[i].has("Weights"))
 			{
 				face.allocateWeights(num_verts);
+                face.allocateJointIndices(num_verts);
 
 				LLSD::Binary weights = mdl[i]["Weights"];
 
@@ -2566,6 +2567,13 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
                     {
                         wght = LLVector4(0.999f,0.f,0.f,0.f);
                     }
+                    if (face.mJointIndices)
+                    {
+                        for (U32 k=0; k<4; k++)
+                        {
+                            face.mJointIndices[cur_vertex * 4 + k] = llclamp((U8)joints[k], (U8)0, (U8)110);
+                        }
+                    }
                     for (U32 k=0; k<4; k++)
                     {
                         F32 f_combined = (F32) joints[k] + wght[k];
@@ -4656,6 +4664,7 @@ LLVolumeFace::LLVolumeFace() :
 	mTexCoords(NULL),
 	mIndices(NULL),
 	mWeights(NULL),
+    mJointIndices(NULL),
     mWeightsScrubbed(FALSE),
 	mOctree(NULL),
 	mOptimized(FALSE)
@@ -4682,6 +4691,7 @@ LLVolumeFace::LLVolumeFace(const LLVolumeFace& src)
 	mTexCoords(NULL),
 	mIndices(NULL),
 	mWeights(NULL),
+    mJointIndices(NULL),
     mWeightsScrubbed(FALSE),
 	mOctree(NULL)
 { 
@@ -4746,15 +4756,29 @@ LLVolumeFace& LLVolumeFace::operator=(const LLVolumeFace& src)
 
 		if (src.mWeights)
 		{
+            llassert(!mWeights); // don't orphan an old alloc here accidentally
 			allocateWeights(src.mNumVertices);
-			LLVector4a::memcpyNonAliased16((F32*) mWeights, (F32*) src.mWeights, vert_size);
+			LLVector4a::memcpyNonAliased16((F32*) mWeights, (F32*) src.mWeights, vert_size);            
+            mWeightsScrubbed = src.mWeightsScrubbed;
 		}
 		else
 		{
-			ll_aligned_free_16(mWeights);
-			mWeights = NULL;
-		}
-        mWeightsScrubbed = src.mWeightsScrubbed;
+			ll_aligned_free_16(mWeights);            
+			mWeights = NULL;            
+            mWeightsScrubbed = FALSE;
+		}   
+
+        if (src.mJointIndices)
+        {
+            llassert(!mJointIndices); // don't orphan an old alloc here accidentally
+            allocateJointIndices(src.mNumVertices);
+            LLVector4a::memcpyNonAliased16((F32*) mJointIndices, (F32*) src.mJointIndices, src.mNumVertices * sizeof(U8) * 4);
+        }
+        else
+        {
+            ll_aligned_free_16(mJointIndices);
+            mJointIndices = NULL;
+        }     
 	}
 
 	if (mNumIndices)
@@ -4763,7 +4787,12 @@ LLVolumeFace& LLVolumeFace::operator=(const LLVolumeFace& src)
 		
 		LLVector4a::memcpyNonAliased16((F32*) mIndices, (F32*) src.mIndices, idx_size);
 	}
-	
+	else
+    {
+        ll_aligned_free_16(mIndices);
+        mIndices = NULL;
+    }
+
 	mOptimized = src.mOptimized;
 
 	//delete 
@@ -4794,6 +4823,8 @@ void LLVolumeFace::freeData()
 	mTangents = NULL;
 	ll_aligned_free_16(mWeights);
 	mWeights = NULL;
+    ll_aligned_free_16(mJointIndices);
+	mJointIndices = NULL;
 
 	delete mOctree;
 	mOctree = NULL;
@@ -5448,11 +5479,13 @@ bool LLVolumeFace::cacheOptimize()
 	// DO NOT free mNormals and mTexCoords as they are part of mPositions buffer
 	ll_aligned_free_16(mWeights);
 	ll_aligned_free_16(mTangents);
+    ll_aligned_free_16(mJointIndices);
 
 	mPositions = pos;
 	mNormals = norm;
 	mTexCoords = tc;
 	mWeights = wght;
+    mJointIndices = NULL; // filled in later as necessary by skinning code for acceleration
 	mTangents = binorm;
 
 	//std::string result = llformat("ACMR pre/post: %.3f/%.3f  --  %d triangles %d breaks", pre_acmr, post_acmr, mNumIndices/3, breaks);
@@ -6362,7 +6395,14 @@ void LLVolumeFace::allocateTangents(S32 num_verts)
 void LLVolumeFace::allocateWeights(S32 num_verts)
 {
 	ll_aligned_free_16(mWeights);
-	mWeights = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
+	mWeights = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
+    
+}
+
+void LLVolumeFace::allocateJointIndices(S32 num_verts)
+{
+    ll_aligned_free_16(mJointIndices);
+    mJointIndices = (U8*)ll_aligned_malloc_16(sizeof(U8) * 4 * num_verts);    
 }
 
 void LLVolumeFace::resizeIndices(S32 num_indices)
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index 1d6d35c432..ed2cd9cde0 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -875,6 +875,7 @@ public:
 	void resizeVertices(S32 num_verts);
 	void allocateTangents(S32 num_verts);
 	void allocateWeights(S32 num_verts);
+    void allocateJointIndices(S32 num_verts);
 	void resizeIndices(S32 num_indices);
 	void fillFromLegacyData(std::vector<LLVolumeFace::VertexData>& v, std::vector<U16>& idx);
 
@@ -955,6 +956,7 @@ public:
 	// format is mWeights[vertex_index].mV[influence] = <joint_index>.<weight>
 	// mWeights.size() should be empty or match mVertices.size()  
 	LLVector4a* mWeights;
+    U8* mJointIndices;
 
     mutable BOOL mWeightsScrubbed;
 
-- 
cgit v1.3


From 71af0a2a9e9f90d1e336f8a30f642bb5e19ef658 Mon Sep 17 00:00:00 2001
From: Graham Linden <graham@lindenlab.com>
Date: Tue, 6 Aug 2019 14:41:55 -0700
Subject: Fix shutdown crash in teardown of joint hierarchy.

Add ifdef'd code (USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS) for a potential skinning speed-up that avoids lots of int<->float conversions (expensive, and the results are static, so caching them is a minimal space investment),
as updating rigged VBs shows up as a profiling bottleneck for Low rendering (where we actually use CPU skinning).
---
 indra/llcharacter/lljoint.cpp      |  17 ++---
 indra/llmath/llvolume.cpp          |  39 +++++++----
 indra/llmath/llvolume.h            |   4 ++
 indra/newview/lldrawpoolavatar.cpp | 133 +++++++++++++++++++++++++++++--------
 indra/newview/lldrawpoolavatar.h   |   2 +-
 indra/newview/llskinningutil.cpp   |  71 ++++++++++++++++----
 indra/newview/llskinningutil.h     |  29 +++++++-
 indra/newview/llvovolume.cpp       |  48 ++++++++++---
 8 files changed, 270 insertions(+), 73 deletions(-)

(limited to 'indra/llmath')

diff --git a/indra/llcharacter/lljoint.cpp b/indra/llcharacter/lljoint.cpp
index 36ecf8cb4b..a685df5925 100644
--- a/indra/llcharacter/lljoint.cpp
+++ b/indra/llcharacter/lljoint.cpp
@@ -303,16 +303,17 @@ void LLJoint::removeChild(LLJoint* joint)
 //--------------------------------------------------------------------
 void LLJoint::removeAllChildren()
 {
-	for (joints_t::iterator iter = mChildren.begin();
-		 iter != mChildren.end();)
+	for (LLJoint* joint : mChildren)
 	{
-		joints_t::iterator curiter = iter++;
-		LLJoint* joint = *curiter;
-		mChildren.erase(curiter);
-		joint->mXform.setParent(NULL);
-		joint->mParent = NULL;
-		joint->touch();
+		if (joint)
+        {
+		    joint->mXform.setParent(NULL);
+		    joint->mParent = NULL;
+		    joint->touch();
+            //delete joint;
+        }
 	}
+    mChildren.clear();
 }
 
 
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index 9d0cf1e119..df867b332d 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -2526,7 +2526,6 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 			if (mdl[i].has("Weights"))
 			{
 				face.allocateWeights(num_verts);
-                face.allocateJointIndices(num_verts);
 
 				LLSD::Binary weights = mdl[i]["Weights"];
 
@@ -2567,13 +2566,6 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
                     {
                         wght = LLVector4(0.999f,0.f,0.f,0.f);
                     }
-                    if (face.mJointIndices)
-                    {
-                        for (U32 k=0; k<4; k++)
-                        {
-                            face.mJointIndices[cur_vertex * 4 + k] = llclamp((U8)joints[k], (U8)0, (U8)110);
-                        }
-                    }
                     for (U32 k=0; k<4; k++)
                     {
                         F32 f_combined = (F32) joints[k] + wght[k];
@@ -4664,7 +4656,10 @@ LLVolumeFace::LLVolumeFace() :
 	mTexCoords(NULL),
 	mIndices(NULL),
 	mWeights(NULL),
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
+    mJustWeights(NULL),
     mJointIndices(NULL),
+#endif
     mWeightsScrubbed(FALSE),
 	mOctree(NULL),
 	mOptimized(FALSE)
@@ -4691,7 +4686,10 @@ LLVolumeFace::LLVolumeFace(const LLVolumeFace& src)
 	mTexCoords(NULL),
 	mIndices(NULL),
 	mWeights(NULL),
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
+    mJustWeights(NULL),
     mJointIndices(NULL),
+#endif
     mWeightsScrubbed(FALSE),
 	mOctree(NULL)
 { 
@@ -4768,19 +4766,22 @@ LLVolumeFace& LLVolumeFace::operator=(const LLVolumeFace& src)
             mWeightsScrubbed = FALSE;
 		}   
 
+    #if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
         if (src.mJointIndices)
         {
             llassert(!mJointIndices); // don't orphan an old alloc here accidentally
             allocateJointIndices(src.mNumVertices);
             LLVector4a::memcpyNonAliased16((F32*) mJointIndices, (F32*) src.mJointIndices, src.mNumVertices * sizeof(U8) * 4);
         }
-        else
+        else*/
         {
             ll_aligned_free_16(mJointIndices);
             mJointIndices = NULL;
         }     
-	}
+    #endif
 
+	}
+    
 	if (mNumIndices)
 	{
 		S32 idx_size = (mNumIndices*sizeof(U16)+0xF) & ~0xF;
@@ -4823,8 +4824,13 @@ void LLVolumeFace::freeData()
 	mTangents = NULL;
 	ll_aligned_free_16(mWeights);
 	mWeights = NULL;
+
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
     ll_aligned_free_16(mJointIndices);
 	mJointIndices = NULL;
+    ll_aligned_free_16(mJustWeights);
+	mJustWeights = NULL;
+#endif
 
 	delete mOctree;
 	mOctree = NULL;
@@ -5479,13 +5485,17 @@ bool LLVolumeFace::cacheOptimize()
 	// DO NOT free mNormals and mTexCoords as they are part of mPositions buffer
 	ll_aligned_free_16(mWeights);
 	ll_aligned_free_16(mTangents);
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
     ll_aligned_free_16(mJointIndices);
+    ll_aligned_free_16(mJustWeights);
+    mJustWeights = NULL;
+    mJointIndices = NULL; // filled in later as necessary by skinning code for acceleration
+#endif
 
 	mPositions = pos;
 	mNormals = norm;
 	mTexCoords = tc;
-	mWeights = wght;
-    mJointIndices = NULL; // filled in later as necessary by skinning code for acceleration
+	mWeights = wght;    
 	mTangents = binorm;
 
 	//std::string result = llformat("ACMR pre/post: %.3f/%.3f  --  %d triangles %d breaks", pre_acmr, post_acmr, mNumIndices/3, breaks);
@@ -6401,8 +6411,13 @@ void LLVolumeFace::allocateWeights(S32 num_verts)
 
 void LLVolumeFace::allocateJointIndices(S32 num_verts)
 {
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
     ll_aligned_free_16(mJointIndices);
+    ll_aligned_free_16(mJustWeights);
+
     mJointIndices = (U8*)ll_aligned_malloc_16(sizeof(U8) * 4 * num_verts);    
+    mJustWeights = (LLVector4a*)ll_aligned_malloc_16(sizeof(LLVector4a) * num_verts);    
+#endif
 }
 
 void LLVolumeFace::resizeIndices(S32 num_indices)
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index ed2cd9cde0..a77e8c08c6 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -956,7 +956,11 @@ public:
 	// format is mWeights[vertex_index].mV[influence] = <joint_index>.<weight>
 	// mWeights.size() should be empty or match mVertices.size()  
 	LLVector4a* mWeights;
+
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
+    LLVector4a* mJustWeights;
     U8* mJointIndices;
+#endif
 
     mutable BOOL mWeightsScrubbed;
 
diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp
index 15a0595179..789a254389 100644
--- a/indra/newview/lldrawpoolavatar.cpp
+++ b/indra/newview/lldrawpoolavatar.cpp
@@ -38,6 +38,7 @@
 #include "lldrawable.h"
 #include "lldrawpoolbump.h"
 #include "llface.h"
+#include "llvolume.h"
 #include "llmeshrepository.h"
 #include "llsky.h"
 #include "llviewercamera.h"
@@ -1833,15 +1834,13 @@ void LLDrawPoolAvatar::updateRiggedFaceVertexBuffer(
     LLFace* face,
     const LLMeshSkinInfo* skin,
     LLVolume* volume,
-    const LLVolumeFace& vol_face)
+    LLVolumeFace& vol_face)
 {
 	LLVector4a* weights = vol_face.mWeights;
 	if (!weights)
 	{
 		return;
 	}
-    // FIXME ugly const cast
-    LLSkinningUtil::scrubInvalidJoints(avatar, const_cast<LLMeshSkinInfo*>(skin));
 
 	LLPointer<LLVertexBuffer> buffer = face->getVertexBuffer();
 	LLDrawable* drawable = face->getDrawable();
@@ -1851,6 +1850,48 @@ void LLDrawPoolAvatar::updateRiggedFaceVertexBuffer(
 		return;
 	}
 
+    const U32 max_joints = LLSkinningUtil::getMaxJointCount();
+
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
+    #define CONDITION_WEIGHT(f) ((U8)llclamp((S32)f, (S32)0, (S32)max_joints-1))
+    LLVector4a* just_weights = vol_face.mJustWeights;
+    // we need to calculate the separated indices and store just the matrix weights for this vol...
+    if (!vol_face.mJointIndices)
+    {
+        // not very consty after all...
+        vol_face.allocateJointIndices(vol_face.mNumVertices);
+        just_weights = vol_face.mJustWeights;
+
+        U8* joint_indices_cursor = vol_face.mJointIndices;
+        for (int i = 0; i < vol_face.mNumVertices; i++)
+        {
+            F32* w = weights[i].getF32ptr();
+            F32* w_ = just_weights[i].getF32ptr();
+
+            F32 w0 = floorf(w[0]);
+            F32 w1 = floorf(w[1]);
+            F32 w2 = floorf(w[2]);
+            F32 w3 = floorf(w[3]);
+
+            joint_indices_cursor[0] = CONDITION_WEIGHT(w0);
+            joint_indices_cursor[1] = CONDITION_WEIGHT(w1);
+            joint_indices_cursor[2] = CONDITION_WEIGHT(w2);
+            joint_indices_cursor[3] = CONDITION_WEIGHT(w3);
+
+            // remove joint portion of combined weight
+            w_[0] = w[0] - w0;
+            w_[1] = w[1] - w1;
+            w_[2] = w[2] - w2;
+            w_[3] = w[3] - w3;
+
+            joint_indices_cursor += 4;
+        }
+    }
+#endif
+
+    // FIXME ugly const cast
+    LLSkinningUtil::scrubInvalidJoints(avatar, const_cast<LLMeshSkinInfo*>(skin));
+
 	U32 data_mask = face->getRiggedVertexBufferDataMask();
 
     if (!vol_face.mWeightsScrubbed)
@@ -1927,29 +1968,67 @@ void LLDrawPoolAvatar::updateRiggedFaceVertexBuffer(
 		LLMatrix4a bind_shape_matrix;
 		bind_shape_matrix.loadu(skin->mBindShapeMatrix);
 
-        const U32 max_joints = LLSkinningUtil::getMaxJointCount();
-		for (U32 j = 0; j < buffer->getNumVerts(); ++j)
-		{
-			LLMatrix4a final_mat;
-            LLSkinningUtil::getPerVertexSkinMatrix(weights[j].getF32ptr(), mat, false, final_mat, max_joints);
-			
-			LLVector4a& v = vol_face.mPositions[j];
-
-			LLVector4a t;
-			LLVector4a dst;
-			bind_shape_matrix.affineTransform(v, t);
-			final_mat.affineTransform(t, dst);
-			pos[j] = dst;
-
-			if (norm)
-			{
-				LLVector4a& n = vol_face.mNormals[j];
-				bind_shape_matrix.rotate(n, t);
-				final_mat.rotate(t, dst);
-				dst.normalize3fast();
-				norm[j] = dst;
-			}
-		}
+#if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
+        U8* joint_indices_cursor = vol_face.mJointIndices;
+        // fast path with joint indices separate from weights
+        if (joint_indices_cursor)
+        {
+            LLMatrix4a src[4];
+		    for (U32 j = 0; j < buffer->getNumVerts(); ++j)
+		    {
+			    LLMatrix4a final_mat;
+                //LLMatrix4a final_mat_correct;
+
+                F32* jw = just_weights[j].getF32ptr();
+
+                LLSkinningUtil::getPerVertexSkinMatrixWithIndices(jw, joint_indices_cursor, mat, final_mat, src);                
+
+                joint_indices_cursor += 4;
+
+			    LLVector4a& v = vol_face.mPositions[j];
+
+			    LLVector4a t;
+			    LLVector4a dst;
+			    bind_shape_matrix.affineTransform(v, t);
+			    final_mat.affineTransform(t, dst);
+			    pos[j] = dst;
+
+			    if (norm)
+			    {
+				    LLVector4a& n = vol_face.mNormals[j];
+				    bind_shape_matrix.rotate(n, t);
+				    final_mat.rotate(t, dst);
+				    dst.normalize3fast();
+				    norm[j] = dst;
+			    }
+		    }
+        }
+        // slow path with joint indices calculated from weights
+        else
+#endif
+        {
+            for (U32 j = 0; j < buffer->getNumVerts(); ++j)
+		    {
+			    LLMatrix4a final_mat;
+                LLSkinningUtil::getPerVertexSkinMatrix(weights[j].getF32ptr(), mat, false, final_mat, max_joints);
+
+			    LLVector4a& v = vol_face.mPositions[j];
+			    LLVector4a t;
+			    LLVector4a dst;
+			    bind_shape_matrix.affineTransform(v, t);
+			    final_mat.affineTransform(t, dst);
+			    pos[j] = dst;
+
+			    if (norm)
+			    {
+				    LLVector4a& n = vol_face.mNormals[j];
+				    bind_shape_matrix.rotate(n, t);
+				    final_mat.rotate(t, dst);
+				    //dst.normalize3fast();
+				    norm[j] = dst;
+			    }
+		    }
+        }
 	}
 }
 
@@ -2301,7 +2380,7 @@ void LLDrawPoolAvatar::updateRiggedVertexBuffers(LLVOAvatar* avatar)
 
 			stop_glerror();
 
-			const LLVolumeFace& vol_face = volume->getVolumeFace(te);
+			LLVolumeFace& vol_face = volume->getVolumeFace(te);
 			updateRiggedFaceVertexBuffer(avatar, face, skin, volume, vol_face);
 		}
 	}
diff --git a/indra/newview/lldrawpoolavatar.h b/indra/newview/lldrawpoolavatar.h
index e8add0e1d8..cb09eb18e2 100644
--- a/indra/newview/lldrawpoolavatar.h
+++ b/indra/newview/lldrawpoolavatar.h
@@ -257,7 +257,7 @@ typedef enum
 									  LLFace* facep, 
 									  const LLMeshSkinInfo* skin, 
 									  LLVolume* volume,
-									  const LLVolumeFace& vol_face);
+									  LLVolumeFace& vol_face);
 	void updateRiggedVertexBuffers(LLVOAvatar* avatar);
 
 	void renderRigged(LLVOAvatar* avatar, U32 type, bool glow = false);
diff --git a/indra/newview/llskinningutil.cpp b/indra/newview/llskinningutil.cpp
index 0fa4c2b114..83b9c8971a 100644
--- a/indra/newview/llskinningutil.cpp
+++ b/indra/newview/llskinningutil.cpp
@@ -34,8 +34,12 @@
 #include "llvolume.h"
 #include "llrigginginfo.h"
 
+#define DEBUG_SKINNING  LL_DEBUG
+#define MAT_USE_SSE     1
+
 void dump_avatar_and_skin_state(const std::string& reason, LLVOAvatar *avatar, const LLMeshSkinInfo *skin)
 {
+#if DEBUG_SKINNING
     static S32 dump_count = 0;
     const S32 max_dump = 10;
 
@@ -81,16 +85,16 @@ void dump_avatar_and_skin_state(const std::string& reason, LLVOAvatar *avatar, c
 
         dump_count++;
     }
+#endif
 }
 
 void LLSkinningUtil::initClass()
 {
 }
 
-U32 LLSkinningUtil::getMaxJointCount()
+S32 LLSkinningUtil::getMaxJointCount()
 {
-    U32 result = LL_MAX_JOINTS_PER_MESH_OBJECT;
-	return result;
+    return (S32)LL_MAX_JOINTS_PER_MESH_OBJECT;
 }
 
 U32 LLSkinningUtil::getMeshJointCount(const LLMeshSkinInfo *skin)
@@ -120,6 +124,8 @@ void LLSkinningUtil::scrubInvalidJoints(LLVOAvatar *avatar, LLMeshSkinInfo* skin
     skin->mInvalidJointsScrubbed = true;
 }
 
+#define MAT_USE_SSE 1
+
 void LLSkinningUtil::initSkinningMatrixPalette(
     LLMatrix4* mat,
     S32 count, 
@@ -130,9 +136,9 @@ void LLSkinningUtil::initSkinningMatrixPalette(
     for (U32 j = 0; j < count; ++j)
     {
         LLJoint *joint = avatar->getJoint(skin->mJointNums[j]);
+        llassert(joint);
         if (joint)
         {
-#define MAT_USE_SSE
 #ifdef MAT_USE_SSE
             LLMatrix4a bind, world, res;
             bind.loadu(skin->mInvBindMatrix[j]);
@@ -147,6 +153,7 @@ void LLSkinningUtil::initSkinningMatrixPalette(
         else
         {
             mat[j] = skin->mInvBindMatrix[j];
+#if DEBUG_SKINNING
             // This  shouldn't  happen   -  in  mesh  upload,  skinned
             // rendering  should  be disabled  unless  all joints  are
             // valid.  In other  cases of  skinned  rendering, invalid
@@ -157,16 +164,15 @@ void LLSkinningUtil::initSkinningMatrixPalette(
             LL_WARNS_ONCE("Avatar") << avatar->getFullname() 
                                     << " avatar build state: isBuilt() " << avatar->isBuilt() 
                                     << " mInitFlags " << avatar->mInitFlags << LL_ENDL;
-#if 0
-            dump_avatar_and_skin_state("initSkinningMatrixPalette joint not found", avatar, skin);
 #endif
+            dump_avatar_and_skin_state("initSkinningMatrixPalette joint not found", avatar, skin);
         }
     }
 }
 
 void LLSkinningUtil::checkSkinWeights(LLVector4a* weights, U32 num_vertices, const LLMeshSkinInfo* skin)
 {
-#ifdef SHOW_ASSERT                  // same condition that controls llassert()
+#if DEBUG_SKINNING
 	const S32 max_joints = skin->mJointNames.size();
     for (U32 j=0; j<num_vertices; j++)
     {
@@ -265,6 +271,7 @@ void LLSkinningUtil::initJointNums(LLMeshSkinInfo* skin, LLVOAvatar *avatar)
     {
         for (U32 j = 0; j < skin->mJointNames.size(); ++j)
         {
+    #if DEBUG_SKINNING     
             LLJoint *joint = NULL;
             if (skin->mJointNums[j] == -1)
             {
@@ -282,11 +289,16 @@ void LLSkinningUtil::initJointNums(LLMeshSkinInfo* skin, LLVOAvatar *avatar)
                 {
                     LL_WARNS_ONCE("Avatar") << avatar->getFullname() << " unable to find joint " << skin->mJointNames[j] << LL_ENDL;
                     LL_WARNS_ONCE("Avatar") << avatar->getFullname() << " avatar build state: isBuilt() " << avatar->isBuilt() << " mInitFlags " << avatar->mInitFlags << LL_ENDL;
-#if 0
                     dump_avatar_and_skin_state("initJointNums joint not found", avatar, skin);
-#endif
+                    skin->mJointNums[j] = 0;
                 }
             }
+    #else
+            LLJoint *joint = (skin->mJointNums[j] == -1) ? avatar->getJoint(skin->mJointNames[j]) : avatar->getJoint(skin->mJointNums[j]);
+            skin->mJointNums[j] = joint ? joint->getJointNum() : 0;            
+    #endif
+            // insure we have *a* valid joint to reference
+            llassert(skin->mJointNums[j] >= 0);
         }
         skin->mJointNumsInitialized = true;
     }
@@ -344,14 +356,17 @@ void LLSkinningUtil::updateRiggingInfo(const LLMeshSkinInfo* skin, LLVOAvatar *a
 
                                 // FIXME could precompute these matMuls.
                                 LLMatrix4a bind_shape;
-                                bind_shape.loadu(skin->mBindShapeMatrix);
                                 LLMatrix4a inv_bind;
-                                inv_bind.loadu(skin->mInvBindMatrix[joint_index]);
                                 LLMatrix4a mat;
-                                matMul(bind_shape, inv_bind, mat);
                                 LLVector4a pos_joint_space;
+
+                                bind_shape.loadu(skin->mBindShapeMatrix);
+                                inv_bind.loadu(skin->mInvBindMatrix[joint_index]);
+                                matMul(bind_shape, inv_bind, mat);
+
                                 mat.affineTransform(pos, pos_joint_space);
                                 pos_joint_space.mul(wght[k]);
+
                                 LLVector4a *extents = rig_info_tab[joint_num].getRiggedExtents();
                                 update_min_max(extents[0], extents[1], pos_joint_space);
                             }
@@ -366,6 +381,8 @@ void LLSkinningUtil::updateRiggingInfo(const LLMeshSkinInfo* skin, LLVOAvatar *a
                 vol_face.mJointRiggingInfoTab.setNeedsUpdate(false);
             }
         }
+
+#if DEBUG_SKINNING
         if (vol_face.mJointRiggingInfoTab.size()!=0)
         {
             LL_DEBUGS("RigSpammish") << "we have rigging info for vf " << &vol_face 
@@ -376,10 +393,40 @@ void LLSkinningUtil::updateRiggingInfo(const LLMeshSkinInfo* skin, LLVOAvatar *a
             LL_DEBUGS("RigSpammish") << "no rigging info for vf " << &vol_face 
                                      << " num_verts " << vol_face.mNumVertices << LL_ENDL; 
         }
+#endif
 
     }
 }
 
+// Updates per-joint rigged extents in rig_info_tab from num_verts vertices; joint_indices is read as 4 consecutive U8 entries per vertex.
+void LLSkinningUtil::updateRiggingInfo_(LLMeshSkinInfo* skin, LLVOAvatar *avatar, S32 num_verts, LLVector4a* weights, LLVector4a* positions, U8* joint_indices, LLJointRiggingInfoTab &rig_info_tab)
+{
+    LL_RECORD_BLOCK_TIME(FTM_FACE_RIGGING_INFO);
+    for (S32 i=0; i < num_verts; i++)
+    {
+        LLVector4a& pos  = positions[i];
+        LLVector4a& wght = weights[i];
+        for (U32 k=0; k<4; ++k)
+        {
+            const U8 joint_index = joint_indices[i * 4 + k]; // this vertex's k-th influence; a bare [k] reused vertex 0's joints for every vertex
+            S32 joint_num = skin->mJointNums[joint_index];
+            llassert(joint_num >= 0 && joint_num < LL_CHARACTER_MAX_ANIMATED_JOINTS);
+            rig_info_tab[joint_num].setIsRiggedTo(true);
+            LLMatrix4a bind_shape;
+            bind_shape.loadu(skin->mBindShapeMatrix);
+            LLMatrix4a inv_bind;
+            inv_bind.loadu(skin->mInvBindMatrix[joint_index]);
+            LLMatrix4a mat;
+            matMul(bind_shape, inv_bind, mat);
+            LLVector4a pos_joint_space;
+            mat.affineTransform(pos, pos_joint_space);
+            pos_joint_space.mul(wght[k]);
+            LLVector4a *extents = rig_info_tab[joint_num].getRiggedExtents();
+            update_min_max(extents[0], extents[1], pos_joint_space);
+        }
+    }
+}
+
 // This is used for extracting rotation from a bind shape matrix that
 // already has scales baked in
 LLQuaternion LLSkinningUtil::getUnscaledQuaternion(const LLMatrix4& mat4)
diff --git a/indra/newview/llskinningutil.h b/indra/newview/llskinningutil.h
index ccc501adc0..d39356451d 100644
--- a/indra/newview/llskinningutil.h
+++ b/indra/newview/llskinningutil.h
@@ -27,23 +27,48 @@
 #ifndef LLSKINNINGUTIL_H
 #define LLSKINNINGUTIL_H
 
+#include "v2math.h"
+#include "v4math.h"
+#include "llvector4a.h"
+#include "llmatrix4a.h"
+
 class LLVOAvatar;
 class LLMeshSkinInfo;
-class LLMatrix4a;
 class LLVolumeFace;
+class LLJointRiggingInfoTab;
 
 namespace LLSkinningUtil
 {
     void initClass();
-    U32 getMaxJointCount();
+    S32 getMaxJointCount();
     U32 getMeshJointCount(const LLMeshSkinInfo *skin);
     void scrubInvalidJoints(LLVOAvatar *avatar, LLMeshSkinInfo* skin);
     void initSkinningMatrixPalette(LLMatrix4* mat, S32 count, const LLMeshSkinInfo* skin, LLVOAvatar *avatar);
     void checkSkinWeights(LLVector4a* weights, U32 num_vertices, const LLMeshSkinInfo* skin);
     void scrubSkinWeights(LLVector4a* weights, U32 num_vertices, const LLMeshSkinInfo* skin);
     void getPerVertexSkinMatrix(F32* weights, LLMatrix4a* mat, bool handle_bad_scale, LLMatrix4a& final_mat, U32 max_joints);
+
+    LL_FORCE_INLINE void getPerVertexSkinMatrixWithIndices(
+        F32*        weights,
+        U8*         idx,
+        LLMatrix4a* mat,
+        LLMatrix4a& final_mat,
+        LLMatrix4a* src)
+    {    
+        final_mat.clear();
+        src[0].setMul(mat[idx[0]], weights[0]);
+        src[1].setMul(mat[idx[1]], weights[1]);
+        final_mat.add(src[0]);
+        final_mat.add(src[1]);
+        src[2].setMul(mat[idx[2]], weights[2]);        
+        src[3].setMul(mat[idx[3]], weights[3]);
+        final_mat.add(src[2]);
+        final_mat.add(src[3]);
+    }
+
     void initJointNums(LLMeshSkinInfo* skin, LLVOAvatar *avatar);
     void updateRiggingInfo(const LLMeshSkinInfo* skin, LLVOAvatar *avatar, LLVolumeFace& vol_face);
+    void updateRiggingInfo_(LLMeshSkinInfo* skin, LLVOAvatar *avatar, S32 num_verts, LLVector4a* weights, LLVector4a* positions, U8* joint_indices, LLJointRiggingInfoTab &rig_info_tab);
 	LLQuaternion getUnscaledQuaternion(const LLMatrix4& mat4);
 };
 
diff --git a/indra/newview/llvovolume.cpp b/indra/newview/llvovolume.cpp
index 02ef7612a7..706e2c6895 100644
--- a/indra/newview/llvovolume.cpp
+++ b/indra/newview/llvovolume.cpp
@@ -4787,18 +4787,44 @@ void LLRiggedVolume::update(const LLMeshSkinInfo* skin, LLVOAvatar* avatar, cons
                 U32 max_joints = LLSkinningUtil::getMaxJointCount();
                 rigged_vert_count += dst_face.mNumVertices;
                 rigged_face_count++;
-				for (U32 j = 0; j < dst_face.mNumVertices; ++j)
-				{
-					LLMatrix4a final_mat;
-                    LLSkinningUtil::getPerVertexSkinMatrix(weight[j].getF32ptr(), mat, false, final_mat, max_joints);
+
+            #if USE_SEPARATE_JOINT_INDICES_AND_WEIGHTS
+                if (vol_face.mJointIndices) // fast path with preconditioned joint indices
+                {
+                    LLMatrix4a src[4];
+                    U8* joint_indices_cursor = vol_face.mJointIndices;
+                    LLVector4a* just_weights = vol_face.mJustWeights;
+                    for (U32 j = 0; j < dst_face.mNumVertices; ++j)
+				    {
+					    LLMatrix4a final_mat;
+                        F32* w = just_weights[j].getF32ptr();
+                        LLSkinningUtil::getPerVertexSkinMatrixWithIndices(w, joint_indices_cursor, mat, final_mat, src);
+                        joint_indices_cursor += 4;
+
+					    LLVector4a& v = vol_face.mPositions[j];
+					    LLVector4a t;
+					    LLVector4a dst;
+					    bind_shape_matrix.affineTransform(v, t);
+					    final_mat.affineTransform(t, dst);
+					    pos[j] = dst;
+				    }
+                }
+                else
+            #endif
+                {
+				    for (U32 j = 0; j < dst_face.mNumVertices; ++j)
+				    {
+					    LLMatrix4a final_mat;
+                        LLSkinningUtil::getPerVertexSkinMatrix(weight[j].getF32ptr(), mat, false, final_mat, max_joints);
 				
-					LLVector4a& v = vol_face.mPositions[j];
-					LLVector4a t;
-					LLVector4a dst;
-					bind_shape_matrix.affineTransform(v, t);
-					final_mat.affineTransform(t, dst);
-					pos[j] = dst;
-				}
+					    LLVector4a& v = vol_face.mPositions[j];
+					    LLVector4a t;
+					    LLVector4a dst;
+					    bind_shape_matrix.affineTransform(v, t);
+					    final_mat.affineTransform(t, dst);
+					    pos[j] = dst;
+				    }
+                }
 
 				//update bounding box
 				// VFExtents change
-- 
cgit v1.3