1 files changed, 144 insertions, 380 deletions
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index e59e685f53..e052e37393 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -55,6 +55,7 @@
 #include "v4math.h"
 #include "m3math.h"
 #include "m4math.h"
+#include "llmatrix4a.h"
 
 #if !LL_DARWIN && !LL_LINUX && !LL_SOLARIS
 extern PFNGLWEIGHTPOINTERARBPROC glWeightPointerARB;
@@ -62,7 +63,6 @@ extern PFNGLWEIGHTFVARBPROC glWeightfvARB;
 extern PFNGLVERTEXBLENDARBPROC glVertexBlendARB;
 #endif
 
-static LLPointer<LLVertexBuffer> sRenderBuffer = NULL;
 static const U32 sRenderMask = LLVertexBuffer::MAP_VERTEX |
 							   LLVertexBuffer::MAP_NORMAL |
 							   LLVertexBuffer::MAP_TEXCOORD0;
@@ -375,6 +375,7 @@ const S32 NUM_AXES = 3;
 // pivot parent 0-n -- child = n+1
 
 static LLMatrix4	gJointMatUnaligned[32];
+static LLMatrix4a	gJointMatAligned[32];
 static LLMatrix3	gJointRotUnaligned[32];
 static LLVector4	gJointPivot[32];
 
@@ -457,9 +458,20 @@ void LLViewerJointMesh::uploadJointMatrices()
 			}
 		}
 		stop_glerror();
-		glUniform4fvARB(gAvatarMatrixParam, 45, mat);
+		if (LLGLSLShader::sCurBoundShaderPtr)
+		{
+			LLGLSLShader::sCurBoundShaderPtr->uniform4fv(LLViewerShaderMgr::AVATAR_MATRIX, 45, mat);
+		}
 		stop_glerror();
 	}
+	else
+	{
+		//load gJointMatUnaligned into gJointMatAligned
+		for (joint_num = 0; joint_num < reference_mesh->mJointRenderData.count(); ++joint_num)
+		{
+			gJointMatAligned[joint_num].loadu(gJointMatUnaligned[joint_num]);
+		}
+	}
 }
 
 //--------------------------------------------------------------------
@@ -501,27 +513,30 @@ int compare_int(const void *a, const void *b)
 U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 {
 	if (!mValid || !mMesh || !mFace || !mVisible || 
-		mFace->mVertexBuffer.isNull() ||
-		mMesh->getNumFaces() == 0) 
+		!mFace->getVertexBuffer() ||
+		mMesh->getNumFaces() == 0 ||
+		(LLGLSLShader::sNoFixedFunction && LLGLSLShader::sCurBoundShaderPtr == NULL))
 	{
 		return 0;
 	}
 
 	U32 triangle_count = 0;
 
+	S32 diffuse_channel = LLDrawPoolAvatar::sDiffuseChannel;
+
 	stop_glerror();
 	
 	//----------------------------------------------------------------
 	// setup current color
 	//----------------------------------------------------------------
 	if (is_dummy)
-		glColor4fv(LLVOAvatar::getDummyColor().mV);
+		gGL.diffuseColor4fv(LLVOAvatar::getDummyColor().mV);
 	else
-		glColor4fv(mColor.mV);
+		gGL.diffuseColor4fv(mColor.mV);
 
 	stop_glerror();
 	
-	LLGLSSpecular specular(LLColor4(1.f,1.f,1.f,1.f), mShiny && !(mFace->getPool()->getVertexShaderLevel() > 0));
+	LLGLSSpecular specular(LLColor4(1.f,1.f,1.f,1.f), (mFace->getPool()->getVertexShaderLevel() > 0 || LLGLSLShader::sNoFixedFunction) ? 0.f : mShiny);
 
 	//----------------------------------------------------------------
 	// setup current texture
@@ -531,33 +546,27 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 	LLTexUnit::eTextureAddressMode old_mode = LLTexUnit::TAM_WRAP;
 	if (mTestImageName)
 	{
-		gGL.getTexUnit(0)->bindManual(LLTexUnit::TT_TEXTURE, mTestImageName);
+		gGL.getTexUnit(diffuse_channel)->bindManual(LLTexUnit::TT_TEXTURE, mTestImageName);
 
 		if (mIsTransparent)
 		{
-			glColor4f(1.f, 1.f, 1.f, 1.f);
+			gGL.diffuseColor4f(1.f, 1.f, 1.f, 1.f);
 		}
 		else
 		{
-			glColor4f(0.7f, 0.6f, 0.3f, 1.f);
-			gGL.getTexUnit(0)->setTextureColorBlend(LLTexUnit::TBO_LERP_TEX_ALPHA, LLTexUnit::TBS_TEX_COLOR, LLTexUnit::TBS_PREV_COLOR);
+			gGL.diffuseColor4f(0.7f, 0.6f, 0.3f, 1.f);
+			gGL.getTexUnit(diffuse_channel)->setTextureColorBlend(LLTexUnit::TBO_LERP_TEX_ALPHA, LLTexUnit::TBS_TEX_COLOR, LLTexUnit::TBS_PREV_COLOR);
 		}
 	}
 	else if( !is_dummy && mLayerSet )
 	{
 		if(	mLayerSet->hasComposite() )
 		{
-			gGL.getTexUnit(0)->bind(mLayerSet->getComposite());
+			gGL.getTexUnit(diffuse_channel)->bind(mLayerSet->getComposite());
 		}
 		else
 		{
-			// This warning will always trigger if you've hacked the avatar to show as incomplete.
-			// Ignore the warning if that's the case.
-			if (!gSavedSettings.getBOOL("RenderUnloadedAvatar"))
-			{
-				//llwarns << "Layerset without composite" << llendl;
-			}
-			gGL.getTexUnit(0)->bind(LLViewerTextureManager::getFetchedTexture(IMG_DEFAULT));
+			gGL.getTexUnit(diffuse_channel)->bind(LLViewerTextureManager::getFetchedTexture(IMG_DEFAULT));
 		}
 	}
 	else
@@ -567,22 +576,25 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 		{
 			old_mode = mTexture->getAddressMode();
 		}
-		gGL.getTexUnit(0)->bind(mTexture.get());
-		gGL.getTexUnit(0)->bind(mTexture);
-		gGL.getTexUnit(0)->setTextureAddressMode(LLTexUnit::TAM_CLAMP);
+		gGL.getTexUnit(diffuse_channel)->bind(mTexture.get());
+		gGL.getTexUnit(diffuse_channel)->bind(mTexture);
+		gGL.getTexUnit(diffuse_channel)->setTextureAddressMode(LLTexUnit::TAM_CLAMP);
 	}
 	else
 	{
-		gGL.getTexUnit(0)->bind(LLViewerTextureManager::getFetchedTexture(IMG_DEFAULT));
+		gGL.getTexUnit(diffuse_channel)->bind(LLViewerTextureManager::getFetchedTexture(IMG_DEFAULT));
 	}
 	
-	mFace->mVertexBuffer->setBuffer(sRenderMask);
+	
+	U32 mask = sRenderMask;
 
 	U32 start = mMesh->mFaceVertexOffset;
 	U32 end = start + mMesh->mFaceVertexCount - 1;
 	U32 count = mMesh->mFaceIndexCount;
 	U32 offset = mMesh->mFaceIndexOffset;
 
+	LLVertexBuffer* buff = mFace->getVertexBuffer();
+
 	if (mMesh->hasWeights())
 	{
 		if ((mFace->getPool()->getVertexShaderLevel() > 0))
@@ -591,17 +603,24 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 			{
 				uploadJointMatrices();
 			}
+			mask = mask | LLVertexBuffer::MAP_WEIGHT;
+			if (mFace->getPool()->getVertexShaderLevel() > 1)
+			{
+				mask = mask | LLVertexBuffer::MAP_CLOTHWEIGHT;
+			}
 		}
 		
-		mFace->mVertexBuffer->drawRange(LLRender::TRIANGLES, start, end, count, offset);
+		buff->setBuffer(mask);
+		buff->drawRange(LLRender::TRIANGLES, start, end, count, offset);
 	}
 	else
 	{
-		glPushMatrix();
+		gGL.pushMatrix();
 		LLMatrix4 jointToWorld = getWorldMatrix();
-		glMultMatrixf((GLfloat*)jointToWorld.mMatrix);
-		mFace->mVertexBuffer->drawRange(LLRender::TRIANGLES, start, end, count, offset);
-		glPopMatrix();
+		gGL.multMatrix((GLfloat*)jointToWorld.mMatrix);
+		buff->setBuffer(mask);
+		buff->drawRange(LLRender::TRIANGLES, start, end, count, offset);
+		gGL.popMatrix();
 	}
 	gPipeline.addTrianglesDrawn(count);
 
@@ -609,13 +628,13 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 	
 	if (mTestImageName)
 	{
-		gGL.getTexUnit(0)->setTextureBlendType(LLTexUnit::TB_MULT);
+		gGL.getTexUnit(diffuse_channel)->setTextureBlendType(LLTexUnit::TB_MULT);
 	}
 
 	if (mTexture.notNull() && !is_dummy)
 	{
-		gGL.getTexUnit(0)->bind(mTexture);
-		gGL.getTexUnit(0)->setTextureAddressMode(old_mode);
+		gGL.getTexUnit(diffuse_channel)->bind(mTexture);
+		gGL.getTexUnit(diffuse_channel)->setTextureAddressMode(old_mode);
 	}
 
 	return triangle_count;
@@ -626,6 +645,9 @@ U32 LLViewerJointMesh::drawShape( F32 pixelArea, BOOL first_pass, BOOL is_dummy)
 //-----------------------------------------------------------------------------
 void LLViewerJointMesh::updateFaceSizes(U32 &num_vertices, U32& num_indices, F32 pixel_area)
 {
+	//bump num_vertices to next multiple of 4
+	num_vertices = (num_vertices + 0x3) & ~0x3;
+
 	// Do a pre-alloc pass to determine sizes of data.
 	if (mMesh && mValid)
 	{
@@ -648,13 +670,25 @@ static LLFastTimer::DeclareTimer FTM_AVATAR_FACE("Avatar Face");
 
 void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_wind, bool terse_update)
 {
+	//IF THIS FUNCTION BREAKS, SEE LLPOLYMESH CONSTRUCTOR AND CHECK ALIGNMENT OF INPUT ARRAYS
+
 	mFace = face;
 
-	if (mFace->mVertexBuffer.isNull())
+	if (!mFace->getVertexBuffer())
 	{
 		return;
 	}
 
+	LLDrawPool *poolp = mFace->getPool();
+	BOOL hardware_skinning = (poolp && poolp->getVertexShaderLevel() > 0) ? TRUE : FALSE;
+
+	if (!hardware_skinning && terse_update)
+	{ //no need to do terse updates if we're doing software vertex skinning
+	 // since mMesh is being copied into mVertexBuffer every frame
+		return;
+	}
+
+
 	LLFastTimer t(FTM_AVATAR_FACE);
 
 	LLStrider<LLVector3> verticesp;
@@ -667,111 +701,59 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w
 	// Copy data into the faces from the polymesh data.
 	if (mMesh && mValid)
 	{
-		if (mMesh->getNumVertices())
+		const U32 num_verts = mMesh->getNumVertices();
+
+		if (num_verts)
 		{
-			stop_glerror();
+			face->getVertexBuffer()->getIndexStrider(indicesp);
 			face->getGeometryAvatar(verticesp, normalsp, tex_coordsp, vertex_weightsp, clothing_weightsp);
-			stop_glerror();
-			face->mVertexBuffer->getIndexStrider(indicesp);
-			stop_glerror();
-
+			
 			verticesp += mMesh->mFaceVertexOffset;
-			tex_coordsp += mMesh->mFaceVertexOffset;
 			normalsp += mMesh->mFaceVertexOffset;
-			vertex_weightsp += mMesh->mFaceVertexOffset;
-			clothing_weightsp += mMesh->mFaceVertexOffset;
+			
+			F32* v = (F32*) verticesp.get();
+			F32* n = (F32*) normalsp.get();
+			
+			U32 words = num_verts*4;
+
+			LLVector4a::memcpyNonAliased16(v, (F32*) mMesh->getCoords(), words*sizeof(F32));
+			LLVector4a::memcpyNonAliased16(n, (F32*) mMesh->getNormals(), words*sizeof(F32));
+						
+			
+			if (!terse_update)
+			{
+				vertex_weightsp += mMesh->mFaceVertexOffset;
+				clothing_weightsp += mMesh->mFaceVertexOffset;
+				tex_coordsp += mMesh->mFaceVertexOffset;
+		
+				F32* tc = (F32*) tex_coordsp.get();
+				F32* vw = (F32*) vertex_weightsp.get();
+				F32* cw = (F32*) clothing_weightsp.get();	
 
-			const U32* __restrict coords = (U32*) mMesh->getCoords();
-			const U32* __restrict tex_coords = (U32*) mMesh->getTexCoords();
-			const U32* __restrict normals = (U32*) mMesh->getNormals();
-			const U32* __restrict weights = (U32*) mMesh->getWeights();
-			const U32* __restrict cloth_weights = (U32*) mMesh->getClothingWeights();
+				LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), num_verts*2*sizeof(F32));
+				LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), num_verts*sizeof(F32));	
+				LLVector4a::memcpyNonAliased16(cw, (F32*) mMesh->getClothingWeights(), num_verts*4*sizeof(F32));	
+			}
 
-			const U32 num_verts = mMesh->getNumVertices();
+			const U32 idx_count = mMesh->getNumFaces()*3;
 
-			U32 i = 0;
+			indicesp += mMesh->mFaceIndexOffset;
 
-			const U32 skip = verticesp.getSkip()/sizeof(U32);
+			U16* __restrict idx = indicesp.get();
+			S32* __restrict src_idx = (S32*) mMesh->getFaces();	
 
-			U32* __restrict v = (U32*) verticesp.get();
-			U32* __restrict n = (U32*) normalsp.get();
-			
-			if (terse_update)
+			const S32 offset = (S32) mMesh->mFaceVertexOffset;
+
+			for (S32 i = 0; i < idx_count; ++i)
 			{
-				for (S32 i = num_verts; i > 0; --i)
-				{
-					//morph target application only, only update positions and normals
-					v[0] = coords[0]; 
-					v[1] = coords[1]; 
-					v[2] = coords[2];		
-					coords += 3;
-					v += skip;
-				}
-
-				for (S32 i = num_verts; i > 0; --i)
-				{
-					n[0] = normals[0]; 
-					n[1] = normals[1];
-					n[2] = normals[2];
-					normals += 3;
-					n += skip;
-				}
-			}
-			else
-				{
-
-				U32* __restrict tc = (U32*) tex_coordsp.get();
-				U32* __restrict vw = (U32*) vertex_weightsp.get();
-				U32* __restrict cw = (U32*) clothing_weightsp.get();
-				
-				do
-				{
-					v[0] = *(coords++); 
-					v[1] = *(coords++); 
-					v[2] = *(coords++);
-					v += skip;
-
-					tc[0] = *(tex_coords++); 
-					tc[1] = *(tex_coords++);
-					tc += skip;
-
-					n[0] = *(normals++); 
-					n[1] = *(normals++);
-					n[2] = *(normals++);
-					n += skip;
-
-					vw[0] = *(weights++);
-					vw += skip;
-
-					cw[0] = *(cloth_weights++);
-					cw[1] = *(cloth_weights++);
-					cw[2] = *(cloth_weights++);
-					cw[3] = *(cloth_weights++);
-					cw += skip;
-				}
-				while (++i < num_verts);
-
-				const U32 idx_count = mMesh->getNumFaces()*3;
-
-				indicesp += mMesh->mFaceIndexOffset;
-
-				U16* __restrict idx = indicesp.get();
-				S32* __restrict src_idx = (S32*) mMesh->getFaces();
-
-				i = 0;
-
-				const S32 offset = (S32) mMesh->mFaceVertexOffset;
-
-				do
-				{
-					*(idx++) = *(src_idx++)+offset;
-				}
-				while (++i < idx_count);
+				*(idx++) = *(src_idx++)+offset;
 			}
 		}
 	}
 }
 
+
+
 //-----------------------------------------------------------------------------
 // updateLOD()
 //-----------------------------------------------------------------------------
@@ -783,155 +765,58 @@ BOOL LLViewerJointMesh::updateLOD(F32 pixel_area, BOOL activate)
 }
 
 // static
-void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
+void LLViewerJointMesh::updateGeometry(LLFace *mFace, LLPolyMesh *mMesh)
 {
 	LLStrider<LLVector3> o_vertices;
 	LLStrider<LLVector3> o_normals;
 
 	//get vertex and normal striders
-	LLVertexBuffer *buffer = mFace->mVertexBuffer;
+	LLVertexBuffer* buffer = mFace->getVertexBuffer();
 	buffer->getVertexStrider(o_vertices,  0);
 	buffer->getNormalStrider(o_normals,   0);
 
-	F32 last_weight = F32_MAX;
-	LLMatrix4 gBlendMat;
-	LLMatrix3 gBlendRotMat;
+	F32* __restrict vert = o_vertices[0].mV;
+	F32* __restrict norm = o_normals[0].mV;
 
-	const F32* weights = mMesh->getWeights();
-	const LLVector3* coords = mMesh->getCoords();
-	const LLVector3* normals = mMesh->getNormals();
-	for (U32 index = 0; index < mMesh->getNumVertices(); index++)
-	{
-		U32 bidx = index + mMesh->mFaceVertexOffset;
-		
-		// blend by first matrix
-		F32 w = weights[index]; 
-		
-		// Maybe we don't have to change gBlendMat.
-		// Profiles of a single-avatar scene on a Mac show this to be a very
-		// common case.  JC
-		if (w == last_weight)
-		{
-			o_vertices[bidx] = coords[index] * gBlendMat;
-			o_normals[bidx] = normals[index] * gBlendRotMat;
-			continue;
-		}
-		
-		last_weight = w;
+	const F32* __restrict weights = mMesh->getWeights();
+	const LLVector4a* __restrict coords = (LLVector4a*) mMesh->getCoords();
+	const LLVector4a* __restrict normals = (LLVector4a*) mMesh->getNormals();
 
-		S32 joint = llfloor(w);
-		w -= joint;
-		
-		// No lerp required in this case.
-		if (w == 1.0f)
-		{
-			gBlendMat = gJointMatUnaligned[joint+1];
-			o_vertices[bidx] = coords[index] * gBlendMat;
-			gBlendRotMat = gJointRotUnaligned[joint+1];
-			o_normals[bidx] = normals[index] * gBlendRotMat;
-			continue;
-		}
-		
-		// Try to keep all the accesses to the matrix data as close
-		// together as possible.  This function is a hot spot on the
-		// Mac. JC
-		LLMatrix4 &m0 = gJointMatUnaligned[joint+1];
-		LLMatrix4 &m1 = gJointMatUnaligned[joint+0];
-		
-		gBlendMat.mMatrix[VX][VX] = lerp(m1.mMatrix[VX][VX], m0.mMatrix[VX][VX], w);
-		gBlendMat.mMatrix[VX][VY] = lerp(m1.mMatrix[VX][VY], m0.mMatrix[VX][VY], w);
-		gBlendMat.mMatrix[VX][VZ] = lerp(m1.mMatrix[VX][VZ], m0.mMatrix[VX][VZ], w);
-
-		gBlendMat.mMatrix[VY][VX] = lerp(m1.mMatrix[VY][VX], m0.mMatrix[VY][VX], w);
-		gBlendMat.mMatrix[VY][VY] = lerp(m1.mMatrix[VY][VY], m0.mMatrix[VY][VY], w);
-		gBlendMat.mMatrix[VY][VZ] = lerp(m1.mMatrix[VY][VZ], m0.mMatrix[VY][VZ], w);
-
-		gBlendMat.mMatrix[VZ][VX] = lerp(m1.mMatrix[VZ][VX], m0.mMatrix[VZ][VX], w);
-		gBlendMat.mMatrix[VZ][VY] = lerp(m1.mMatrix[VZ][VY], m0.mMatrix[VZ][VY], w);
-		gBlendMat.mMatrix[VZ][VZ] = lerp(m1.mMatrix[VZ][VZ], m0.mMatrix[VZ][VZ], w);
-
-		gBlendMat.mMatrix[VW][VX] = lerp(m1.mMatrix[VW][VX], m0.mMatrix[VW][VX], w);
-		gBlendMat.mMatrix[VW][VY] = lerp(m1.mMatrix[VW][VY], m0.mMatrix[VW][VY], w);
-		gBlendMat.mMatrix[VW][VZ] = lerp(m1.mMatrix[VW][VZ], m0.mMatrix[VW][VZ], w);
-
-		o_vertices[bidx] = coords[index] * gBlendMat;
-		
-		LLMatrix3 &n0 = gJointRotUnaligned[joint+1];
-		LLMatrix3 &n1 = gJointRotUnaligned[joint+0];
-		
-		gBlendRotMat.mMatrix[VX][VX] = lerp(n1.mMatrix[VX][VX], n0.mMatrix[VX][VX], w);
-		gBlendRotMat.mMatrix[VX][VY] = lerp(n1.mMatrix[VX][VY], n0.mMatrix[VX][VY], w);
-		gBlendRotMat.mMatrix[VX][VZ] = lerp(n1.mMatrix[VX][VZ], n0.mMatrix[VX][VZ], w);
-
-		gBlendRotMat.mMatrix[VY][VX] = lerp(n1.mMatrix[VY][VX], n0.mMatrix[VY][VX], w);
-		gBlendRotMat.mMatrix[VY][VY] = lerp(n1.mMatrix[VY][VY], n0.mMatrix[VY][VY], w);
-		gBlendRotMat.mMatrix[VY][VZ] = lerp(n1.mMatrix[VY][VZ], n0.mMatrix[VY][VZ], w);
-
-		gBlendRotMat.mMatrix[VZ][VX] = lerp(n1.mMatrix[VZ][VX], n0.mMatrix[VZ][VX], w);
-		gBlendRotMat.mMatrix[VZ][VY] = lerp(n1.mMatrix[VZ][VY], n0.mMatrix[VZ][VY], w);
-		gBlendRotMat.mMatrix[VZ][VZ] = lerp(n1.mMatrix[VZ][VZ], n0.mMatrix[VZ][VZ], w);
-		
-		o_normals[bidx] = normals[index] * gBlendRotMat;
-	}
+	U32 offset = mMesh->mFaceVertexOffset*4;
+	vert += offset;
+	norm += offset;
 
-	buffer->setBuffer(0);
-}
+	for (U32 index = 0; index < mMesh->getNumVertices(); index++)
+	{
+		// equivalent to joint = floorf(weights[index]);
+		S32 joint = _mm_cvtt_ss2si(_mm_load_ss(weights+index));
+		F32 w = weights[index] - joint;		
 
-const U32 UPDATE_GEOMETRY_CALL_MASK			= 0x1FFF; // 8K samples before overflow
-const U32 UPDATE_GEOMETRY_CALL_OVERFLOW		= ~UPDATE_GEOMETRY_CALL_MASK;
-static bool sUpdateGeometryCallPointer		= false;
-static F64 sUpdateGeometryGlobalTime		= 0.0 ;
-static F64 sUpdateGeometryElapsedTime		= 0.0 ;
-static F64 sUpdateGeometryElapsedTimeOff	= 0.0 ;
-static F64 sUpdateGeometryElapsedTimeOn		= 0.0 ;
-static F64 sUpdateGeometryRunAvgOff[10];
-static F64 sUpdateGeometryRunAvgOn[10];
-static U32 sUpdateGeometryRunCount			= 0 ;
-static U32 sUpdateGeometryCalls				= 0 ;
-static U32 sUpdateGeometryLastProcessor		= 0 ;
-static BOOL sVectorizePerfTest 				= FALSE;
-static U32 sVectorizeProcessor 				= 0;
-
-//static
-void (*LLViewerJointMesh::sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);
-
-//static
-void LLViewerJointMesh::updateVectorize()
-{
-	sVectorizePerfTest = gSavedSettings.getBOOL("VectorizePerfTest");
-	sVectorizeProcessor = gSavedSettings.getU32("VectorizeProcessor");
-	BOOL vectorizeEnable = gSavedSettings.getBOOL("VectorizeEnable");
-	BOOL vectorizeSkin = gSavedSettings.getBOOL("VectorizeSkin");
+		LLMatrix4a gBlendMat;
 
-	std::string vp;
-	switch(sVectorizeProcessor)
-	{
-		case 2: vp = "SSE2"; break;					// *TODO: replace the magic #s
-		case 1: vp = "SSE"; break;
-		default: vp = "COMPILER DEFAULT"; break;
-	}
-	LL_INFOS("AppInit") << "Vectorization         : " << ( vectorizeEnable ? "ENABLED" : "DISABLED" ) << LL_ENDL ;
-	LL_INFOS("AppInit") << "Vector Processor      : " << vp << LL_ENDL ;
-	LL_INFOS("AppInit") << "Vectorized Skinning   : " << ( vectorizeSkin ? "ENABLED" : "DISABLED" ) << LL_ENDL ;
-	if(vectorizeEnable && vectorizeSkin)
-	{
-		switch(sVectorizeProcessor)
+		if (w != 0.f)
 		{
-			case 2:
-				sUpdateGeometryFunc = &updateGeometrySSE2;
-				break;
-			case 1:
-				sUpdateGeometryFunc = &updateGeometrySSE;
-				break;
-			default:
-				sUpdateGeometryFunc = &updateGeometryVectorized;
-				break;
+			// blend between matrices and apply
+			gBlendMat.setLerp(gJointMatAligned[joint+0],
+							  gJointMatAligned[joint+1], w);
+
+			LLVector4a res;
+			gBlendMat.affineTransform(coords[index], res);
+			res.store4a(vert+index*4);
+			gBlendMat.rotate(normals[index], res);
+			res.store4a(norm+index*4);
+		}
+		else
+		{  // No lerp required in this case.
+			LLVector4a res;
+			gJointMatAligned[joint].affineTransform(coords[index], res);
+			res.store4a(vert+index*4);
+			gJointMatAligned[joint].rotate(normals[index], res);
+			res.store4a(norm+index*4);
 		}
 	}
-	else
-	{
-		sUpdateGeometryFunc = &updateGeometryOriginal;
-	}
+
+	buffer->flush();
 }
 
 void LLViewerJointMesh::updateJointGeometry()
@@ -940,135 +825,14 @@ void LLViewerJointMesh::updateJointGeometry()
 		  && mMesh
 		  && mFace
 		  && mMesh->hasWeights()
-		  && mFace->mVertexBuffer.notNull()
+		  && mFace->getVertexBuffer()
 		  && LLViewerShaderMgr::instance()->getVertexShaderLevel(LLViewerShaderMgr::SHADER_AVATAR) == 0))
 	{
 		return;
 	}
 
-	if (!sVectorizePerfTest)
-	{
-		// Once we've measured performance, just run the specified
-		// code version.
-		if(sUpdateGeometryFunc == updateGeometryOriginal)
-			uploadJointMatrices();
-		sUpdateGeometryFunc(mFace, mMesh);
-	}
-	else
-	{
-		// At startup, measure the amount of time in skinning and choose
-		// the fastest one.
-		LLTimer ug_timer ;
-		
-		if (sUpdateGeometryCallPointer)
-		{
-			if(sUpdateGeometryFunc == updateGeometryOriginal)
-				uploadJointMatrices();
-			// call accelerated version for this processor
-			sUpdateGeometryFunc(mFace, mMesh);
-		}
-		else
-		{
-			uploadJointMatrices();
-			updateGeometryOriginal(mFace, mMesh);
-		}
-	
-		sUpdateGeometryElapsedTime += ug_timer.getElapsedTimeF64();
-		++sUpdateGeometryCalls;
-		if(0 != (sUpdateGeometryCalls & UPDATE_GEOMETRY_CALL_OVERFLOW))
-		{
-			F64 time_since_app_start = ug_timer.getElapsedSeconds();
-			if(sUpdateGeometryGlobalTime == 0.0 
-				|| sUpdateGeometryLastProcessor != sVectorizeProcessor)
-			{
-				sUpdateGeometryGlobalTime		= time_since_app_start;
-				sUpdateGeometryElapsedTime		= 0;
-				sUpdateGeometryCalls			= 0;
-				sUpdateGeometryRunCount			= 0;
-				sUpdateGeometryLastProcessor	= sVectorizeProcessor;
-				sUpdateGeometryCallPointer		= false;
-				return;
-			}
-			F64 percent_time_in_function = 
-				( sUpdateGeometryElapsedTime * 100.0 ) / ( time_since_app_start - sUpdateGeometryGlobalTime ) ;
-			sUpdateGeometryGlobalTime = time_since_app_start;
-			if (!sUpdateGeometryCallPointer)
-			{
-				// First set of run data is with vectorization off.
-				sUpdateGeometryCallPointer = true;
-				llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
-					<< "vectorize off " << percent_time_in_function
-					<< "% of time with "
-					<< (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
-					<< " seconds per call "
-					<< llendl;
-				sUpdateGeometryRunAvgOff[sUpdateGeometryRunCount] = percent_time_in_function;
-				sUpdateGeometryElapsedTimeOff += sUpdateGeometryElapsedTime;
-				sUpdateGeometryCalls = 0;
-			}
-			else
-			{
-				// Second set of run data is with vectorization on.
-				sUpdateGeometryCallPointer = false;
-				llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
-					<< "VEC on " << percent_time_in_function
-					<< "% of time with "
-					<< (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
-					<< " seconds per call "
-					<< llendl;
-				sUpdateGeometryRunAvgOn[sUpdateGeometryRunCount] = percent_time_in_function ;
-				sUpdateGeometryElapsedTimeOn += sUpdateGeometryElapsedTime;
-
-				sUpdateGeometryCalls = 0;
-				sUpdateGeometryRunCount++;
-				F64 a = 0.0, b = 0.0;
-				for(U32 i = 0; i<sUpdateGeometryRunCount; i++)
-				{
-					a += sUpdateGeometryRunAvgOff[i];
-					b += sUpdateGeometryRunAvgOn[i];
-				}
-				a /= sUpdateGeometryRunCount;
-				b /= sUpdateGeometryRunCount;
-				F64 perf_boost = ( sUpdateGeometryElapsedTimeOff - sUpdateGeometryElapsedTimeOn ) / sUpdateGeometryElapsedTimeOn;
-				llinfos << "run averages (" << (F64)sUpdateGeometryRunCount
-					<< "/10) vectorize off " << a
-					<< "% : vectorize type " << sVectorizeProcessor
-					<< " " << b
-					<< "% : performance boost " 
-					<< perf_boost * 100.0
-					<< "%"
-					<< llendl ;
-				if(sUpdateGeometryRunCount == 10)
-				{
-					// In case user runs test again, force reset of data on
-					// next run.
-					sUpdateGeometryGlobalTime = 0.0;
-
-					// We have data now on which version is faster.  Switch to that
-					// code and save the data for next run.
-					gSavedSettings.setBOOL("VectorizePerfTest", FALSE);
-
-					if (perf_boost > 0.0)
-					{
-						llinfos << "Vectorization improves avatar skinning performance, "
-							<< "keeping on for future runs."
-							<< llendl;
-						gSavedSettings.setBOOL("VectorizeSkin", TRUE);
-					}
-					else
-					{
-						// SIMD decreases performance, fall back to original code
-						llinfos << "Vectorization decreases avatar skinning performance, "
-							<< "switching back to original code."
-							<< llendl;
-
-						gSavedSettings.setBOOL("VectorizeSkin", FALSE);
-					}
-				}
-			}
-			sUpdateGeometryElapsedTime = 0.0f;
-		}
-	}
+	uploadJointMatrices();
+	updateGeometry(mFace, mMesh);
 }
 
 void LLViewerJointMesh::dump()