svn merge -r 62595:62596 and 62598:63308 sse-skinning-3 for faster software avatar rendering. Visual Studio 2005 project file fixed pending.

author: James Cook <james@lindenlab.com> 2007-07-02 23:52:40 +0000
committer: James Cook <james@lindenlab.com> 2007-07-02 23:52:40 +0000
commit: 1a33bc19b4ce94ab210749911dff14409b4454dd (patch)
tree: b674d97d37240a29c0a6671adfe950a506ef0ea4 /indra/newview
parent: e5124431b54d4342d4677371fccca5bc7250c079 (diff)
7 files changed, 467 insertions, 29 deletions
diff --git a/indra/newview/lldrawable.h b/indra/newview/lldrawable.h
index 48c58dbb4c..328a116f59 100644
--- a/indra/newview/lldrawable.h
+++ b/indra/newview/lldrawable.h
@@ -26,6 +26,7 @@
 #include "llviewerobject.h"
 #include "llrect.h"
 
+class LLCamera;
 class LLDrawPool;
 class LLDrawable;
 class LLFace;
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index 150943465d..ba4c7e1b20 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -19,6 +19,7 @@
 #include "llfasttimer.h"
 
 #include "llagent.h"
+#include "llapr.h"
 #include "llbox.h"
 #include "lldrawable.h"
 #include "lldrawpoolavatar.h"
@@ -29,12 +30,18 @@
 #include "llglheaders.h"
 #include "lltexlayer.h"
 #include "llviewercamera.h"
+#include "llviewercontrol.h"
 #include "llviewerimagelist.h"
 #include "llviewerjointmesh.h"
 #include "llvoavatar.h"
 #include "llsky.h"
 #include "pipeline.h"
 #include "llglslshader.h"
+#include "llmath.h"
+#include "v4math.h"
+#include "m3math.h"
+#include "m4math.h"
+
 
 #if !LL_DARWIN && !LL_LINUX
 extern PFNGLWEIGHTPOINTERARBPROC glWeightPointerARB;
@@ -48,6 +55,7 @@ static const U32 sRenderMask = LLVertexBuffer::MAP_VERTEX |
 							   LLVertexBuffer::MAP_NORMAL |
 							   LLVertexBuffer::MAP_TEXCOORD;
 
+
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 // LLViewerJointMesh::LLSkinJoint
@@ -100,6 +108,7 @@ BOOL LLSkinJoint::setupSkinJoint( LLViewerJoint *joint)
 	return TRUE;
 }
 
+
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 // LLViewerJointMesh
@@ -394,9 +403,9 @@ const S32 NUM_AXES = 3;
 // rotation Z 0-n
 // pivot parent 0-n -- child = n+1
 
-static LLMatrix4 gJointMat[32];
-static LLMatrix3 gJointRot[32];
-static LLVector4 gJointPivot[32];
+static LLMatrix4	gJointMatUnaligned[32];
+static LLMatrix3	gJointRotUnaligned[32];
+static LLVector4	gJointPivot[32];
 
 //-----------------------------------------------------------------------------
 // uploadJointMatrices()
@@ -417,8 +426,8 @@ void LLViewerJointMesh::uploadJointMatrices()
 		{
 			joint_mat *= LLDrawPoolAvatar::getModelView();
 		}
-		gJointMat[joint_num] = joint_mat;
-		gJointRot[joint_num] = joint_mat.getMat3();
+		gJointMatUnaligned[joint_num] = joint_mat;
+		gJointRotUnaligned[joint_num] = joint_mat.getMat3();
 	}
 
 	BOOL last_pivot_uploaded = FALSE;
@@ -455,8 +464,8 @@ void LLViewerJointMesh::uploadJointMatrices()
 	{
 		LLVector3 pivot;
 		pivot = LLVector3(gJointPivot[i]);
-		pivot = pivot * gJointRot[i];
-		gJointMat[i].translate(pivot);
+		pivot = pivot * gJointRotUnaligned[i];
+		gJointMatUnaligned[i].translate(pivot);
 	}
 
 	// upload matrices
@@ -467,11 +476,11 @@ void LLViewerJointMesh::uploadJointMatrices()
 
 		for (joint_num = 0; joint_num < reference_mesh->mJointRenderData.count(); joint_num++)
 		{
-			gJointMat[joint_num].transpose();
+			gJointMatUnaligned[joint_num].transpose();
 
 			for (S32 axis = 0; axis < NUM_AXES; axis++)
 			{
-				F32* vector = gJointMat[joint_num].mMatrix[axis];
+				F32* vector = gJointMatUnaligned[joint_num].mMatrix[axis];
 				//glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, LL_CHARACTER_MAX_JOINTS_PER_MESH * axis + joint_num+5, (GLfloat*)vector);
 				U32 offset = LL_CHARACTER_MAX_JOINTS_PER_MESH*axis+joint_num;
 				memcpy(mat+offset*4, vector, sizeof(GLfloat)*4);
@@ -883,21 +892,9 @@ BOOL LLViewerJointMesh::updateLOD(F32 pixel_area, BOOL activate)
 	return (valid != activate);
 }
 
-
-void LLViewerJointMesh::updateGeometry()
+// static
+void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
 {
-	if (!(mValid
-		  && mMesh
-		  && mFace
-		  && mMesh->hasWeights()
-		  && mFace->mVertexBuffer.notNull()
-		  && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
-	{
-		return;
-	}
-	
-	uploadJointMatrices();
-
 	LLStrider<LLVector3> o_vertices;
 	LLStrider<LLVector3> o_normals;
 
@@ -938,9 +935,9 @@ void LLViewerJointMesh::updateGeometry()
 		// No lerp required in this case.
 		if (w == 1.0f)
 		{
-			gBlendMat = gJointMat[joint+1];
+			gBlendMat = gJointMatUnaligned[joint+1];
 			o_vertices[bidx] = coords[index] * gBlendMat;
-			gBlendRotMat = gJointRot[joint+1];
+			gBlendRotMat = gJointRotUnaligned[joint+1];
 			o_normals[bidx] = normals[index] * gBlendRotMat;
 			continue;
 		}
@@ -948,8 +945,8 @@ void LLViewerJointMesh::updateGeometry()
 		// Try to keep all the accesses to the matrix data as close
 		// together as possible.  This function is a hot spot on the
 		// Mac. JC
-		LLMatrix4 &m0 = gJointMat[joint+1];
-		LLMatrix4 &m1 = gJointMat[joint+0];
+		LLMatrix4 &m0 = gJointMatUnaligned[joint+1];
+		LLMatrix4 &m1 = gJointMatUnaligned[joint+0];
 		
 		gBlendMat.mMatrix[VX][VX] = lerp(m1.mMatrix[VX][VX], m0.mMatrix[VX][VX], w);
 		gBlendMat.mMatrix[VX][VY] = lerp(m1.mMatrix[VX][VY], m0.mMatrix[VX][VY], w);
@@ -969,8 +966,8 @@ void LLViewerJointMesh::updateGeometry()
 
 		o_vertices[bidx] = coords[index] * gBlendMat;
 		
-		LLMatrix3 &n0 = gJointRot[joint+1];
-		LLMatrix3 &n1 = gJointRot[joint+0];
+		LLMatrix3 &n0 = gJointRotUnaligned[joint+1];
+		LLMatrix3 &n1 = gJointRotUnaligned[joint+0];
 		
 		gBlendRotMat.mMatrix[VX][VX] = lerp(n1.mMatrix[VX][VX], n0.mMatrix[VX][VX], w);
 		gBlendRotMat.mMatrix[VX][VY] = lerp(n1.mMatrix[VX][VY], n0.mMatrix[VX][VY], w);
@@ -988,6 +985,161 @@ void LLViewerJointMesh::updateGeometry()
 	}
 }
 
+const U32 UPDATE_GEOMETRY_CALL_MASK			= 0x1FFF; // 8K samples before overflow
+const U32 UPDATE_GEOMETRY_CALL_OVERFLOW		= ~UPDATE_GEOMETRY_CALL_MASK; // any of these bits set in the call counter ends a sample window
+static bool sUpdateGeometryCallPointer		= false; // false: time the original code; true: time sUpdateGeometryFunc
+static F64 sUpdateGeometryGlobalTime		= 0.0 ; // clock value at the previous window boundary; 0.0 forces a reset
+static F64 sUpdateGeometryElapsedTime		= 0.0 ; // time spent skinning during the current window
+static F64 sUpdateGeometryElapsedTimeOff	= 0.0 ; // sum of window times measured with vectorization off
+static F64 sUpdateGeometryElapsedTimeOn		= 0.0 ; // sum of window times measured with vectorization on
+static F64 sUpdateGeometryRunAvgOff[10]; // per-run percent of time in skinning, vectorization off
+static F64 sUpdateGeometryRunAvgOn[10]; // per-run percent of time in skinning, vectorization on
+static U32 sUpdateGeometryRunCount			= 0 ; // completed off/on run pairs; the test ends at 10
+static U32 sUpdateGeometryCalls				= 0 ; // timed calls in the current window
+static U32 sUpdateGeometryLastProcessor		= 0 ; // gVectorizeProcessor value the current samples belong to
+void (*LLViewerJointMesh::sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh); // definition of the static member; called by updateGeometry()
+// Skins this mesh on the CPU through sUpdateGeometryFunc; while gVectorizePerfTest is set, also times the original code against that routine.
+void LLViewerJointMesh::updateGeometry()
+{
+	extern BOOL gVectorizePerfTest;	// TRUE while the off/on benchmark below is running
+	extern U32	gVectorizeProcessor;	// vectorization type under test; logged with the run averages
+	// Skip unless the mesh is valid and weighted, has a vertex buffer, and the avatar vertex shader level is 0.
+	if (!(mValid
+		  && mMesh
+		  && mFace
+		  && mMesh->hasWeights()
+		  && mFace->mVertexBuffer.notNull()
+		  && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
+	{
+		return;
+	}
+
+	if (!gVectorizePerfTest)
+	{
+		// Once we've measured performance, just run the specified
+		// code version.
+		if(sUpdateGeometryFunc == updateGeometryOriginal)
+			uploadJointMatrices();	// only the original routine reads the uploaded joint matrices
+		sUpdateGeometryFunc(mFace, mMesh);
+	}
+	else
+	{
+		// At startup, measure the amount of time in skinning and choose
+		// the fastest one.
+		LLTimer ug_timer ;
+		
+		if (sUpdateGeometryCallPointer)
+		{
+			if(sUpdateGeometryFunc == updateGeometryOriginal)
+				uploadJointMatrices();
+			// call accelerated version for this processor
+			sUpdateGeometryFunc(mFace, mMesh);
+		}
+		else
+		{
+			uploadJointMatrices();
+			updateGeometryOriginal(mFace, mMesh);
+		}
+	
+		sUpdateGeometryElapsedTime += ug_timer.getElapsedTimeF64();
+		++sUpdateGeometryCalls;
+		if(0 != (sUpdateGeometryCalls & UPDATE_GEOMETRY_CALL_OVERFLOW))	// sample window is full
+		{
+			F64 time_since_app_start = ug_timer.getElapsedSeconds();	// NOTE(review): presumably a global clock, not this local timer -- confirm
+			if(sUpdateGeometryGlobalTime == 0.0 
+				|| sUpdateGeometryLastProcessor != gVectorizeProcessor)
+			{
+				// Fresh test: drop every sample, including the cumulative off/on totals behind perf_boost.
+				sUpdateGeometryGlobalTime		= time_since_app_start;
+				sUpdateGeometryElapsedTime		= sUpdateGeometryElapsedTimeOff = sUpdateGeometryElapsedTimeOn = 0.0;
+				sUpdateGeometryCalls			= sUpdateGeometryRunCount = 0;
+				sUpdateGeometryLastProcessor	= gVectorizeProcessor;
+				sUpdateGeometryCallPointer		= false;
+				return;
+			}
+			F64 percent_time_in_function = 
+				( sUpdateGeometryElapsedTime * 100.0 ) / ( time_since_app_start - sUpdateGeometryGlobalTime ) ;
+			sUpdateGeometryGlobalTime = time_since_app_start;
+			if (!sUpdateGeometryCallPointer)
+			{
+				// First set of run data is with vectorization off.
+				sUpdateGeometryCallPointer = true;
+				llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+					<< "vectorize off " << percent_time_in_function
+					<< "% of time with "
+					<< (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+					<< " seconds per call "
+					<< llendl;
+				sUpdateGeometryRunAvgOff[sUpdateGeometryRunCount] = percent_time_in_function;
+				sUpdateGeometryElapsedTimeOff += sUpdateGeometryElapsedTime;
+				sUpdateGeometryCalls = 0;
+			}
+			else
+			{
+				// Second set of run data is with vectorization on.
+				sUpdateGeometryCallPointer = false;
+				llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+					<< "VEC on " << percent_time_in_function
+					<< "% of time with "
+					<< (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+					<< " seconds per call "
+					<< llendl;
+				sUpdateGeometryRunAvgOn[sUpdateGeometryRunCount] = percent_time_in_function ;
+				sUpdateGeometryElapsedTimeOn += sUpdateGeometryElapsedTime;
+
+				sUpdateGeometryCalls = 0;
+				sUpdateGeometryRunCount++;	// one off/on pair is now complete
+				F64 a = 0.0, b = 0.0;
+				for(U32 i = 0; i<sUpdateGeometryRunCount; i++)
+				{
+					a += sUpdateGeometryRunAvgOff[i];
+					b += sUpdateGeometryRunAvgOn[i];
+				}
+				a /= sUpdateGeometryRunCount;
+				b /= sUpdateGeometryRunCount;
+				F64 perf_boost = ( sUpdateGeometryElapsedTimeOff - sUpdateGeometryElapsedTimeOn ) / sUpdateGeometryElapsedTimeOn;	// relative gain of "on" over "off" for this test so far
+				llinfos << "run averages (" << (F64)sUpdateGeometryRunCount
+					<< "/10) vectorize off " << a
+					<< "% : vectorize type " << gVectorizeProcessor
+					<< " " << b
+					<< "% : performance boost " 
+					<< perf_boost * 100.0
+					<< "%"
+					<< llendl ;
+				if(sUpdateGeometryRunCount == 10)
+				{
+					// In case user runs test again, force reset of data on
+					// next run.
+					sUpdateGeometryGlobalTime = 0.0;
+
+					// We have data now on which version is faster.  Switch to that
+					// code and save the data for next run.
+					gVectorizePerfTest = FALSE;
+					gSavedSettings.setBOOL("VectorizePerfTest", FALSE);
+
+					if (perf_boost > 0.0)
+					{
+						llinfos << "Vectorization improves avatar skinning performance, "
+							<< "keeping on for future runs."
+							<< llendl;
+						gSavedSettings.setBOOL("VectorizeSkin", TRUE);
+					}
+					else
+					{
+						// SIMD decreases performance, fall back to original code
+						llinfos << "Vectorization decreases avatar skinning performance, "
+							<< "switching back to original code."
+							<< llendl;
+
+						gSavedSettings.setBOOL("VectorizeSkin", FALSE);
+					}
+				}
+			}
+			sUpdateGeometryElapsedTime = 0.0;
+		}
+	}
+}
+
 void LLViewerJointMesh::dump()
 {
 	if (mValid)
diff --git a/indra/newview/llviewerjointmesh.h b/indra/newview/llviewerjointmesh.h
index b6fd8afcdb..992c3656a1 100644
--- a/indra/newview/llviewerjointmesh.h
+++ b/indra/newview/llviewerjointmesh.h
@@ -126,6 +126,22 @@ public:
 
 	/*virtual*/ BOOL isAnimatable() { return FALSE; }
 	void writeCAL3D(apr_file_t* fp, S32 material_num, LLCharacter* characterp);
+
+	// Avatar vertex skinning is a significant performance issue on computers
+	// with avatar vertex programs turned off (for example, most Macs).  We
+	// therefore have custom versions that use SIMD instructions.
+	//
+	// These functions require compiler options for SSE2, SSE, or neither, and
+	// hence are contained in separate individual .cpp files.  JC
+	static void updateGeometryOriginal(LLFace* face, LLPolyMesh* mesh);
+	// generic vector code, used for Altivec
+	static void updateGeometryVectorized(LLFace* face, LLPolyMesh* mesh);
+	static void updateGeometrySSE(LLFace* face, LLPolyMesh* mesh);
+	static void updateGeometrySSE2(LLFace* face, LLPolyMesh* mesh);
+
+	// Use a function pointer to indicate which version we are running.
+	static void (*sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);
+
 private:
 	// Allocate skin data
 	BOOL allocateSkinData( U32 numSkinJoints );
diff --git a/indra/newview/llviewerjointmesh_sse.cpp b/indra/newview/llviewerjointmesh_sse.cpp
new file mode 100644
index 0000000000..c4f8ff4fa8
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse.cpp
@@ -0,0 +1,94 @@
+/** 
+ * @file llviewerjointmesh_sse.cpp
+ * @brief SSE build of LLViewerJointMesh::updateGeometrySSE using the LLV4 math classes
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llv4math.h"		// for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "v3math.h"
+
+// *NOTE: SSE must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4	sJointMat[32];	// per-joint matrices, rebuilt on every updateGeometrySSE() call
+// Loads *w into m, then adds j.x*m.mV[VX] + j.y*m.mV[VY] + j.z*m.mV[VZ] to m.mV[VW]. Kept file-local (static) because llviewerjointmesh_sse2.cpp defines the same function under different compiler options.
+static inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{
+	m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);	// unaligned loads: the source LLMatrix4 carries no alignment guarantee here
+	m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+	m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+	m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY]));
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ]));
+}
+
+// static -- skins mesh into face's vertex buffer using joint matrices built with matrix_translate().
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)
+{
+	LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+	// Upload joint pivots/matrices: one sJointMat entry per joint_data entry.
+	for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )
+	{
+		matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix,
+			joint_data[j]->mSkinJoint ?
+				joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+				: joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset);	// no skin joint: use the next entry's parent offset (assumes entry j+1 exists -- TODO confirm)
+	}
+
+	F32					weight		= F32_MAX;	// last weight processed; F32_MAX presumably never equals a real weight, so the first vertex computes a blend
+	LLV4Matrix4			blend_mat;	// blended matrix, reused while consecutive vertices share a weight
+
+	LLStrider<LLVector3> o_vertices;
+	LLStrider<LLVector3> o_normals;
+
+	LLVertexBuffer *buffer = face->mVertexBuffer;
+	buffer->getVertexStrider(o_vertices,  mesh->mFaceVertexOffset);
+	buffer->getNormalStrider(o_normals,   mesh->mFaceVertexOffset);
+
+	const F32*			weights			= mesh->getWeights();
+	const LLVector3*	coords			= mesh->getCoords();
+	const LLVector3*	normals			= mesh->getNormals();
+	for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+	{
+		if( weight != weights[index])
+		{
+			S32 joint = llfloor(weight = weights[index]);	// integer part selects the joint pair
+			blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);	// fractional part is the blend factor passed to lerp
+		}
+		blend_mat.multiply(coords[index], o_vertices[index]);	// presumably writes the skinned position to o_vertices[index]
+		((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);	// presumably the 3x3 part transforms the normal
+	}
+}
+
+#else
+// Built without LL_VECTORIZE: forward to the generic vectorized routine.
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)
+{
+	LLViewerJointMesh::updateGeometryVectorized(face, mesh);
+	return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_sse2.cpp b/indra/newview/llviewerjointmesh_sse2.cpp
new file mode 100644
index 0000000000..cae602ac14
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse2.cpp
@@ -0,0 +1,96 @@
+/** 
+ * @file llviewerjointmesh_sse2.cpp
+ * @brief SSE2 build of LLViewerJointMesh::updateGeometrySSE2 using the LLV4 math classes
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE2 support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llstrider.h"
+#include "llv4math.h"		// for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "m4math.h"
+#include "v3math.h"
+
+// *NOTE: SSE2 must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4	sJointMat[32];	// per-joint matrices, rebuilt on every updateGeometrySSE2() call
+// Loads *w into m, then adds j.x*m.mV[VX] + j.y*m.mV[VY] + j.z*m.mV[VZ] to m.mV[VW]. Kept file-local (static) because llviewerjointmesh_sse.cpp defines the same function under different compiler options.
+static inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{
+	m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);	// unaligned loads: the source LLMatrix4 carries no alignment guarantee here
+	m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+	m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+	m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY]));
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ]));
+}
+
+// static -- skins mesh into face's vertex buffer using joint matrices built with matrix_translate().
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)
+{
+	LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+	// Upload joint pivots/matrices: one sJointMat entry per joint_data entry.
+	for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )
+	{
+		matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix,
+			joint_data[j]->mSkinJoint ?
+				joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+				: joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset);	// no skin joint: use the next entry's parent offset (assumes entry j+1 exists -- TODO confirm)
+	}
+
+	F32					weight		= F32_MAX;	// last weight processed; F32_MAX presumably never equals a real weight, so the first vertex computes a blend
+	LLV4Matrix4			blend_mat;	// blended matrix, reused while consecutive vertices share a weight
+
+	LLStrider<LLVector3> o_vertices;
+	LLStrider<LLVector3> o_normals;
+
+	LLVertexBuffer *buffer = face->mVertexBuffer;
+	buffer->getVertexStrider(o_vertices,  mesh->mFaceVertexOffset);
+	buffer->getNormalStrider(o_normals,   mesh->mFaceVertexOffset);
+
+	const F32*			weights			= mesh->getWeights();
+	const LLVector3*	coords			= mesh->getCoords();
+	const LLVector3*	normals			= mesh->getNormals();
+	for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+	{
+		if( weight != weights[index])
+		{
+			S32 joint = llfloor(weight = weights[index]);	// integer part selects the joint pair
+			blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);	// fractional part is the blend factor passed to lerp
+		}
+		blend_mat.multiply(coords[index], o_vertices[index]);	// presumably writes the skinned position to o_vertices[index]
+		((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);	// presumably the 3x3 part transforms the normal
+	}
+}
+
+#else
+// Built without LL_VECTORIZE: forward to the generic vectorized routine.
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)
+{
+	LLViewerJointMesh::updateGeometryVectorized(face, mesh);
+	return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_vec.cpp b/indra/newview/llviewerjointmesh_vec.cpp
new file mode 100644
index 0000000000..5b1e080435
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_vec.cpp
@@ -0,0 +1,76 @@
+/** 
+ * @file llviewerjointmesh_vec.cpp
+ * @brief Non-SSE build of LLViewerJointMesh::updateGeometryVectorized using the LLV4 math classes
+ *
+ * Copyright (c) 2001-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+#include "llviewerprecompiledheaders.h"
+
+#include "llviewerjointmesh.h"
+
+#include "llface.h"
+#include "llpolymesh.h"
+#include "llv4math.h"
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+
+// *NOTE: SSE must be disabled for this module
+
+#if LL_VECTORIZE
+#error This module requires vectorization (i.e. SSE) mode to be disabled.
+#endif
+
+static LLV4Matrix4	sJointMat[32];	// per-joint matrices, rebuilt on every updateGeometryVectorized() call
+
+// static -- skins mesh into face's vertex buffer using the LLV4 matrix classes with LL_VECTORIZE off.
+void LLViewerJointMesh::updateGeometryVectorized(LLFace *face, LLPolyMesh *mesh)
+{
+	LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+	S32 j, joint_num, joint_end = joint_data.count();	// j indexes sJointMat, joint_num indexes joint_data
+	LLV4Vector3 pivot;
+
+	// Upload joint pivots/matrices: copy each world matrix and translate it by its pivot.
+	for(j = joint_num = 0; joint_num < joint_end ; ++joint_num )
+	{
+		LLSkinJoint *sj;
+		const LLMatrix4 *	wm = joint_data[joint_num]->mWorldMatrix;
+		if (NULL == (sj = joint_data[joint_num]->mSkinJoint))
+		{
+				sj = joint_data[++joint_num]->mSkinJoint;	// no skin joint here: borrow the next entry's (assumes it exists -- TODO confirm)
+				((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToParentJointSkinOffset, pivot);	// this entry's matrix with the parent-joint offset
+				sJointMat[j++].translate(pivot);
+				wm = joint_data[joint_num]->mWorldMatrix;	// continue below with the next entry's own matrix
+		}
+		((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToJointSkinOffset, pivot);
+		sJointMat[j++].translate(pivot);
+	}
+
+	F32					weight		= F32_MAX;	// last weight processed; F32_MAX presumably never equals a real weight, so the first vertex computes a blend
+	LLV4Matrix4			blend_mat;	// blended matrix, reused while consecutive vertices share a weight
+
+	LLStrider<LLVector3> o_vertices;
+	LLStrider<LLVector3> o_normals;
+
+	LLVertexBuffer *buffer = face->mVertexBuffer;
+	buffer->getVertexStrider(o_vertices,  mesh->mFaceVertexOffset);
+	buffer->getNormalStrider(o_normals,   mesh->mFaceVertexOffset);
+
+	const F32*			weights			= mesh->getWeights();
+	const LLVector3*	coords			= mesh->getCoords();
+	const LLVector3*	normals			= mesh->getNormals();
+	for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+	{
+		if( weight != weights[index])
+		{
+			S32 joint = llfloor(weight = weights[index]);	// integer part selects the joint pair
+			blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);	// fractional part is the blend factor passed to lerp
+		}
+		blend_mat.multiply(coords[index], o_vertices[index]);	// presumably writes the skinned position to o_vertices[index]
+		((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);	// presumably the 3x3 part transforms the normal
+	}
+}
diff --git a/indra/newview/llviewermenu.cpp b/indra/newview/llviewermenu.cpp
index d18859e356..7ad4f1d70b 100644
--- a/indra/newview/llviewermenu.cpp
+++ b/indra/newview/llviewermenu.cpp
@@ -960,6 +960,7 @@ extern BOOL gDebugClicks;
 extern BOOL gDebugWindowProc;
 extern BOOL gDebugTextEditorTips;
 extern BOOL gDebugSelectMgr;
+extern BOOL gVectorizePerfTest;
 
 void init_debug_ui_menu(LLMenuGL* menu)
 {
@@ -1169,6 +1170,8 @@ void init_debug_rendering_menu(LLMenuGL* menu)
 										   (void*)"ShowDepthBuffer"));
 	sub_menu->append(new LLMenuItemToggleGL("Show Select Buffer", &gDebugSelect));
 
+	sub_menu->append(new LLMenuItemToggleGL("Vectorize Perf Test", &gVectorizePerfTest));
+
 	sub_menu = new LLMenuGL("Render Tests");
 
 	sub_menu->append(new LLMenuItemCheckGL("Camera Offset",
author	James Cook <james@lindenlab.com>	2007-07-02 23:52:40 +0000
committer	James Cook <james@lindenlab.com>	2007-07-02 23:52:40 +0000
commit	1a33bc19b4ce94ab210749911dff14409b4454dd (patch)
tree	b674d97d37240a29c0a6671adfe950a506ef0ea4 /indra/newview
parent	e5124431b54d4342d4677371fccca5bc7250c079 (diff)