summaryrefslogtreecommitdiff
path: root/indra/newview
diff options
context:
space:
mode:
authorJames Cook <james@lindenlab.com>2007-07-02 23:52:40 +0000
committerJames Cook <james@lindenlab.com>2007-07-02 23:52:40 +0000
commit1a33bc19b4ce94ab210749911dff14409b4454dd (patch)
treeb674d97d37240a29c0a6671adfe950a506ef0ea4 /indra/newview
parente5124431b54d4342d4677371fccca5bc7250c079 (diff)
svn merge -r 62595:62596 and 62598:63308 sse-skinning-3 for faster software avatar rendering. Visual Studio 2005 project file fixed pending.
Diffstat (limited to 'indra/newview')
-rw-r--r--indra/newview/lldrawable.h1
-rw-r--r--indra/newview/llviewerjointmesh.cpp210
-rw-r--r--indra/newview/llviewerjointmesh.h16
-rw-r--r--indra/newview/llviewerjointmesh_sse.cpp94
-rw-r--r--indra/newview/llviewerjointmesh_sse2.cpp96
-rw-r--r--indra/newview/llviewerjointmesh_vec.cpp76
-rw-r--r--indra/newview/llviewermenu.cpp3
7 files changed, 467 insertions, 29 deletions
diff --git a/indra/newview/lldrawable.h b/indra/newview/lldrawable.h
index 48c58dbb4c..328a116f59 100644
--- a/indra/newview/lldrawable.h
+++ b/indra/newview/lldrawable.h
@@ -26,6 +26,7 @@
#include "llviewerobject.h"
#include "llrect.h"
+class LLCamera;
class LLDrawPool;
class LLDrawable;
class LLFace;
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index 150943465d..ba4c7e1b20 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -19,6 +19,7 @@
#include "llfasttimer.h"
#include "llagent.h"
+#include "llapr.h"
#include "llbox.h"
#include "lldrawable.h"
#include "lldrawpoolavatar.h"
@@ -29,12 +30,18 @@
#include "llglheaders.h"
#include "lltexlayer.h"
#include "llviewercamera.h"
+#include "llviewercontrol.h"
#include "llviewerimagelist.h"
#include "llviewerjointmesh.h"
#include "llvoavatar.h"
#include "llsky.h"
#include "pipeline.h"
#include "llglslshader.h"
+#include "llmath.h"
+#include "v4math.h"
+#include "m3math.h"
+#include "m4math.h"
+
#if !LL_DARWIN && !LL_LINUX
extern PFNGLWEIGHTPOINTERARBPROC glWeightPointerARB;
@@ -48,6 +55,7 @@ static const U32 sRenderMask = LLVertexBuffer::MAP_VERTEX |
LLVertexBuffer::MAP_NORMAL |
LLVertexBuffer::MAP_TEXCOORD;
+
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// LLViewerJointMesh::LLSkinJoint
@@ -100,6 +108,7 @@ BOOL LLSkinJoint::setupSkinJoint( LLViewerJoint *joint)
return TRUE;
}
+
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// LLViewerJointMesh
@@ -394,9 +403,9 @@ const S32 NUM_AXES = 3;
// rotation Z 0-n
// pivot parent 0-n -- child = n+1
-static LLMatrix4 gJointMat[32];
-static LLMatrix3 gJointRot[32];
-static LLVector4 gJointPivot[32];
+static LLMatrix4 gJointMatUnaligned[32];
+static LLMatrix3 gJointRotUnaligned[32];
+static LLVector4 gJointPivot[32];
//-----------------------------------------------------------------------------
// uploadJointMatrices()
@@ -417,8 +426,8 @@ void LLViewerJointMesh::uploadJointMatrices()
{
joint_mat *= LLDrawPoolAvatar::getModelView();
}
- gJointMat[joint_num] = joint_mat;
- gJointRot[joint_num] = joint_mat.getMat3();
+ gJointMatUnaligned[joint_num] = joint_mat;
+ gJointRotUnaligned[joint_num] = joint_mat.getMat3();
}
BOOL last_pivot_uploaded = FALSE;
@@ -455,8 +464,8 @@ void LLViewerJointMesh::uploadJointMatrices()
{
LLVector3 pivot;
pivot = LLVector3(gJointPivot[i]);
- pivot = pivot * gJointRot[i];
- gJointMat[i].translate(pivot);
+ pivot = pivot * gJointRotUnaligned[i];
+ gJointMatUnaligned[i].translate(pivot);
}
// upload matrices
@@ -467,11 +476,11 @@ void LLViewerJointMesh::uploadJointMatrices()
for (joint_num = 0; joint_num < reference_mesh->mJointRenderData.count(); joint_num++)
{
- gJointMat[joint_num].transpose();
+ gJointMatUnaligned[joint_num].transpose();
for (S32 axis = 0; axis < NUM_AXES; axis++)
{
- F32* vector = gJointMat[joint_num].mMatrix[axis];
+ F32* vector = gJointMatUnaligned[joint_num].mMatrix[axis];
//glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, LL_CHARACTER_MAX_JOINTS_PER_MESH * axis + joint_num+5, (GLfloat*)vector);
U32 offset = LL_CHARACTER_MAX_JOINTS_PER_MESH*axis+joint_num;
memcpy(mat+offset*4, vector, sizeof(GLfloat)*4);
@@ -883,21 +892,9 @@ BOOL LLViewerJointMesh::updateLOD(F32 pixel_area, BOOL activate)
return (valid != activate);
}
-
-void LLViewerJointMesh::updateGeometry()
+// static
+void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
{
- if (!(mValid
- && mMesh
- && mFace
- && mMesh->hasWeights()
- && mFace->mVertexBuffer.notNull()
- && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
- {
- return;
- }
-
- uploadJointMatrices();
-
LLStrider<LLVector3> o_vertices;
LLStrider<LLVector3> o_normals;
@@ -938,9 +935,9 @@ void LLViewerJointMesh::updateGeometry()
// No lerp required in this case.
if (w == 1.0f)
{
- gBlendMat = gJointMat[joint+1];
+ gBlendMat = gJointMatUnaligned[joint+1];
o_vertices[bidx] = coords[index] * gBlendMat;
- gBlendRotMat = gJointRot[joint+1];
+ gBlendRotMat = gJointRotUnaligned[joint+1];
o_normals[bidx] = normals[index] * gBlendRotMat;
continue;
}
@@ -948,8 +945,8 @@ void LLViewerJointMesh::updateGeometry()
// Try to keep all the accesses to the matrix data as close
// together as possible. This function is a hot spot on the
// Mac. JC
- LLMatrix4 &m0 = gJointMat[joint+1];
- LLMatrix4 &m1 = gJointMat[joint+0];
+ LLMatrix4 &m0 = gJointMatUnaligned[joint+1];
+ LLMatrix4 &m1 = gJointMatUnaligned[joint+0];
gBlendMat.mMatrix[VX][VX] = lerp(m1.mMatrix[VX][VX], m0.mMatrix[VX][VX], w);
gBlendMat.mMatrix[VX][VY] = lerp(m1.mMatrix[VX][VY], m0.mMatrix[VX][VY], w);
@@ -969,8 +966,8 @@ void LLViewerJointMesh::updateGeometry()
o_vertices[bidx] = coords[index] * gBlendMat;
- LLMatrix3 &n0 = gJointRot[joint+1];
- LLMatrix3 &n1 = gJointRot[joint+0];
+ LLMatrix3 &n0 = gJointRotUnaligned[joint+1];
+ LLMatrix3 &n1 = gJointRotUnaligned[joint+0];
gBlendRotMat.mMatrix[VX][VX] = lerp(n1.mMatrix[VX][VX], n0.mMatrix[VX][VX], w);
gBlendRotMat.mMatrix[VX][VY] = lerp(n1.mMatrix[VX][VY], n0.mMatrix[VX][VY], w);
@@ -988,6 +985,161 @@ void LLViewerJointMesh::updateGeometry()
}
}
+const U32 UPDATE_GEOMETRY_CALL_MASK = 0x1FFF; // 8K samples before overflow
+const U32 UPDATE_GEOMETRY_CALL_OVERFLOW = ~UPDATE_GEOMETRY_CALL_MASK;
+static bool sUpdateGeometryCallPointer = false;
+static F64 sUpdateGeometryGlobalTime = 0.0 ;
+static F64 sUpdateGeometryElapsedTime = 0.0 ;
+static F64 sUpdateGeometryElapsedTimeOff = 0.0 ;
+static F64 sUpdateGeometryElapsedTimeOn = 0.0 ;
+static F64 sUpdateGeometryRunAvgOff[10];
+static F64 sUpdateGeometryRunAvgOn[10];
+static U32 sUpdateGeometryRunCount = 0 ;
+static U32 sUpdateGeometryCalls = 0 ;
+static U32 sUpdateGeometryLastProcessor = 0 ;
+void (*LLViewerJointMesh::sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);
+// Re-skins this mesh on the CPU: normally just runs sUpdateGeometryFunc; while gVectorizePerfTest is set it alternately times the original and accelerated paths, reports via llinfos, and after 10 runs stores the verdict in gSavedSettings.
+void LLViewerJointMesh::updateGeometry()
+{
+ extern BOOL gVectorizePerfTest;
+ extern U32 gVectorizeProcessor;
+ // Nothing to do unless the mesh is valid, has weights and a vertex buffer, and the avatar vertex shader level is 0.
+ if (!(mValid
+ && mMesh
+ && mFace
+ && mMesh->hasWeights()
+ && mFace->mVertexBuffer.notNull()
+ && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
+ {
+ return;
+ }
+
+ if (!gVectorizePerfTest)
+ {
+ // Once we've measured performance, just run the specified
+ // code version.
+ if(sUpdateGeometryFunc == updateGeometryOriginal) // the original path reads the globals that uploadJointMatrices() fills
+ uploadJointMatrices();
+ sUpdateGeometryFunc(mFace, mMesh); // NOTE(review): assumes the pointer was assigned elsewhere before the first call - confirm
+ }
+ else
+ {
+ // At startup, measure the amount of time in skinning and choose
+ // the fastest one.
+ LLTimer ug_timer ;
+ // sUpdateGeometryCallPointer selects which path the current batch of calls measures.
+ if (sUpdateGeometryCallPointer)
+ {
+ if(sUpdateGeometryFunc == updateGeometryOriginal)
+ uploadJointMatrices();
+ // call accelerated version for this processor
+ sUpdateGeometryFunc(mFace, mMesh);
+ }
+ else
+ {
+ uploadJointMatrices();
+ updateGeometryOriginal(mFace, mMesh);
+ }
+
+ sUpdateGeometryElapsedTime += ug_timer.getElapsedTimeF64();
+ ++sUpdateGeometryCalls;
+ if(0 != (sUpdateGeometryCalls & UPDATE_GEOMETRY_CALL_OVERFLOW)) // true once the count reaches 0x2000 calls (a bit above the 0x1FFF mask is set)
+ {
+ F64 time_since_app_start = ug_timer.getElapsedSeconds(); // NOTE(review): read through the local timer - confirm getElapsedSeconds() is a global clock, as the name implies
+ if(sUpdateGeometryGlobalTime == 0.0
+ || sUpdateGeometryLastProcessor != gVectorizeProcessor)
+ { // no baseline timestamp yet, or the processor type changed: restart the measurement with the original path
+ sUpdateGeometryGlobalTime = time_since_app_start;
+ sUpdateGeometryElapsedTime = 0;
+ sUpdateGeometryCalls = 0;
+ sUpdateGeometryRunCount = 0;
+ sUpdateGeometryLastProcessor = gVectorizeProcessor;
+ sUpdateGeometryCallPointer = false; // NOTE(review): sUpdateGeometryElapsedTimeOff/On are not cleared in this reset, so perf_boost keeps totals from earlier tests - confirm intended
+ return;
+ }
+ F64 percent_time_in_function =
+ ( sUpdateGeometryElapsedTime * 100.0 ) / ( time_since_app_start - sUpdateGeometryGlobalTime ) ; // share of the time since the previous batch boundary spent skinning
+ sUpdateGeometryGlobalTime = time_since_app_start;
+ if (!sUpdateGeometryCallPointer)
+ {
+ // First set of run data is with vectorization off.
+ sUpdateGeometryCallPointer = true;
+ llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+ << "vectorize off " << percent_time_in_function
+ << "% of time with "
+ << (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+ << " seconds per call "
+ << llendl;
+ sUpdateGeometryRunAvgOff[sUpdateGeometryRunCount] = percent_time_in_function;
+ sUpdateGeometryElapsedTimeOff += sUpdateGeometryElapsedTime;
+ sUpdateGeometryCalls = 0;
+ }
+ else
+ {
+ // Second set of run data is with vectorization on.
+ sUpdateGeometryCallPointer = false;
+ llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+ << "VEC on " << percent_time_in_function
+ << "% of time with "
+ << (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+ << " seconds per call "
+ << llendl;
+ sUpdateGeometryRunAvgOn[sUpdateGeometryRunCount] = percent_time_in_function ;
+ sUpdateGeometryElapsedTimeOn += sUpdateGeometryElapsedTime;
+ // An off/on pair is now complete: count the run and average every completed run so far.
+ sUpdateGeometryCalls = 0;
+ sUpdateGeometryRunCount++;
+ F64 a = 0.0, b = 0.0; // a: mean percent with vectorization off, b: mean percent with it on
+ for(U32 i = 0; i<sUpdateGeometryRunCount; i++)
+ {
+ a += sUpdateGeometryRunAvgOff[i];
+ b += sUpdateGeometryRunAvgOn[i];
+ }
+ a /= sUpdateGeometryRunCount;
+ b /= sUpdateGeometryRunCount;
+ F64 perf_boost = ( sUpdateGeometryElapsedTimeOff - sUpdateGeometryElapsedTimeOn ) / sUpdateGeometryElapsedTimeOn; // relative time saved; positive when the accelerated path took less total time. NOTE(review): divides by zero if the "on" total is 0
+ llinfos << "run averages (" << (F64)sUpdateGeometryRunCount
+ << "/10) vectorize off " << a
+ << "% : vectorize type " << gVectorizeProcessor
+ << " " << b
+ << "% : performance boost "
+ << perf_boost * 100.0
+ << "%"
+ << llendl ;
+ if(sUpdateGeometryRunCount == 10) // 10 is also the capacity of the two run-average arrays
+ {
+ // In case user runs test again, force reset of data on
+ // next run.
+ sUpdateGeometryGlobalTime = 0.0;
+
+ // We have data now on which version is faster. Switch to that
+ // code and save the data for next run.
+ gVectorizePerfTest = FALSE;
+ gSavedSettings.setBOOL("VectorizePerfTest", FALSE);
+
+ if (perf_boost > 0.0)
+ {
+ llinfos << "Vectorization improves avatar skinning performance, "
+ << "keeping on for future runs."
+ << llendl;
+ gSavedSettings.setBOOL("VectorizeSkin", TRUE);
+ }
+ else
+ {
+ // SIMD decreases performance, fall back to original code
+ llinfos << "Vectorization decreases avatar skinning performance, "
+ << "switching back to original code."
+ << llendl;
+
+ gSavedSettings.setBOOL("VectorizeSkin", FALSE);
+ }
+ }
+ }
+ sUpdateGeometryElapsedTime = 0.0f; // start the next batch with a clean per-batch total
+ }
+ }
+}
+
void LLViewerJointMesh::dump()
{
if (mValid)
diff --git a/indra/newview/llviewerjointmesh.h b/indra/newview/llviewerjointmesh.h
index b6fd8afcdb..992c3656a1 100644
--- a/indra/newview/llviewerjointmesh.h
+++ b/indra/newview/llviewerjointmesh.h
@@ -126,6 +126,22 @@ public:
/*virtual*/ BOOL isAnimatable() { return FALSE; }
void writeCAL3D(apr_file_t* fp, S32 material_num, LLCharacter* characterp);
+
+ // Avatar vertex skinning is a significant performance issue on computers
+ // with avatar vertex programs turned off (for example, most Macs). We
+ // therefore have custom versions that use SIMD instructions.
+ //
+ // These functions require compiler options for SSE2, SSE, or neither, and
+ // hence are contained in separate individual .cpp files. JC
+ static void updateGeometryOriginal(LLFace* face, LLPolyMesh* mesh);
+ // generic vector code, used for Altivec
+ static void updateGeometryVectorized(LLFace* face, LLPolyMesh* mesh);
+ static void updateGeometrySSE(LLFace* face, LLPolyMesh* mesh);
+ static void updateGeometrySSE2(LLFace* face, LLPolyMesh* mesh);
+
+ // Use a function pointer to indicate which version we are running.
+ static void (*sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);
+
private:
// Allocate skin data
BOOL allocateSkinData( U32 numSkinJoints );
diff --git a/indra/newview/llviewerjointmesh_sse.cpp b/indra/newview/llviewerjointmesh_sse.cpp
new file mode 100644
index 0000000000..c4f8ff4fa8
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse.cpp
@@ -0,0 +1,94 @@
+/**
+ * @file llviewerjointmesh_sse.cpp
+ * @brief LLV4 class implementation with LLViewerJointMesh class
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llv4math.h" // for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "v3math.h"
+
+// *NOTE: SSE must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4 sJointMat[32];
+// Copies the four rows of *w into m using unaligned loads, then adds j.x*row X + j.y*row Y + j.z*row Z to row W.
+inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{
+ m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);
+ m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+ m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+ m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY])); // ( ay * vy ) accumulated into vw
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ])); // ( az * vz ) accumulated into vw
+}
+// SSE skinning path: rebuilds sJointMat from the reference mesh's joint render data, then writes a blended position and normal for every vertex into the face's vertex buffer.
+// static
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)
+{
+ LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+ // Build one matrix per joint-render entry: its world matrix with the skin offset folded into row W.
+ for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )
+ {
+ matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix, // NOTE(review): sJointMat has 32 slots and count() is not checked against that - confirm the upper bound
+ joint_data[j]->mSkinJoint ?
+ joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+ : joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset); // NOTE(review): reads entry j+1 when mSkinJoint is null - assumes such an entry is never the last one; confirm
+ }
+
+ F32 weight = F32_MAX; // sentinel start value, so the first comparison in the loop is expected to differ and build blend_mat
+ LLV4Matrix4 blend_mat;
+
+ LLStrider<LLVector3> o_vertices;
+ LLStrider<LLVector3> o_normals;
+
+ LLVertexBuffer *buffer = face->mVertexBuffer;
+ buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset);
+ buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset);
+
+ const F32* weights = mesh->getWeights();
+ const LLVector3* coords = mesh->getCoords();
+ const LLVector3* normals = mesh->getNormals();
+ for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+ {
+ if( weight != weights[index]) // blend_mat is reused while consecutive vertices share the same weight
+ {
+ S32 joint = llfloor(weight = weights[index]); // integer part of the weight selects the joint pair
+ blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint); // fractional part is the blend factor
+ }
+ blend_mat.multiply(coords[index], o_vertices[index]);
+ ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]); // normals go through the LLV4Matrix3 conversion of blend_mat (presumably its 3x3 part - confirm)
+ }
+}
+
+#else
+// LL_VECTORIZE is off for this build: forward to updateGeometryVectorized so the SSE entry point still exists.
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)
+{
+ LLViewerJointMesh::updateGeometryVectorized(face, mesh);
+ return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_sse2.cpp b/indra/newview/llviewerjointmesh_sse2.cpp
new file mode 100644
index 0000000000..cae602ac14
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse2.cpp
@@ -0,0 +1,96 @@
+/**
+ * @file llviewerjointmesh_sse2.cpp
+ * @brief LLV4 class implementation with LLViewerJointMesh class
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llstrider.h"
+#include "llv4math.h" // for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "m4math.h"
+#include "v3math.h"
+
+// *NOTE: SSE2 must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4 sJointMat[32];
+// Copies the four rows of *w into m using unaligned loads, then adds j.x*row X + j.y*row Y + j.z*row Z to row W.
+inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{
+ m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);
+ m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+ m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+ m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY])); // ( ay * vy ) accumulated into vw
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ])); // ( az * vz ) accumulated into vw
+}
+// SSE2 skinning path: rebuilds sJointMat from the reference mesh's joint render data, then writes a blended position and normal for every vertex into the face's vertex buffer.
+// static
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)
+{
+ LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+ // Build one matrix per joint-render entry: its world matrix with the skin offset folded into row W.
+ for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )
+ {
+ matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix, // NOTE(review): sJointMat has 32 slots and count() is not checked against that - confirm the upper bound
+ joint_data[j]->mSkinJoint ?
+ joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+ : joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset); // NOTE(review): reads entry j+1 when mSkinJoint is null - assumes such an entry is never the last one; confirm
+ }
+
+ F32 weight = F32_MAX; // sentinel start value, so the first comparison in the loop is expected to differ and build blend_mat
+ LLV4Matrix4 blend_mat;
+
+ LLStrider<LLVector3> o_vertices;
+ LLStrider<LLVector3> o_normals;
+
+ LLVertexBuffer *buffer = face->mVertexBuffer;
+ buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset);
+ buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset);
+
+ const F32* weights = mesh->getWeights();
+ const LLVector3* coords = mesh->getCoords();
+ const LLVector3* normals = mesh->getNormals();
+ for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+ {
+ if( weight != weights[index]) // blend_mat is reused while consecutive vertices share the same weight
+ {
+ S32 joint = llfloor(weight = weights[index]); // integer part of the weight selects the joint pair
+ blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint); // fractional part is the blend factor
+ }
+ blend_mat.multiply(coords[index], o_vertices[index]);
+ ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]); // normals go through the LLV4Matrix3 conversion of blend_mat (presumably its 3x3 part - confirm)
+ }
+}
+
+#else
+// LL_VECTORIZE is off for this build: forward to updateGeometryVectorized so the SSE2 entry point still exists.
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)
+{
+ LLViewerJointMesh::updateGeometryVectorized(face, mesh);
+ return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_vec.cpp b/indra/newview/llviewerjointmesh_vec.cpp
new file mode 100644
index 0000000000..5b1e080435
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_vec.cpp
@@ -0,0 +1,76 @@
+/**
+ * @file llviewerjointmesh_vec.cpp
+ * @brief LLV4 math class implementation with LLViewerJointMesh class
+ *
+ * Copyright (c) 2001-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+#include "llviewerprecompiledheaders.h"
+
+#include "llviewerjointmesh.h"
+
+#include "llface.h"
+#include "llpolymesh.h"
+#include "llv4math.h"
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+
+// *NOTE: SSE must be disabled for this module
+
+#if LL_VECTORIZE
+#error This module requires vectorization (i.e. SSE) mode to be disabled.
+#endif
+
+static LLV4Matrix4 sJointMat[32];
+// Non-SSE LLV4 skinning path: rebuilds sJointMat from the reference mesh's joint render data, then writes a blended position and normal for every vertex into the face's vertex buffer.
+// static
+void LLViewerJointMesh::updateGeometryVectorized(LLFace *face, LLPolyMesh *mesh)
+{
+ LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+ S32 j, joint_num, joint_end = joint_data.count(); // j: next sJointMat slot to fill; joint_num: index into joint_data
+ LLV4Vector3 pivot;
+
+ // Build the per-joint matrices: each world matrix translated by its skin offset (offset first transformed by the matrix's LLV4Matrix3 conversion).
+ for(j = joint_num = 0; joint_num < joint_end ; ++joint_num )
+ {
+ LLSkinJoint *sj;
+ const LLMatrix4 * wm = joint_data[joint_num]->mWorldMatrix;
+ if (NULL == (sj = joint_data[joint_num]->mSkinJoint)) // entry without a skin joint: borrow the next entry's skin joint and emit two matrices this iteration
+ {
+ sj = joint_data[++joint_num]->mSkinJoint; // NOTE(review): advances past the current entry without re-checking joint_end - assumes a null entry is never last; confirm
+ ((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToParentJointSkinOffset, pivot);
+ sJointMat[j++].translate(pivot);
+ wm = joint_data[joint_num]->mWorldMatrix;
+ }
+ ((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToJointSkinOffset, pivot); // stores *wm in the slot, then computes the pivot from the skin offset
+ sJointMat[j++].translate(pivot); // NOTE(review): sJointMat has 32 slots and j is not bounds-checked - confirm the upper bound
+ }
+
+ F32 weight = F32_MAX; // sentinel start value, so the first comparison in the loop is expected to differ and build blend_mat
+ LLV4Matrix4 blend_mat;
+
+ LLStrider<LLVector3> o_vertices;
+ LLStrider<LLVector3> o_normals;
+
+ LLVertexBuffer *buffer = face->mVertexBuffer;
+ buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset);
+ buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset);
+
+ const F32* weights = mesh->getWeights();
+ const LLVector3* coords = mesh->getCoords();
+ const LLVector3* normals = mesh->getNormals();
+ for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+ {
+ if( weight != weights[index]) // blend_mat is reused while consecutive vertices share the same weight
+ {
+ S32 joint = llfloor(weight = weights[index]); // integer part of the weight selects the joint pair
+ blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint); // fractional part is the blend factor
+ }
+ blend_mat.multiply(coords[index], o_vertices[index]);
+ ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]); // normals go through the LLV4Matrix3 conversion of blend_mat (presumably its 3x3 part - confirm)
+ }
+}
diff --git a/indra/newview/llviewermenu.cpp b/indra/newview/llviewermenu.cpp
index d18859e356..7ad4f1d70b 100644
--- a/indra/newview/llviewermenu.cpp
+++ b/indra/newview/llviewermenu.cpp
@@ -960,6 +960,7 @@ extern BOOL gDebugClicks;
extern BOOL gDebugWindowProc;
extern BOOL gDebugTextEditorTips;
extern BOOL gDebugSelectMgr;
+extern BOOL gVectorizePerfTest;
void init_debug_ui_menu(LLMenuGL* menu)
{
@@ -1169,6 +1170,8 @@ void init_debug_rendering_menu(LLMenuGL* menu)
(void*)"ShowDepthBuffer"));
sub_menu->append(new LLMenuItemToggleGL("Show Select Buffer", &gDebugSelect));
+ sub_menu->append(new LLMenuItemToggleGL("Vectorize Perf Test", &gVectorizePerfTest));
+
sub_menu = new LLMenuGL("Render Tests");
sub_menu->append(new LLMenuItemCheckGL("Camera Offset",