From 1a33bc19b4ce94ab210749911dff14409b4454dd Mon Sep 17 00:00:00 2001
From: James Cook <james@lindenlab.com>
Date: Mon, 2 Jul 2007 23:52:40 +0000
Subject: svn merge -r 62595:62596 and 62598:63308 sse-skinning-3 for faster
 software avatar rendering.  Visual Studio 2005 project file fixed pending.

---
 indra/llcommon/llpreprocessor.h          |  22 ++-
 indra/llcommon/llprocessor.cpp           |   8 +-
 indra/llcommon/llprocessor.h             |   1 +
 indra/llcommon/llskiplist.h              |  12 +-
 indra/llcommon/llsys.cpp                 |  25 +++-
 indra/llcommon/llsys.h                   |  12 +-
 indra/llmath/llv4math.h                  | 101 ++++++++++++++
 indra/llmath/llv4matrix3.h               | 202 +++++++++++++++++++++++++++
 indra/llmath/llv4matrix4.h               | 231 +++++++++++++++++++++++++++++++
 indra/llmath/llv4vector3.h               |  62 +++++++++
 indra/newview/lldrawable.h               |   1 +
 indra/newview/llviewerjointmesh.cpp      | 210 ++++++++++++++++++++++++----
 indra/newview/llviewerjointmesh.h        |  16 +++
 indra/newview/llviewerjointmesh_sse.cpp  |  94 +++++++++++++
 indra/newview/llviewerjointmesh_sse2.cpp |  96 +++++++++++++
 indra/newview/llviewerjointmesh_vec.cpp  |  76 ++++++++++
 indra/newview/llviewermenu.cpp           |   3 +
 17 files changed, 1120 insertions(+), 52 deletions(-)
 create mode 100644 indra/llmath/llv4math.h
 create mode 100644 indra/llmath/llv4matrix3.h
 create mode 100644 indra/llmath/llv4matrix4.h
 create mode 100644 indra/llmath/llv4vector3.h
 create mode 100644 indra/newview/llviewerjointmesh_sse.cpp
 create mode 100644 indra/newview/llviewerjointmesh_sse2.cpp
 create mode 100644 indra/newview/llviewerjointmesh_vec.cpp

(limited to 'indra')

diff --git a/indra/llcommon/llpreprocessor.h b/indra/llcommon/llpreprocessor.h
index 0882472242..4389fd3e30 100644
--- a/indra/llcommon/llpreprocessor.h
+++ b/indra/llcommon/llpreprocessor.h
@@ -51,12 +51,22 @@
 	#define MOZILLA_INTERNAL_API 1
 #endif
 
-// Deal with minor differences on Unixy OSes.
-#if LL_DARWIN || LL_LINUX
+// Figure out differences between compilers
+#if defined(__GNUC__)
 	#define GCC_VERSION (__GNUC__ * 10000 \
 						+ __GNUC_MINOR__ * 100 \
 						+ __GNUC_PATCHLEVEL__)
+	#ifndef LL_GNUC
+		#define LL_GNUC 1
+	#endif
+#elif defined(__MSVC_VER__) || defined(_MSC_VER)
+	#ifndef LL_MSVC
+		#define LL_MSVC 1
+	#endif
+#endif
 
+// Deal with minor differences on Unixy OSes.
+#if LL_DARWIN || LL_LINUX
 	// Different name, same functionality.
 	#define stricmp strcasecmp
 	#define strnicmp strncasecmp
@@ -69,9 +79,9 @@
 #endif
 
 // Deal with the differeneces on Windows
-#if LL_WINDOWS
+#if LL_MSVC
 #define snprintf safe_snprintf		/* Flawfinder: ignore */
-#endif	// LL_WINDOWS
+#endif	// LL_MSVC
 
 // Static linking with apr on windows needs to be declared.
 #ifdef LL_WINDOWS
@@ -90,7 +100,7 @@
 
 
 // Deal with VC6 problems
-#if defined(LL_WINDOWS)
+#if LL_MSVC
 #pragma warning( 3	     : 4701 )	// "local variable used without being initialized"  Treat this as level 3, not level 4.
 #pragma warning( 3	     : 4702 )	// "unreachable code"  Treat this as level 3, not level 4.
 #pragma warning( 3	     : 4189 )	// "local variable initialized but not referenced"  Treat this as level 3, not level 4.
@@ -101,6 +111,6 @@
 #pragma warning( disable : 4503 )	// 'decorated name length exceeded, name was truncated'. Does not seem to affect compilation.
 #pragma warning( disable : 4800 )	// 'BOOL' : forcing value to bool 'true' or 'false' (performance warning)
 #pragma warning( disable : 4996 )	// warning: deprecated
-#endif	//	LL_WINDOWS
+#endif	//	LL_MSVC
 
 #endif	//	not LL_LINDEN_PREPROCESSOR_H
diff --git a/indra/llcommon/llprocessor.cpp b/indra/llcommon/llprocessor.cpp
index 00f4a13c39..bcabb47a66 100644
--- a/indra/llcommon/llprocessor.cpp
+++ b/indra/llcommon/llprocessor.cpp
@@ -1518,6 +1518,7 @@ void CProcessor::GetStandardProcessorExtensions()
 	CPUInfo._Ext.FXSR_FastStreamingSIMD_ExtensionsSaveRestore	= CheckBit(edxreg, 24);
 	CPUInfo._Ext.SSE_StreamingSIMD_Extensions					= CheckBit(edxreg, 25);
 	CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions					= CheckBit(edxreg, 26);
+	CPUInfo._Ext.Altivec_Extensions = false;
 	CPUInfo._Ext.SS_SelfSnoop									= CheckBit(edxreg, 27);
 	CPUInfo._Ext.HT_HyperThreading								= CheckBit(edxreg, 28);
 	CPUInfo._Ext.HT_HyterThreadingSiblings = (ebxreg >> 16) & 0xFF;
@@ -1871,11 +1872,12 @@ const ProcessorInfo *CProcessor::GetCPUInfo()
 		break;
 	}
 
-	// It's kinda like MMX or SSE...
 	CPUInfo._Ext.EMMX_MultimediaExtensions = 
 	CPUInfo._Ext.MMX_MultimediaExtensions = 
 	CPUInfo._Ext.SSE_StreamingSIMD_Extensions =
-	CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = hasFeature("hw.optional.altivec");
+	CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = false;
+
+	CPUInfo._Ext.Altivec_Extensions = hasFeature("hw.optional.altivec");
 
 #endif
 
@@ -1892,6 +1894,7 @@ const ProcessorInfo *CProcessor::GetCPUInfo()
 	CPUInfo._Ext.MMX_MultimediaExtensions = hasFeature("hw.optional.mmx");
 	CPUInfo._Ext.SSE_StreamingSIMD_Extensions = hasFeature("hw.optional.sse");
 	CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = hasFeature("hw.optional.sse2");
+	CPUInfo._Ext.Altivec_Extensions = false;
 	CPUInfo._Ext.AA64_AMD64BitArchitecture = hasFeature("hw.optional.x86_64");
 
 #endif
@@ -2045,6 +2048,7 @@ bool CProcessor::CPUInfoToText(char *strBuffer, unsigned int uiMaxLen)
 	BOOLADD("SS     Self Snoop:                                 ", CPUInfo._Ext.SS_SelfSnoop);
 	BOOLADD("SSE    Streaming SIMD Extensions:                  ", CPUInfo._Ext.SSE_StreamingSIMD_Extensions);
 	BOOLADD("SSE2   Streaming SIMD 2 Extensions:                ", CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions);
+	BOOLADD("ALTVEC Altivec Extensions:                         ", CPUInfo._Ext.Altivec_Extensions);
 	BOOLADD("TM     Thermal Monitor:                            ", CPUInfo._Ext.TM_ThermalMonitor);
 	BOOLADD("TSC    Time Stamp Counter:                         ", CPUInfo._Ext.TSC_TimeStampCounter);
 	BOOLADD("VME    Virtual 8086 Mode Enhancements:             ", CPUInfo._Ext.VME_Virtual8086ModeEnhancements);
diff --git a/indra/llcommon/llprocessor.h b/indra/llcommon/llprocessor.h
index 8453263f9d..9060e8aa95 100644
--- a/indra/llcommon/llprocessor.h
+++ b/indra/llcommon/llprocessor.h
@@ -51,6 +51,7 @@ typedef struct ProcessorExtensions
 	bool FXSR_FastStreamingSIMD_ExtensionsSaveRestore;
 	bool SSE_StreamingSIMD_Extensions;
 	bool SSE2_StreamingSIMD2_Extensions;
+	bool Altivec_Extensions;
 	bool SS_SelfSnoop;
 	bool HT_HyperThreading;
 	unsigned int HT_HyterThreadingSiblings;
diff --git a/indra/llcommon/llskiplist.h b/indra/llcommon/llskiplist.h
index ed1aa1f0aa..4676fb8f18 100644
--- a/indra/llcommon/llskiplist.h
+++ b/indra/llcommon/llskiplist.h
@@ -8,11 +8,10 @@
 #ifndef LL_LLSKIPLIST_H
 #define LL_LLSKIPLIST_H
 
-#include "llerror.h"
-//#include "vmath.h"
+#include "llrand.h"
 
 // NOTA BENE: Insert first needs to be < NOT <=
-
+// Binary depth must be >= 2
 template <class DATA_TYPE, S32 BINARY_DEPTH = 10>
 class LLSkipList
 {
@@ -124,14 +123,11 @@ private:
 // Implementation
 //
 
+
+// Binary depth must be >= 2
 template <class DATA_TYPE, S32 BINARY_DEPTH>
 inline void LLSkipList<DATA_TYPE, BINARY_DEPTH>::init()
 {
-	if (BINARY_DEPTH < 2)
-	{
-		llerrs << "Trying to create skip list with too little depth, "
-			"must be 2 or greater" << llendl;
-	}
 	S32 i;
 	for (i = 0; i < BINARY_DEPTH; i++)
 	{
diff --git a/indra/llcommon/llsys.cpp b/indra/llcommon/llsys.cpp
index 6f0bda4b71..90cc374ade 100644
--- a/indra/llcommon/llsys.cpp
+++ b/indra/llcommon/llsys.cpp
@@ -284,12 +284,33 @@ LLCPUInfo::LLCPUInfo()
 {
 	CProcessor proc;
 	const ProcessorInfo* info = proc.GetCPUInfo();
-	mHasSSE = (info->_Ext.SSE_StreamingSIMD_Extensions != 0);
-	mHasSSE2 = (info->_Ext.SSE2_StreamingSIMD2_Extensions != 0);
+	// proc.WriteInfoTextFile("procInfo.txt");
+	mHasSSE = info->_Ext.SSE_StreamingSIMD_Extensions;
+	mHasSSE2 = info->_Ext.SSE2_StreamingSIMD2_Extensions;
+	mHasAltivec = info->_Ext.Altivec_Extensions;
 	mCPUMhz = (S32)(proc.GetCPUFrequency(50)/1000000.0);
 	mFamily.assign( info->strFamily );
 }
 
+bool LLCPUInfo::hasAltivec() const
+{
+	return mHasAltivec;
+}
+
+bool LLCPUInfo::hasSSE() const
+{
+	return mHasSSE;
+}
+
+bool LLCPUInfo::hasSSE2() const
+{
+	return mHasSSE2;
+}
+
+S32 LLCPUInfo::getMhz() const
+{
+	return mCPUMhz;
+}
 
 std::string LLCPUInfo::getCPUString() const
 {
diff --git a/indra/llcommon/llsys.h b/indra/llcommon/llsys.h
index 7808a97b80..4b6fbe149b 100644
--- a/indra/llcommon/llsys.h
+++ b/indra/llcommon/llsys.h
@@ -52,16 +52,18 @@ public:
 
 	std::string getCPUString() const;
 
-	BOOL  hasSSE() const	{ return mHasSSE; }
-	BOOL  hasSSE2()	const	{ return mHasSSE2; }
-	S32	  getMhz() const	{ return mCPUMhz; }
+	bool hasAltivec() const;
+	bool hasSSE() const;
+	bool hasSSE2() const;
+	S32	 getMhz() const;
 
 	// Family is "AMD Duron" or "Intel Pentium Pro"
 	const std::string& getFamily() const { return mFamily; }
 
 private:
-	BOOL mHasSSE;
-	BOOL mHasSSE2;
+	bool mHasSSE;
+	bool mHasSSE2;
+	bool mHasAltivec;
 	S32 mCPUMhz;
 	std::string mFamily;
 };
diff --git a/indra/llmath/llv4math.h b/indra/llmath/llv4math.h
new file mode 100644
index 0000000000..4a299716b1
--- /dev/null
+++ b/indra/llmath/llv4math.h
@@ -0,0 +1,101 @@
+/** 
+ * @file llv4math.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef	LL_LLV4MATH_H
+#define	LL_LLV4MATH_H
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH - GNUC
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_GNUC && __GNUC__ >= 4 && __SSE__
+
+#define			LL_VECTORIZE					1
+
+#if LL_DARWIN
+
+#include <Accelerate/Accelerate.h>
+#include <xmmintrin.h>
+typedef vFloat	V4F32;
+
+#else
+
+#include <xmmintrin.h>
+typedef float	V4F32							__attribute__((vector_size(16)));
+
+#endif
+
+#endif
+#if LL_GNUC
+
+#define			LL_LLV4MATH_ALIGN_PREFIX
+#define			LL_LLV4MATH_ALIGN_POSTFIX		__attribute__((aligned(16)))
+
+#endif
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH - MSVC
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_MSVC && _M_IX86_FP
+
+#define			LL_VECTORIZE					1
+
+#include <xmmintrin.h>
+
+typedef __m128	V4F32;
+
+#endif
+#if LL_MSVC
+
+#define			LL_LLV4MATH_ALIGN_PREFIX		__declspec(align(16))
+#define			LL_LLV4MATH_ALIGN_POSTFIX
+
+#endif
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH - default - no vectorization
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if !LL_VECTORIZE
+
+#define			LL_VECTORIZE					0
+
+struct			V4F32							{ F32 __pad__[4]; };
+
+inline F32 llv4lerp(F32 a, F32 b, F32 w)		{ return ( b - a ) * w + a; }
+
+#endif
+
+#ifndef			LL_LLV4MATH_ALIGN_PREFIX
+#	define			LL_LLV4MATH_ALIGN_PREFIX
+#endif
+#ifndef			LL_LLV4MATH_ALIGN_POSTFIX
+#	define			LL_LLV4MATH_ALIGN_POSTFIX
+#endif
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+
+#define			LLV4_NUM_AXIS					4
+
+class LLV4Vector3;
+class LLV4Matrix3;
+class LLV4Matrix4;
+
+#endif
diff --git a/indra/llmath/llv4matrix3.h b/indra/llmath/llv4matrix3.h
new file mode 100644
index 0000000000..a273abe496
--- /dev/null
+++ b/indra/llmath/llv4matrix3.h
@@ -0,0 +1,202 @@
+/** 
+ * @file llv4matrix3.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef LL_LLV4MATRIX3_H
+#define LL_LLV4MATRIX3_H
+
+#include "llv4math.h"
+#include "llv4vector3.h"
+#include "m3math.h"			// for operator LLMatrix3()
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+LL_LLV4MATH_ALIGN_PREFIX
+
+class LLV4Matrix3
+{
+public:
+	union {
+		F32		mMatrix[LLV4_NUM_AXIS][LLV4_NUM_AXIS];
+		V4F32	mV[LLV4_NUM_AXIS];
+	};
+
+	void				lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w);
+	void				multiply(const LLVector3 &a, LLVector3& out) const;
+	void				multiply(const LLVector4 &a, LLV4Vector3& out) const;
+	void				multiply(const LLVector3 &a, LLV4Vector3& out) const;
+
+	const LLV4Matrix3&	transpose();
+	const LLV4Matrix3&	operator=(const LLMatrix3& a);
+
+	operator			LLMatrix3()	const { return (reinterpret_cast<const LLMatrix4*>(const_cast<const F32*>(&mMatrix[0][0])))->getMat3(); }
+
+	friend LLVector3	operator*(const LLVector3& a, const LLV4Matrix3& b);
+}
+
+LL_LLV4MATH_ALIGN_POSTFIX;
+
+
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3 - SSE
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_VECTORIZE
+
+inline void LLV4Matrix3::lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w)
+{
+	__m128 vw = _mm_set1_ps(w);
+	mV[VX] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VX], a.mV[VX]), vw), a.mV[VX]); // ( b - a ) * w + a
+	mV[VY] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VY], a.mV[VY]), vw), a.mV[VY]);
+	mV[VZ] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VZ], a.mV[VZ]), vw), a.mV[VZ]);
+}
+
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLVector3& o) const
+{
+	LLV4Vector3 j;
+	j.v = 				 	_mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]); // ( ax * vx ) + ...
+	j.v = _mm_add_ps(j.v  , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]));
+	j.v = _mm_add_ps(j.v  , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]));
+	o.setVec(j.mV);
+}
+
+inline void LLV4Matrix3::multiply(const LLVector4 &a, LLV4Vector3& o) const
+{
+	o.v =					_mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]); // ( ax * vx ) + ...
+	o.v = _mm_add_ps(o.v  , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]));
+	o.v = _mm_add_ps(o.v  , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]));
+}
+
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+	o.v =					_mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]); // ( ax * vx ) + ...
+	o.v = _mm_add_ps(o.v  , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]));
+	o.v = _mm_add_ps(o.v  , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]));
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3 - scalar fallback (no vectorization)
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#else
+
+inline void LLV4Matrix3::lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w)
+{
+	mMatrix[VX][VX] = llv4lerp(a.mMatrix[VX][VX], b.mMatrix[VX][VX], w);
+	mMatrix[VX][VY] = llv4lerp(a.mMatrix[VX][VY], b.mMatrix[VX][VY], w);
+	mMatrix[VX][VZ] = llv4lerp(a.mMatrix[VX][VZ], b.mMatrix[VX][VZ], w);
+
+	mMatrix[VY][VX] = llv4lerp(a.mMatrix[VY][VX], b.mMatrix[VY][VX], w);
+	mMatrix[VY][VY] = llv4lerp(a.mMatrix[VY][VY], b.mMatrix[VY][VY], w);
+	mMatrix[VY][VZ] = llv4lerp(a.mMatrix[VY][VZ], b.mMatrix[VY][VZ], w);
+
+	mMatrix[VZ][VX] = llv4lerp(a.mMatrix[VZ][VX], b.mMatrix[VZ][VX], w);
+	mMatrix[VZ][VY] = llv4lerp(a.mMatrix[VZ][VY], b.mMatrix[VZ][VY], w);
+	mMatrix[VZ][VZ] = llv4lerp(a.mMatrix[VZ][VZ], b.mMatrix[VZ][VZ], w);
+}
+
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLVector3& o) const
+{
+	o.setVec(		a.mV[VX] * mMatrix[VX][VX] + 
+					a.mV[VY] * mMatrix[VY][VX] + 
+					a.mV[VZ] * mMatrix[VZ][VX],
+					 
+					a.mV[VX] * mMatrix[VX][VY] + 
+					a.mV[VY] * mMatrix[VY][VY] + 
+					a.mV[VZ] * mMatrix[VZ][VY],
+					 
+					a.mV[VX] * mMatrix[VX][VZ] + 
+					a.mV[VY] * mMatrix[VY][VZ] + 
+					a.mV[VZ] * mMatrix[VZ][VZ]);
+}
+
+inline void LLV4Matrix3::multiply(const LLVector4 &a, LLV4Vector3& o) const
+{
+	o.setVec(		a.mV[VX] * mMatrix[VX][VX] + 
+					a.mV[VY] * mMatrix[VY][VX] + 
+					a.mV[VZ] * mMatrix[VZ][VX],
+					 
+					a.mV[VX] * mMatrix[VX][VY] + 
+					a.mV[VY] * mMatrix[VY][VY] + 
+					a.mV[VZ] * mMatrix[VZ][VY],
+					 
+					a.mV[VX] * mMatrix[VX][VZ] + 
+					a.mV[VY] * mMatrix[VY][VZ] + 
+					a.mV[VZ] * mMatrix[VZ][VZ]);
+}
+
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+	o.setVec(		a.mV[VX] * mMatrix[VX][VX] + 
+					a.mV[VY] * mMatrix[VY][VX] + 
+					a.mV[VZ] * mMatrix[VZ][VX],
+					 
+					a.mV[VX] * mMatrix[VX][VY] + 
+					a.mV[VY] * mMatrix[VY][VY] + 
+					a.mV[VZ] * mMatrix[VZ][VY],
+					 
+					a.mV[VX] * mMatrix[VX][VZ] + 
+					a.mV[VY] * mMatrix[VY][VZ] + 
+					a.mV[VZ] * mMatrix[VZ][VZ]);
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3 - common to both code paths
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#endif
+
+inline const LLV4Matrix3&	LLV4Matrix3::transpose()
+{
+#if LL_VECTORIZE && defined(_MM_TRANSPOSE4_PS)
+	_MM_TRANSPOSE4_PS(mV[VX], mV[VY], mV[VZ], mV[VW]);
+	return *this;
+#else
+	F32 temp;
+	temp = mMatrix[VX][VY]; mMatrix[VX][VY] = mMatrix[VY][VX]; mMatrix[VY][VX] = temp;
+	temp = mMatrix[VX][VZ]; mMatrix[VX][VZ] = mMatrix[VZ][VX]; mMatrix[VZ][VX] = temp;
+	temp = mMatrix[VY][VZ]; mMatrix[VY][VZ] = mMatrix[VZ][VY]; mMatrix[VZ][VY] = temp;
+#endif
+	return *this;
+}
+
+inline const LLV4Matrix3& LLV4Matrix3::operator=(const LLMatrix3& a)
+{
+	memcpy(mMatrix[VX], a.mMatrix[VX], sizeof(F32) * 3 );
+	memcpy(mMatrix[VY], a.mMatrix[VY], sizeof(F32) * 3 );
+	memcpy(mMatrix[VZ], a.mMatrix[VZ], sizeof(F32) * 3 );
+	return *this;
+}
+
+inline LLVector3 operator*(const LLVector3& a, const LLV4Matrix3& b)
+{
+	return LLVector3(
+				a.mV[VX] * b.mMatrix[VX][VX] + 
+				a.mV[VY] * b.mMatrix[VY][VX] + 
+				a.mV[VZ] * b.mMatrix[VZ][VX],
+	
+				a.mV[VX] * b.mMatrix[VX][VY] + 
+				a.mV[VY] * b.mMatrix[VY][VY] + 
+				a.mV[VZ] * b.mMatrix[VZ][VY],
+	
+				a.mV[VX] * b.mMatrix[VX][VZ] + 
+				a.mV[VY] * b.mMatrix[VY][VZ] + 
+				a.mV[VZ] * b.mMatrix[VZ][VZ] );
+}
+
+#endif
diff --git a/indra/llmath/llv4matrix4.h b/indra/llmath/llv4matrix4.h
new file mode 100644
index 0000000000..0673f6fa7d
--- /dev/null
+++ b/indra/llmath/llv4matrix4.h
@@ -0,0 +1,231 @@
+/** 
+ * @file llv4matrix4.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef LL_LLV4MATRIX4_H
+#define LL_LLV4MATRIX4_H
+
+#include "llv4math.h"
+#include "llv4matrix3.h"	// just for operator LLV4Matrix3()
+#include "llv4vector3.h"
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+LL_LLV4MATH_ALIGN_PREFIX
+
+class LLV4Matrix4
+{
+public:
+	union {
+		F32		mMatrix[LLV4_NUM_AXIS][LLV4_NUM_AXIS];
+		V4F32	mV[LLV4_NUM_AXIS];
+	};
+
+	void				lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w);
+	void				multiply(const LLVector3 &a, LLVector3& o) const;
+	void				multiply(const LLVector3 &a, LLV4Vector3& o) const;
+
+	const LLV4Matrix4&	transpose();
+	const LLV4Matrix4&  translate(const LLVector3 &vec);
+	const LLV4Matrix4&  translate(const LLV4Vector3 &vec);
+	const LLV4Matrix4&	operator=(const LLMatrix4& a);
+
+	operator			LLMatrix4()	const { return *(reinterpret_cast<const LLMatrix4*>(const_cast<const F32*>(&mMatrix[0][0]))); }
+	operator			LLV4Matrix3()	const { return *(reinterpret_cast<const LLV4Matrix3*>(const_cast<const F32*>(&mMatrix[0][0]))); }
+	
+	friend LLVector3	operator*(const LLVector3 &a, const LLV4Matrix4 &b);
+}
+
+LL_LLV4MATH_ALIGN_POSTFIX;
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4 - SSE
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_VECTORIZE
+
+inline void LLV4Matrix4::lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w)
+{
+	__m128 vw = _mm_set1_ps(w);
+	mV[VX] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VX], a.mV[VX]), vw), a.mV[VX]); // ( b - a ) * w + a
+	mV[VY] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VY], a.mV[VY]), vw), a.mV[VY]);
+	mV[VZ] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VZ], a.mV[VZ]), vw), a.mV[VZ]);
+	mV[VW] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VW], a.mV[VW]), vw), a.mV[VW]);
+}
+
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLVector3& o) const
+{
+	LLV4Vector3 j;
+	j.v = _mm_add_ps(mV[VW], _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX])); // ( ax * vx ) + vw
+	j.v = _mm_add_ps(j.v   , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]));
+	j.v = _mm_add_ps(j.v   , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]));
+	o.setVec(j.mV);
+}
+
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+	o.v = _mm_add_ps(mV[VW], _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX])); // ( ax * vx ) + vw
+	o.v = _mm_add_ps(o.v   , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]));
+	o.v = _mm_add_ps(o.v   , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]));
+}
+
+inline const LLV4Matrix4& LLV4Matrix4::translate(const LLV4Vector3 &vec)
+{
+	mV[VW] = _mm_add_ps(mV[VW], vec.v);
+	return (*this);
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4 - scalar fallback (no vectorization)
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#else
+
+inline void LLV4Matrix4::lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w)
+{
+	mMatrix[VX][VX] = llv4lerp(a.mMatrix[VX][VX], b.mMatrix[VX][VX], w);
+	mMatrix[VX][VY] = llv4lerp(a.mMatrix[VX][VY], b.mMatrix[VX][VY], w);
+	mMatrix[VX][VZ] = llv4lerp(a.mMatrix[VX][VZ], b.mMatrix[VX][VZ], w);
+
+	mMatrix[VY][VX] = llv4lerp(a.mMatrix[VY][VX], b.mMatrix[VY][VX], w);
+	mMatrix[VY][VY] = llv4lerp(a.mMatrix[VY][VY], b.mMatrix[VY][VY], w);
+	mMatrix[VY][VZ] = llv4lerp(a.mMatrix[VY][VZ], b.mMatrix[VY][VZ], w);
+
+	mMatrix[VZ][VX] = llv4lerp(a.mMatrix[VZ][VX], b.mMatrix[VZ][VX], w);
+	mMatrix[VZ][VY] = llv4lerp(a.mMatrix[VZ][VY], b.mMatrix[VZ][VY], w);
+	mMatrix[VZ][VZ] = llv4lerp(a.mMatrix[VZ][VZ], b.mMatrix[VZ][VZ], w);
+
+	mMatrix[VW][VX] = llv4lerp(a.mMatrix[VW][VX], b.mMatrix[VW][VX], w);
+	mMatrix[VW][VY] = llv4lerp(a.mMatrix[VW][VY], b.mMatrix[VW][VY], w);
+	mMatrix[VW][VZ] = llv4lerp(a.mMatrix[VW][VZ], b.mMatrix[VW][VZ], w);
+}
+
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLVector3& o) const
+{
+	o.setVec(		a.mV[VX] * mMatrix[VX][VX] + 
+					a.mV[VY] * mMatrix[VY][VX] + 
+					a.mV[VZ] * mMatrix[VZ][VX] +
+					mMatrix[VW][VX],
+					 
+					a.mV[VX] * mMatrix[VX][VY] + 
+					a.mV[VY] * mMatrix[VY][VY] + 
+					a.mV[VZ] * mMatrix[VZ][VY] +
+					mMatrix[VW][VY],
+					 
+					a.mV[VX] * mMatrix[VX][VZ] + 
+					a.mV[VY] * mMatrix[VY][VZ] + 
+					a.mV[VZ] * mMatrix[VZ][VZ] +
+					mMatrix[VW][VZ]);
+}
+
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+	o.setVec(		a.mV[VX] * mMatrix[VX][VX] + 
+					a.mV[VY] * mMatrix[VY][VX] + 
+					a.mV[VZ] * mMatrix[VZ][VX] +
+					mMatrix[VW][VX],
+					 
+					a.mV[VX] * mMatrix[VX][VY] + 
+					a.mV[VY] * mMatrix[VY][VY] + 
+					a.mV[VZ] * mMatrix[VZ][VY] +
+					mMatrix[VW][VY],
+					 
+					a.mV[VX] * mMatrix[VX][VZ] + 
+					a.mV[VY] * mMatrix[VY][VZ] + 
+					a.mV[VZ] * mMatrix[VZ][VZ] +
+					mMatrix[VW][VZ]);
+}
+
+inline const LLV4Matrix4& LLV4Matrix4::translate(const LLV4Vector3 &vec)
+{
+	mMatrix[3][0] += vec.mV[0];
+	mMatrix[3][1] += vec.mV[1];
+	mMatrix[3][2] += vec.mV[2];
+	return (*this);
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4 - common to both code paths
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#endif
+
+inline const LLV4Matrix4& LLV4Matrix4::operator=(const LLMatrix4& a)
+{
+	memcpy(mMatrix, a.mMatrix, sizeof(F32) * 16 );
+	return *this;
+}
+
+inline const LLV4Matrix4& LLV4Matrix4::transpose()
+{
+#if LL_VECTORIZE && defined(_MM_TRANSPOSE4_PS)
+	_MM_TRANSPOSE4_PS(mV[VX], mV[VY], mV[VZ], mV[VW]);
+#else
+	LLV4Matrix4 mat;
+	mat.mMatrix[0][0] = mMatrix[0][0];
+	mat.mMatrix[1][0] = mMatrix[0][1];
+	mat.mMatrix[2][0] = mMatrix[0][2];
+	mat.mMatrix[3][0] = mMatrix[0][3];
+
+	mat.mMatrix[0][1] = mMatrix[1][0];
+	mat.mMatrix[1][1] = mMatrix[1][1];
+	mat.mMatrix[2][1] = mMatrix[1][2];
+	mat.mMatrix[3][1] = mMatrix[1][3];
+
+	mat.mMatrix[0][2] = mMatrix[2][0];
+	mat.mMatrix[1][2] = mMatrix[2][1];
+	mat.mMatrix[2][2] = mMatrix[2][2];
+	mat.mMatrix[3][2] = mMatrix[2][3];
+
+	mat.mMatrix[0][3] = mMatrix[3][0];
+	mat.mMatrix[1][3] = mMatrix[3][1];
+	mat.mMatrix[2][3] = mMatrix[3][2];
+	mat.mMatrix[3][3] = mMatrix[3][3];
+
+	*this = mat;
+#endif
+	return *this;
+}
+
+inline const LLV4Matrix4& LLV4Matrix4::translate(const LLVector3 &vec)
+{
+	mMatrix[3][0] += vec.mV[0];
+	mMatrix[3][1] += vec.mV[1];
+	mMatrix[3][2] += vec.mV[2];
+	return (*this);
+}
+
+inline LLVector3 operator*(const LLVector3 &a, const LLV4Matrix4 &b)
+{
+	return LLVector3(a.mV[VX] * b.mMatrix[VX][VX] + 
+					 a.mV[VY] * b.mMatrix[VY][VX] + 
+					 a.mV[VZ] * b.mMatrix[VZ][VX] +
+					 b.mMatrix[VW][VX],
+					 
+					 a.mV[VX] * b.mMatrix[VX][VY] + 
+					 a.mV[VY] * b.mMatrix[VY][VY] + 
+					 a.mV[VZ] * b.mMatrix[VZ][VY] +
+					 b.mMatrix[VW][VY],
+					 
+					 a.mV[VX] * b.mMatrix[VX][VZ] + 
+					 a.mV[VY] * b.mMatrix[VY][VZ] + 
+					 a.mV[VZ] * b.mMatrix[VZ][VZ] +
+					 b.mMatrix[VW][VZ]);
+}
+
+
+#endif
diff --git a/indra/llmath/llv4vector3.h b/indra/llmath/llv4vector3.h
new file mode 100644
index 0000000000..7bf8c5ce91
--- /dev/null
+++ b/indra/llmath/llv4vector3.h
@@ -0,0 +1,62 @@
+/** 
+ * @file llv4vector3.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef LL_LLV4VECTOR3_H
+#define LL_LLV4VECTOR3_H
+
+#include "llv4math.h"
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Vector3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+LL_LLV4MATH_ALIGN_PREFIX
+
+class LLV4Vector3
+{
+public:
+	union {
+		F32		mV[LLV4_NUM_AXIS];
+		V4F32	v;
+	};
+
+	enum {
+		ALIGNMENT = 16
+		};
+
+	void				setVec(F32 x, F32 y, F32 z);
+	void				setVec(F32 a);
+}
+
+LL_LLV4MATH_ALIGN_POSTFIX;
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Vector3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+inline void	LLV4Vector3::setVec(F32 x, F32 y, F32 z)
+{
+	mV[VX] = x;
+	mV[VY] = y;
+	mV[VZ] = z;
+}
+
+inline void	LLV4Vector3::setVec(F32 a)
+{
+#if LL_VECTORIZE
+	v = _mm_set1_ps(a);
+#else
+	setVec(a, a, a);
+#endif
+}
+
+#endif
diff --git a/indra/newview/lldrawable.h b/indra/newview/lldrawable.h
index 48c58dbb4c..328a116f59 100644
--- a/indra/newview/lldrawable.h
+++ b/indra/newview/lldrawable.h
@@ -26,6 +26,7 @@
 #include "llviewerobject.h"
 #include "llrect.h"
 
+class LLCamera;
 class LLDrawPool;
 class LLDrawable;
 class LLFace;
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index 150943465d..ba4c7e1b20 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -19,6 +19,7 @@
 #include "llfasttimer.h"
 
 #include "llagent.h"
+#include "llapr.h"
 #include "llbox.h"
 #include "lldrawable.h"
 #include "lldrawpoolavatar.h"
@@ -29,12 +30,18 @@
 #include "llglheaders.h"
 #include "lltexlayer.h"
 #include "llviewercamera.h"
+#include "llviewercontrol.h"
 #include "llviewerimagelist.h"
 #include "llviewerjointmesh.h"
 #include "llvoavatar.h"
 #include "llsky.h"
 #include "pipeline.h"
 #include "llglslshader.h"
+#include "llmath.h"
+#include "v4math.h"
+#include "m3math.h"
+#include "m4math.h"
+
 
 #if !LL_DARWIN && !LL_LINUX
 extern PFNGLWEIGHTPOINTERARBPROC glWeightPointerARB;
@@ -48,6 +55,7 @@ static const U32 sRenderMask = LLVertexBuffer::MAP_VERTEX |
 							   LLVertexBuffer::MAP_NORMAL |
 							   LLVertexBuffer::MAP_TEXCOORD;
 
+
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 // LLViewerJointMesh::LLSkinJoint
@@ -100,6 +108,7 @@ BOOL LLSkinJoint::setupSkinJoint( LLViewerJoint *joint)
 	return TRUE;
 }
 
+
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 // LLViewerJointMesh
@@ -394,9 +403,9 @@ const S32 NUM_AXES = 3;
 // rotation Z 0-n
 // pivot parent 0-n -- child = n+1
 
-static LLMatrix4 gJointMat[32];
-static LLMatrix3 gJointRot[32];
-static LLVector4 gJointPivot[32];
+static LLMatrix4	gJointMatUnaligned[32];
+static LLMatrix3	gJointRotUnaligned[32];
+static LLVector4	gJointPivot[32];
 
 //-----------------------------------------------------------------------------
 // uploadJointMatrices()
@@ -417,8 +426,8 @@ void LLViewerJointMesh::uploadJointMatrices()
 		{
 			joint_mat *= LLDrawPoolAvatar::getModelView();
 		}
-		gJointMat[joint_num] = joint_mat;
-		gJointRot[joint_num] = joint_mat.getMat3();
+		gJointMatUnaligned[joint_num] = joint_mat;
+		gJointRotUnaligned[joint_num] = joint_mat.getMat3();
 	}
 
 	BOOL last_pivot_uploaded = FALSE;
@@ -455,8 +464,8 @@ void LLViewerJointMesh::uploadJointMatrices()
 	{
 		LLVector3 pivot;
 		pivot = LLVector3(gJointPivot[i]);
-		pivot = pivot * gJointRot[i];
-		gJointMat[i].translate(pivot);
+		pivot = pivot * gJointRotUnaligned[i];
+		gJointMatUnaligned[i].translate(pivot);
 	}
 
 	// upload matrices
@@ -467,11 +476,11 @@ void LLViewerJointMesh::uploadJointMatrices()
 
 		for (joint_num = 0; joint_num < reference_mesh->mJointRenderData.count(); joint_num++)
 		{
-			gJointMat[joint_num].transpose();
+			gJointMatUnaligned[joint_num].transpose();
 
 			for (S32 axis = 0; axis < NUM_AXES; axis++)
 			{
-				F32* vector = gJointMat[joint_num].mMatrix[axis];
+				F32* vector = gJointMatUnaligned[joint_num].mMatrix[axis];
 				//glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, LL_CHARACTER_MAX_JOINTS_PER_MESH * axis + joint_num+5, (GLfloat*)vector);
 				U32 offset = LL_CHARACTER_MAX_JOINTS_PER_MESH*axis+joint_num;
 				memcpy(mat+offset*4, vector, sizeof(GLfloat)*4);
@@ -883,21 +892,9 @@ BOOL LLViewerJointMesh::updateLOD(F32 pixel_area, BOOL activate)
 	return (valid != activate);
 }
 
-
-void LLViewerJointMesh::updateGeometry()
+// static
+void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
 {
-	if (!(mValid
-		  && mMesh
-		  && mFace
-		  && mMesh->hasWeights()
-		  && mFace->mVertexBuffer.notNull()
-		  && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
-	{
-		return;
-	}
-	
-	uploadJointMatrices();
-
 	LLStrider<LLVector3> o_vertices;
 	LLStrider<LLVector3> o_normals;
 
@@ -938,9 +935,9 @@ void LLViewerJointMesh::updateGeometry()
 		// No lerp required in this case.
 		if (w == 1.0f)
 		{
-			gBlendMat = gJointMat[joint+1];
+			gBlendMat = gJointMatUnaligned[joint+1];
 			o_vertices[bidx] = coords[index] * gBlendMat;
-			gBlendRotMat = gJointRot[joint+1];
+			gBlendRotMat = gJointRotUnaligned[joint+1];
 			o_normals[bidx] = normals[index] * gBlendRotMat;
 			continue;
 		}
@@ -948,8 +945,8 @@ void LLViewerJointMesh::updateGeometry()
 		// Try to keep all the accesses to the matrix data as close
 		// together as possible.  This function is a hot spot on the
 		// Mac. JC
-		LLMatrix4 &m0 = gJointMat[joint+1];
-		LLMatrix4 &m1 = gJointMat[joint+0];
+		LLMatrix4 &m0 = gJointMatUnaligned[joint+1];
+		LLMatrix4 &m1 = gJointMatUnaligned[joint+0];
 		
 		gBlendMat.mMatrix[VX][VX] = lerp(m1.mMatrix[VX][VX], m0.mMatrix[VX][VX], w);
 		gBlendMat.mMatrix[VX][VY] = lerp(m1.mMatrix[VX][VY], m0.mMatrix[VX][VY], w);
@@ -969,8 +966,8 @@ void LLViewerJointMesh::updateGeometry()
 
 		o_vertices[bidx] = coords[index] * gBlendMat;
 		
-		LLMatrix3 &n0 = gJointRot[joint+1];
-		LLMatrix3 &n1 = gJointRot[joint+0];
+		LLMatrix3 &n0 = gJointRotUnaligned[joint+1];
+		LLMatrix3 &n1 = gJointRotUnaligned[joint+0];
 		
 		gBlendRotMat.mMatrix[VX][VX] = lerp(n1.mMatrix[VX][VX], n0.mMatrix[VX][VX], w);
 		gBlendRotMat.mMatrix[VX][VY] = lerp(n1.mMatrix[VX][VY], n0.mMatrix[VX][VY], w);
@@ -988,6 +985,161 @@ void LLViewerJointMesh::updateGeometry()
 	}
 }
 
+const U32 UPDATE_GEOMETRY_CALL_MASK			= 0x1FFF; // low 13 bits; a batch ends once the call count exceeds 8191
+const U32 UPDATE_GEOMETRY_CALL_OVERFLOW		= ~UPDATE_GEOMETRY_CALL_MASK;	// any bit above the mask => batch complete
+static bool sUpdateGeometryCallPointer		= false;	// false: batch runs updateGeometryOriginal; true: batch runs sUpdateGeometryFunc
+static F64 sUpdateGeometryGlobalTime		= 0.0 ;	// time stamp taken at the previous batch boundary; 0.0 forces a reset
+static F64 sUpdateGeometryElapsedTime		= 0.0 ;	// time spent skinning during the current batch
+static F64 sUpdateGeometryElapsedTimeOff	= 0.0 ;	// summed batch times, vectorization off
+static F64 sUpdateGeometryElapsedTimeOn		= 0.0 ;	// summed batch times, vectorization on
+static F64 sUpdateGeometryRunAvgOff[10];	// per-run percentage of time in skinning, vectorization off
+static F64 sUpdateGeometryRunAvgOn[10];	// per-run percentage of time in skinning, vectorization on
+static U32 sUpdateGeometryRunCount			= 0 ;	// completed off/on run pairs; the test stops at 10
+static U32 sUpdateGeometryCalls				= 0 ;	// calls counted in the current batch
+static U32 sUpdateGeometryLastProcessor		= 0 ;	// gVectorizeProcessor value seen at the last reset
+void (*LLViewerJointMesh::sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);	// zero-initialized here; assigned outside the code shown in this patch section
+
+void LLViewerJointMesh::updateGeometry()	// skins via the selected routine; optionally benchmarks original vs. vectorized
+{
+	extern BOOL gVectorizePerfTest;
+	extern U32	gVectorizeProcessor;
+	// Nothing to do unless the mesh is valid, weighted, has a vertex buffer, and the avatar vertex shader level is 0.
+	if (!(mValid
+		  && mMesh
+		  && mFace
+		  && mMesh->hasWeights()
+		  && mFace->mVertexBuffer.notNull()
+		  && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
+	{
+		return;
+	}
+
+	if (!gVectorizePerfTest)
+	{
+		// Normal path: run the selected routine.  Only updateGeometryOriginal reads
+		// the file-static gJoint*Unaligned tables, so only it needs uploadJointMatrices().
+		if(sUpdateGeometryFunc == updateGeometryOriginal)
+			uploadJointMatrices();
+		sUpdateGeometryFunc(mFace, mMesh);
+	}
+	else
+	{
+		// At startup, measure the amount of time in skinning and choose
+		// the fastest one.
+		LLTimer ug_timer ;
+		
+		if (sUpdateGeometryCallPointer)
+		{
+			if(sUpdateGeometryFunc == updateGeometryOriginal)
+				uploadJointMatrices();
+			// call accelerated version for this processor
+			sUpdateGeometryFunc(mFace, mMesh);
+		}
+		else
+		{
+			uploadJointMatrices();
+			updateGeometryOriginal(mFace, mMesh);
+		}
+	
+		sUpdateGeometryElapsedTime += ug_timer.getElapsedTimeF64();
+		++sUpdateGeometryCalls;
+		if(0 != (sUpdateGeometryCalls & UPDATE_GEOMETRY_CALL_OVERFLOW))	// a batch of calls has completed
+		{
+			F64 time_since_app_start = ug_timer.getElapsedSeconds();
+			if(sUpdateGeometryGlobalTime == 0.0 
+				|| sUpdateGeometryLastProcessor != gVectorizeProcessor)
+			{
+				sUpdateGeometryGlobalTime		= time_since_app_start;
+				sUpdateGeometryElapsedTime		= sUpdateGeometryElapsedTimeOff = sUpdateGeometryElapsedTimeOn = 0.0;	// also clear totals left by an earlier test
+				sUpdateGeometryCalls			= 0;
+				sUpdateGeometryRunCount			= 0;
+				sUpdateGeometryLastProcessor	= gVectorizeProcessor;
+				sUpdateGeometryCallPointer		= false;
+				return;
+			}
+			F64 percent_time_in_function = 
+				( sUpdateGeometryElapsedTime * 100.0 ) / ( time_since_app_start - sUpdateGeometryGlobalTime ) ;
+			sUpdateGeometryGlobalTime = time_since_app_start;
+			if (!sUpdateGeometryCallPointer)
+			{
+				// First set of run data is with vectorization off.
+				sUpdateGeometryCallPointer = true;
+				llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+					<< "vectorize off " << percent_time_in_function
+					<< "% of time with "
+					<< (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+					<< " seconds per call "
+					<< llendl;
+				sUpdateGeometryRunAvgOff[sUpdateGeometryRunCount] = percent_time_in_function;
+				sUpdateGeometryElapsedTimeOff += sUpdateGeometryElapsedTime;
+				sUpdateGeometryCalls = 0;
+			}
+			else
+			{
+				// Second set of run data is with vectorization on.
+				sUpdateGeometryCallPointer = false;
+				llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+					<< "VEC on " << percent_time_in_function
+					<< "% of time with "
+					<< (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+					<< " seconds per call "
+					<< llendl;
+				sUpdateGeometryRunAvgOn[sUpdateGeometryRunCount] = percent_time_in_function ;
+				sUpdateGeometryElapsedTimeOn += sUpdateGeometryElapsedTime;
+
+				sUpdateGeometryCalls = 0;
+				sUpdateGeometryRunCount++;
+				F64 a = 0.0, b = 0.0;
+				for(U32 i = 0; i<sUpdateGeometryRunCount; i++)
+				{
+					a += sUpdateGeometryRunAvgOff[i];
+					b += sUpdateGeometryRunAvgOn[i];
+				}
+				a /= sUpdateGeometryRunCount;
+				b /= sUpdateGeometryRunCount;
+				F64 perf_boost = ( sUpdateGeometryElapsedTimeOff - sUpdateGeometryElapsedTimeOn ) / sUpdateGeometryElapsedTimeOn;	// > 0 when the vectorized batches took less total time
+				llinfos << "run averages (" << (F64)sUpdateGeometryRunCount
+					<< "/10) vectorize off " << a
+					<< "% : vectorize type " << gVectorizeProcessor
+					<< " " << b
+					<< "% : performance boost " 
+					<< perf_boost * 100.0
+					<< "%"
+					<< llendl ;
+				if(sUpdateGeometryRunCount == 10)
+				{
+					// In case user runs test again, force reset of data on
+					// next run.
+					sUpdateGeometryGlobalTime = 0.0;
+
+					// We have data now on which version is faster.  Switch to that
+					// code and save the data for next run.
+					gVectorizePerfTest = FALSE;
+					gSavedSettings.setBOOL("VectorizePerfTest", FALSE);
+
+					if (perf_boost > 0.0)
+					{
+						llinfos << "Vectorization improves avatar skinning performance, "
+							<< "keeping on for future runs."
+							<< llendl;
+						gSavedSettings.setBOOL("VectorizeSkin", TRUE);
+					}
+					else
+					{
+						// SIMD decreases performance, fall back to original code
+						llinfos << "Vectorization decreases avatar skinning performance, "
+							<< "switching back to original code."
+							<< llendl;
+
+						gSavedSettings.setBOOL("VectorizeSkin", FALSE);
+					}
+				}
+			}
+			sUpdateGeometryElapsedTime = 0.0f;
+		}
+	}
+}
+
 void LLViewerJointMesh::dump()
 {
 	if (mValid)
diff --git a/indra/newview/llviewerjointmesh.h b/indra/newview/llviewerjointmesh.h
index b6fd8afcdb..992c3656a1 100644
--- a/indra/newview/llviewerjointmesh.h
+++ b/indra/newview/llviewerjointmesh.h
@@ -126,6 +126,22 @@ public:
 
 	/*virtual*/ BOOL isAnimatable() { return FALSE; }
 	void writeCAL3D(apr_file_t* fp, S32 material_num, LLCharacter* characterp);
+
+	// Avatar vertex skinning is a significant performance issue on computers
+	// with avatar vertex programs turned off (for example, most Macs).  We
+	// therefore have custom versions that use SIMD instructions.
+	//
+	// These functions require compiler options for SSE2, SSE, or neither, and
+	// hence are contained in separate individual .cpp files.  JC
+	static void updateGeometryOriginal(LLFace* face, LLPolyMesh* mesh);
+	// generic vector code, used for Altivec
+	static void updateGeometryVectorized(LLFace* face, LLPolyMesh* mesh);
+	static void updateGeometrySSE(LLFace* face, LLPolyMesh* mesh);
+	static void updateGeometrySSE2(LLFace* face, LLPolyMesh* mesh);
+
+	// Use a function pointer to indicate which version we are running.
+	static void (*sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);
+
 private:
 	// Allocate skin data
 	BOOL allocateSkinData( U32 numSkinJoints );
diff --git a/indra/newview/llviewerjointmesh_sse.cpp b/indra/newview/llviewerjointmesh_sse.cpp
new file mode 100644
index 0000000000..c4f8ff4fa8
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse.cpp
@@ -0,0 +1,94 @@
+/** 
+ * @file llviewerjointmesh_sse.cpp
+ * @brief SSE build of LLViewerJointMesh::updateGeometrySSE, using the LLV4 math classes
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llv4math.h"		// for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "v3math.h"
+
+// *NOTE: SSE must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4	sJointMat[32];
+
+static inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{	// 'static' keeps this copy file-local: llviewerjointmesh_sse2.cpp has a same-signature helper built with different instruction-set options.
+	m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);	// copy the four rows of *w into m (unaligned loads)
+	m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+	m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+	m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // W row += j.x * X row
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY])); // W row += j.y * Y row
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ])); // W row += j.z * Z row
+}
+
+// static -- SSE skinning: blends joint matrices by vertex weight and transforms coords/normals into the face's vertex buffer
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)
+{
+	LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+	// Fill sJointMat[j] from each joint's world matrix combined with its skin offset.
+	for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )	// NOTE(review): sJointMat has 32 entries; jend is not checked against that here
+	{
+		matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix,
+			joint_data[j]->mSkinJoint ?
+				joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+				: joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset);	// NOTE(review): assumes an entry without a skin joint is never the last one -- confirm
+	}
+
+	F32					weight		= F32_MAX;	// presumably never equals a real weight, so the first vertex computes blend_mat
+	LLV4Matrix4			blend_mat;
+
+	LLStrider<LLVector3> o_vertices;
+	LLStrider<LLVector3> o_normals;
+
+	LLVertexBuffer *buffer = face->mVertexBuffer;
+	buffer->getVertexStrider(o_vertices,  mesh->mFaceVertexOffset);
+	buffer->getNormalStrider(o_normals,   mesh->mFaceVertexOffset);
+
+	const F32*			weights			= mesh->getWeights();
+	const LLVector3*	coords			= mesh->getCoords();
+	const LLVector3*	normals			= mesh->getNormals();
+	for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+	{
+		if( weight != weights[index])	// blend matrix is rebuilt only when the weight differs from the previous vertex
+		{
+			S32 joint = llfloor(weight = weights[index]);	// integer part picks the joint pair
+			blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);	// fractional part is the lerp factor
+		}
+		blend_mat.multiply(coords[index], o_vertices[index]);	// position; presumably written to the second argument
+		((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);	// normal, via the LLV4Matrix3 conversion of blend_mat
+	}
+}
+
+#else
+
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)	// variant compiled when LL_VECTORIZE is off
+{
+	LLViewerJointMesh::updateGeometryVectorized(face, mesh);	// forward to the generic routine
+	return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_sse2.cpp b/indra/newview/llviewerjointmesh_sse2.cpp
new file mode 100644
index 0000000000..cae602ac14
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse2.cpp
@@ -0,0 +1,96 @@
+/** 
+ * @file llviewerjointmesh_sse2.cpp
+ * @brief SSE2 build of LLViewerJointMesh::updateGeometrySSE2, using the LLV4 math classes
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llstrider.h"
+#include "llv4math.h"		// for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "m4math.h"
+#include "v3math.h"
+
+// *NOTE: SSE2 must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4	sJointMat[32];
+
+static inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{	// 'static' keeps this copy file-local: llviewerjointmesh_sse.cpp has a same-signature helper built with different instruction-set options.
+	m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);	// copy the four rows of *w into m (unaligned loads)
+	m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+	m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+	m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // W row += j.x * X row
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY])); // W row += j.y * Y row
+	m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ])); // W row += j.z * Z row
+}
+
+// static -- SSE2 skinning: blends joint matrices by vertex weight and transforms coords/normals into the face's vertex buffer
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)
+{
+	LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+	// Fill sJointMat[j] from each joint's world matrix combined with its skin offset.
+	for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )	// NOTE(review): sJointMat has 32 entries; jend is not checked against that here
+	{
+		matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix,
+			joint_data[j]->mSkinJoint ?
+				joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+				: joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset);	// NOTE(review): assumes an entry without a skin joint is never the last one -- confirm
+	}
+
+	F32					weight		= F32_MAX;	// presumably never equals a real weight, so the first vertex computes blend_mat
+	LLV4Matrix4			blend_mat;
+
+	LLStrider<LLVector3> o_vertices;
+	LLStrider<LLVector3> o_normals;
+
+	LLVertexBuffer *buffer = face->mVertexBuffer;
+	buffer->getVertexStrider(o_vertices,  mesh->mFaceVertexOffset);
+	buffer->getNormalStrider(o_normals,   mesh->mFaceVertexOffset);
+
+	const F32*			weights			= mesh->getWeights();
+	const LLVector3*	coords			= mesh->getCoords();
+	const LLVector3*	normals			= mesh->getNormals();
+	for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+	{
+		if( weight != weights[index])	// blend matrix is rebuilt only when the weight differs from the previous vertex
+		{
+			S32 joint = llfloor(weight = weights[index]);	// integer part picks the joint pair
+			blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);	// fractional part is the lerp factor
+		}
+		blend_mat.multiply(coords[index], o_vertices[index]);	// position; presumably written to the second argument
+		((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);	// normal, via the LLV4Matrix3 conversion of blend_mat
+	}
+}
+
+#else
+
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)	// variant compiled when LL_VECTORIZE is off
+{
+	LLViewerJointMesh::updateGeometryVectorized(face, mesh);	// forward to the generic routine
+	return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_vec.cpp b/indra/newview/llviewerjointmesh_vec.cpp
new file mode 100644
index 0000000000..5b1e080435
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_vec.cpp
@@ -0,0 +1,76 @@
+/** 
+ * @file llviewerjointmesh_vec.cpp
+ * @brief Non-SSE build of LLViewerJointMesh::updateGeometryVectorized, using the LLV4 math classes
+ *
+ * Copyright (c) 2001-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+#include "llviewerprecompiledheaders.h"
+
+#include "llviewerjointmesh.h"
+
+#include "llface.h"
+#include "llpolymesh.h"
+#include "llv4math.h"
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+
+// *NOTE: SSE must be disabled for this module
+
+#if LL_VECTORIZE
+#error This module requires vectorization (i.e. SSE) mode to be disabled.
+#endif
+
+static LLV4Matrix4	sJointMat[32];
+
+// static -- skinning path for builds with LL_VECTORIZE off (this file #errors otherwise)
+void LLViewerJointMesh::updateGeometryVectorized(LLFace *face, LLPolyMesh *mesh)
+{
+	LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+	S32 j, joint_num, joint_end = joint_data.count();	// j indexes sJointMat, joint_num indexes joint_data
+	LLV4Vector3 pivot;
+
+	// Fill sJointMat from each joint's world matrix, translated by its skin-offset pivot.
+	for(j = joint_num = 0; joint_num < joint_end ; ++joint_num )	// NOTE(review): sJointMat has 32 entries; joint_end is not checked against that here
+	{
+		LLSkinJoint *sj;
+		const LLMatrix4 *	wm = joint_data[joint_num]->mWorldMatrix;
+		if (NULL == (sj = joint_data[joint_num]->mSkinJoint))	// entry without a skin joint: handled together with the next entry
+		{
+				sj = joint_data[++joint_num]->mSkinJoint;	// NOTE(review): assumes a following entry exists and has a skin joint -- confirm
+				((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToParentJointSkinOffset, pivot);	// this entry's matrix with the next joint's root-to-parent offset
+				sJointMat[j++].translate(pivot);
+				wm = joint_data[joint_num]->mWorldMatrix;
+		}
+		((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToJointSkinOffset, pivot);	// joint's own matrix with its root-to-joint offset
+		sJointMat[j++].translate(pivot);
+	}
+
+	F32					weight		= F32_MAX;	// presumably never equals a real weight, so the first vertex computes blend_mat
+	LLV4Matrix4			blend_mat;
+
+	LLStrider<LLVector3> o_vertices;
+	LLStrider<LLVector3> o_normals;
+
+	LLVertexBuffer *buffer = face->mVertexBuffer;
+	buffer->getVertexStrider(o_vertices,  mesh->mFaceVertexOffset);
+	buffer->getNormalStrider(o_normals,   mesh->mFaceVertexOffset);
+
+	const F32*			weights			= mesh->getWeights();
+	const LLVector3*	coords			= mesh->getCoords();
+	const LLVector3*	normals			= mesh->getNormals();
+	for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+	{
+		if( weight != weights[index])	// blend matrix is rebuilt only when the weight differs from the previous vertex
+		{
+			S32 joint = llfloor(weight = weights[index]);	// integer part picks the joint pair
+			blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);	// fractional part is the lerp factor
+		}
+		blend_mat.multiply(coords[index], o_vertices[index]);	// position; presumably written to the second argument
+		((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);	// normal, via the LLV4Matrix3 conversion of blend_mat
+	}
+}
diff --git a/indra/newview/llviewermenu.cpp b/indra/newview/llviewermenu.cpp
index d18859e356..7ad4f1d70b 100644
--- a/indra/newview/llviewermenu.cpp
+++ b/indra/newview/llviewermenu.cpp
@@ -960,6 +960,7 @@ extern BOOL gDebugClicks;
 extern BOOL gDebugWindowProc;
 extern BOOL gDebugTextEditorTips;
 extern BOOL gDebugSelectMgr;
+extern BOOL gVectorizePerfTest;
 
 void init_debug_ui_menu(LLMenuGL* menu)
 {
@@ -1169,6 +1170,8 @@ void init_debug_rendering_menu(LLMenuGL* menu)
 										   (void*)"ShowDepthBuffer"));
 	sub_menu->append(new LLMenuItemToggleGL("Show Select Buffer", &gDebugSelect));
 
+	sub_menu->append(new LLMenuItemToggleGL("Vectorize Perf Test", &gVectorizePerfTest));
+
 	sub_menu = new LLMenuGL("Render Tests");
 
 	sub_menu->append(new LLMenuItemCheckGL("Camera Offset", 
-- 
cgit v1.3