summaryrefslogtreecommitdiff
path: root/indra
diff options
context:
space:
mode:
authorJames Cook <james@lindenlab.com>2007-07-02 23:52:40 +0000
committerJames Cook <james@lindenlab.com>2007-07-02 23:52:40 +0000
commit1a33bc19b4ce94ab210749911dff14409b4454dd (patch)
treeb674d97d37240a29c0a6671adfe950a506ef0ea4 /indra
parente5124431b54d4342d4677371fccca5bc7250c079 (diff)
svn merge -r 62595:62596 and 62598:63308 sse-skinning-3 for faster software avatar rendering. Visual Studio 2005 project file fixed pending.
Diffstat (limited to 'indra')
-rw-r--r--indra/llcommon/llpreprocessor.h22
-rw-r--r--indra/llcommon/llprocessor.cpp8
-rw-r--r--indra/llcommon/llprocessor.h1
-rw-r--r--indra/llcommon/llskiplist.h12
-rw-r--r--indra/llcommon/llsys.cpp25
-rw-r--r--indra/llcommon/llsys.h12
-rw-r--r--indra/llmath/llv4math.h101
-rw-r--r--indra/llmath/llv4matrix3.h202
-rw-r--r--indra/llmath/llv4matrix4.h231
-rw-r--r--indra/llmath/llv4vector3.h62
-rw-r--r--indra/newview/lldrawable.h1
-rw-r--r--indra/newview/llviewerjointmesh.cpp210
-rw-r--r--indra/newview/llviewerjointmesh.h16
-rw-r--r--indra/newview/llviewerjointmesh_sse.cpp94
-rw-r--r--indra/newview/llviewerjointmesh_sse2.cpp96
-rw-r--r--indra/newview/llviewerjointmesh_vec.cpp76
-rw-r--r--indra/newview/llviewermenu.cpp3
17 files changed, 1120 insertions, 52 deletions
diff --git a/indra/llcommon/llpreprocessor.h b/indra/llcommon/llpreprocessor.h
index 0882472242..4389fd3e30 100644
--- a/indra/llcommon/llpreprocessor.h
+++ b/indra/llcommon/llpreprocessor.h
@@ -51,12 +51,22 @@
#define MOZILLA_INTERNAL_API 1
#endif
-// Deal with minor differences on Unixy OSes.
-#if LL_DARWIN || LL_LINUX
+// Figure out differences between compilers
+#if defined(__GNUC__)
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
+ #ifndef LL_GNUC
+ #define LL_GNUC 1
+ #endif
+#elif defined(__MSVC_VER__) || defined(_MSC_VER)
+ #ifndef LL_MSVC
+ #define LL_MSVC 1
+ #endif
+#endif
+// Deal with minor differences on Unixy OSes.
+#if LL_DARWIN || LL_LINUX
// Different name, same functionality.
#define stricmp strcasecmp
#define strnicmp strncasecmp
@@ -69,9 +79,9 @@
#endif
// Deal with the differeneces on Windows
-#if LL_WINDOWS
+#if LL_MSVC
#define snprintf safe_snprintf /* Flawfinder: ignore */
-#endif // LL_WINDOWS
+#endif // LL_MSVC
// Static linking with apr on windows needs to be declared.
#ifdef LL_WINDOWS
@@ -90,7 +100,7 @@
// Deal with VC6 problems
-#if defined(LL_WINDOWS)
+#if LL_MSVC
#pragma warning( 3 : 4701 ) // "local variable used without being initialized" Treat this as level 3, not level 4.
#pragma warning( 3 : 4702 ) // "unreachable code" Treat this as level 3, not level 4.
#pragma warning( 3 : 4189 ) // "local variable initialized but not referenced" Treat this as level 3, not level 4.
@@ -101,6 +111,6 @@
#pragma warning( disable : 4503 ) // 'decorated name length exceeded, name was truncated'. Does not seem to affect compilation.
#pragma warning( disable : 4800 ) // 'BOOL' : forcing value to bool 'true' or 'false' (performance warning)
#pragma warning( disable : 4996 ) // warning: deprecated
-#endif // LL_WINDOWS
+#endif // LL_MSVC
#endif // not LL_LINDEN_PREPROCESSOR_H
diff --git a/indra/llcommon/llprocessor.cpp b/indra/llcommon/llprocessor.cpp
index 00f4a13c39..bcabb47a66 100644
--- a/indra/llcommon/llprocessor.cpp
+++ b/indra/llcommon/llprocessor.cpp
@@ -1518,6 +1518,7 @@ void CProcessor::GetStandardProcessorExtensions()
CPUInfo._Ext.FXSR_FastStreamingSIMD_ExtensionsSaveRestore = CheckBit(edxreg, 24);
CPUInfo._Ext.SSE_StreamingSIMD_Extensions = CheckBit(edxreg, 25);
CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = CheckBit(edxreg, 26);
+ CPUInfo._Ext.Altivec_Extensions = false;
CPUInfo._Ext.SS_SelfSnoop = CheckBit(edxreg, 27);
CPUInfo._Ext.HT_HyperThreading = CheckBit(edxreg, 28);
CPUInfo._Ext.HT_HyterThreadingSiblings = (ebxreg >> 16) & 0xFF;
@@ -1871,11 +1872,12 @@ const ProcessorInfo *CProcessor::GetCPUInfo()
break;
}
- // It's kinda like MMX or SSE...
CPUInfo._Ext.EMMX_MultimediaExtensions =
CPUInfo._Ext.MMX_MultimediaExtensions =
CPUInfo._Ext.SSE_StreamingSIMD_Extensions =
- CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = hasFeature("hw.optional.altivec");
+ CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = false;
+
+ CPUInfo._Ext.Altivec_Extensions = hasFeature("hw.optional.altivec");
#endif
@@ -1892,6 +1894,7 @@ const ProcessorInfo *CProcessor::GetCPUInfo()
CPUInfo._Ext.MMX_MultimediaExtensions = hasFeature("hw.optional.mmx");
CPUInfo._Ext.SSE_StreamingSIMD_Extensions = hasFeature("hw.optional.sse");
CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = hasFeature("hw.optional.sse2");
+ CPUInfo._Ext.Altivec_Extensions = false;
CPUInfo._Ext.AA64_AMD64BitArchitecture = hasFeature("hw.optional.x86_64");
#endif
@@ -2045,6 +2048,7 @@ bool CProcessor::CPUInfoToText(char *strBuffer, unsigned int uiMaxLen)
BOOLADD("SS Self Snoop: ", CPUInfo._Ext.SS_SelfSnoop);
BOOLADD("SSE Streaming SIMD Extensions: ", CPUInfo._Ext.SSE_StreamingSIMD_Extensions);
BOOLADD("SSE2 Streaming SIMD 2 Extensions: ", CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions);
+ BOOLADD("ALTVEC Altivec Extensions: ", CPUInfo._Ext.Altivec_Extensions);
BOOLADD("TM Thermal Monitor: ", CPUInfo._Ext.TM_ThermalMonitor);
BOOLADD("TSC Time Stamp Counter: ", CPUInfo._Ext.TSC_TimeStampCounter);
BOOLADD("VME Virtual 8086 Mode Enhancements: ", CPUInfo._Ext.VME_Virtual8086ModeEnhancements);
diff --git a/indra/llcommon/llprocessor.h b/indra/llcommon/llprocessor.h
index 8453263f9d..9060e8aa95 100644
--- a/indra/llcommon/llprocessor.h
+++ b/indra/llcommon/llprocessor.h
@@ -51,6 +51,7 @@ typedef struct ProcessorExtensions
bool FXSR_FastStreamingSIMD_ExtensionsSaveRestore;
bool SSE_StreamingSIMD_Extensions;
bool SSE2_StreamingSIMD2_Extensions;
+ bool Altivec_Extensions;
bool SS_SelfSnoop;
bool HT_HyperThreading;
unsigned int HT_HyterThreadingSiblings;
diff --git a/indra/llcommon/llskiplist.h b/indra/llcommon/llskiplist.h
index ed1aa1f0aa..4676fb8f18 100644
--- a/indra/llcommon/llskiplist.h
+++ b/indra/llcommon/llskiplist.h
@@ -8,11 +8,10 @@
#ifndef LL_LLSKIPLIST_H
#define LL_LLSKIPLIST_H
-#include "llerror.h"
-//#include "vmath.h"
+#include "llrand.h"
// NOTA BENE: Insert first needs to be < NOT <=
-
+// Binary depth must be >= 2
template <class DATA_TYPE, S32 BINARY_DEPTH = 10>
class LLSkipList
{
@@ -124,14 +123,11 @@ private:
// Implementation
//
+
+// Binary depth must be >= 2
template <class DATA_TYPE, S32 BINARY_DEPTH>
inline void LLSkipList<DATA_TYPE, BINARY_DEPTH>::init()
{
- if (BINARY_DEPTH < 2)
- {
- llerrs << "Trying to create skip list with too little depth, "
- "must be 2 or greater" << llendl;
- }
S32 i;
for (i = 0; i < BINARY_DEPTH; i++)
{
diff --git a/indra/llcommon/llsys.cpp b/indra/llcommon/llsys.cpp
index 6f0bda4b71..90cc374ade 100644
--- a/indra/llcommon/llsys.cpp
+++ b/indra/llcommon/llsys.cpp
@@ -284,12 +284,33 @@ LLCPUInfo::LLCPUInfo()
{
CProcessor proc;
const ProcessorInfo* info = proc.GetCPUInfo();
- mHasSSE = (info->_Ext.SSE_StreamingSIMD_Extensions != 0);
- mHasSSE2 = (info->_Ext.SSE2_StreamingSIMD2_Extensions != 0);
+ // proc.WriteInfoTextFile("procInfo.txt");
+ mHasSSE = info->_Ext.SSE_StreamingSIMD_Extensions;
+ mHasSSE2 = info->_Ext.SSE2_StreamingSIMD2_Extensions;
+ mHasAltivec = info->_Ext.Altivec_Extensions;
mCPUMhz = (S32)(proc.GetCPUFrequency(50)/1000000.0);
mFamily.assign( info->strFamily );
}
+bool LLCPUInfo::hasAltivec() const // Altivec flag captured from the processor extension info by the constructor
+{
+ return mHasAltivec;
+}
+
+bool LLCPUInfo::hasSSE() const // SSE flag captured from the processor extension info by the constructor
+{
+ return mHasSSE;
+}
+
+bool LLCPUInfo::hasSSE2() const // SSE2 flag captured from the processor extension info by the constructor
+{
+ return mHasSSE2;
+}
+
+S32 LLCPUInfo::getMhz() const // CPU frequency in MHz as measured by the constructor
+{
+ return mCPUMhz;
+}
std::string LLCPUInfo::getCPUString() const
{
diff --git a/indra/llcommon/llsys.h b/indra/llcommon/llsys.h
index 7808a97b80..4b6fbe149b 100644
--- a/indra/llcommon/llsys.h
+++ b/indra/llcommon/llsys.h
@@ -52,16 +52,18 @@ public:
std::string getCPUString() const;
- BOOL hasSSE() const { return mHasSSE; }
- BOOL hasSSE2() const { return mHasSSE2; }
- S32 getMhz() const { return mCPUMhz; }
+ bool hasAltivec() const;
+ bool hasSSE() const;
+ bool hasSSE2() const;
+ S32 getMhz() const;
// Family is "AMD Duron" or "Intel Pentium Pro"
const std::string& getFamily() const { return mFamily; }
private:
- BOOL mHasSSE;
- BOOL mHasSSE2;
+ bool mHasSSE;
+ bool mHasSSE2;
+ bool mHasAltivec;
S32 mCPUMhz;
std::string mFamily;
};
diff --git a/indra/llmath/llv4math.h b/indra/llmath/llv4math.h
new file mode 100644
index 0000000000..4a299716b1
--- /dev/null
+++ b/indra/llmath/llv4math.h
@@ -0,0 +1,101 @@
+/**
+ * @file llv4math.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef LL_LLV4MATH_H
+#define LL_LLV4MATH_H
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH - GNUC
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_GNUC && __GNUC__ >= 4 && __SSE__
+
+#define LL_VECTORIZE 1
+
+#if LL_DARWIN
+
+#include <Accelerate/Accelerate.h>
+#include <xmmintrin.h>
+typedef vFloat V4F32;
+
+#else
+
+#include <xmmintrin.h>
+typedef float V4F32 __attribute__((vector_size(16)));
+
+#endif
+
+#endif
+#if LL_GNUC
+
+#define LL_LLV4MATH_ALIGN_PREFIX
+#define LL_LLV4MATH_ALIGN_POSTFIX __attribute__((aligned(16)))
+
+#endif
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH - MSVC
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_MSVC && _M_IX86_FP
+
+#define LL_VECTORIZE 1
+
+#include <xmmintrin.h>
+
+typedef __m128 V4F32;
+
+#endif
+#if LL_MSVC
+
+#define LL_LLV4MATH_ALIGN_PREFIX __declspec(align(16))
+#define LL_LLV4MATH_ALIGN_POSTFIX
+
+#endif
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH - default - no vectorization
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if !LL_VECTORIZE
+
+#define LL_VECTORIZE 0
+
+struct V4F32 { F32 __pad__[4]; };
+
+// Scalar linear interpolation: returns a when w is 0 and b when w is 1.
+inline F32 llv4lerp(F32 a, F32 b, F32 w)
+{
+    const F32 span = b - a;
+    return span * w + a;
+}
+
+#endif
+
+#ifndef LL_LLV4MATH_ALIGN_PREFIX
+# define LL_LLV4MATH_ALIGN_PREFIX
+#endif
+#ifndef LL_LLV4MATH_ALIGN_POSTFIX
+# define LL_LLV4MATH_ALIGN_POSTFIX
+#endif
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4MATH
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+
+#define LLV4_NUM_AXIS 4
+
+class LLV4Vector3;
+class LLV4Matrix3;
+class LLV4Matrix4;
+
+#endif
diff --git a/indra/llmath/llv4matrix3.h b/indra/llmath/llv4matrix3.h
new file mode 100644
index 0000000000..a273abe496
--- /dev/null
+++ b/indra/llmath/llv4matrix3.h
@@ -0,0 +1,202 @@
+/**
+ * @file llv4matrix3.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef LL_LLV4MATRIX3_H
+#define LL_LLV4MATRIX3_H
+
+#include "llv4math.h"
+#include "llv4vector3.h"
+#include "m3math.h" // for operator LLMatrix3()
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+LL_LLV4MATH_ALIGN_PREFIX
+
+// Matrix stored as LLV4_NUM_AXIS rows of LLV4_NUM_AXIS floats so that every
+// row can also be read and written as a single V4F32 value.
+class LLV4Matrix3
+{
+public:
+    // Two views of the same storage.
+    union
+    {
+        F32   mMatrix[LLV4_NUM_AXIS][LLV4_NUM_AXIS];  // element access
+        V4F32 mV[LLV4_NUM_AXIS];                      // whole-row access
+    };
+
+    // Stores the blend ( b - a ) * w + a into this matrix.
+    void lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w);
+
+    // out = a * this, with a treated as a row vector.
+    void multiply(const LLVector3 &a, LLVector3& out) const;
+    void multiply(const LLVector4 &a, LLV4Vector3& out) const;
+    void multiply(const LLVector3 &a, LLV4Vector3& out) const;
+
+    // In-place transpose; returns *this.
+    const LLV4Matrix3& transpose();
+
+    // Copies the three rows of a into the first three rows of this matrix.
+    const LLV4Matrix3& operator=(const LLMatrix3& a);
+
+    // Views the storage as an LLMatrix4 and returns that matrix's getMat3().
+    operator LLMatrix3() const
+    {
+        const F32* first = &mMatrix[0][0];
+        return (reinterpret_cast<const LLMatrix4*>(first))->getMat3();
+    }
+
+    friend LLVector3 operator*(const LLVector3& a, const LLV4Matrix3& b);
+}
+
+LL_LLV4MATH_ALIGN_POSTFIX;
+
+
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3 - SSE
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_VECTORIZE
+
+// Vectorized blend: each of the VX, VY and VZ rows becomes ( b - a ) * w + a.
+inline void LLV4Matrix3::lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w)
+{
+    // Broadcast the weight into all four lanes once.
+    const __m128 weight = _mm_set1_ps(w);
+
+    const __m128 step_x = _mm_mul_ps(_mm_sub_ps(b.mV[VX], a.mV[VX]), weight);
+    mV[VX] = _mm_add_ps(step_x, a.mV[VX]);
+
+    const __m128 step_y = _mm_mul_ps(_mm_sub_ps(b.mV[VY], a.mV[VY]), weight);
+    mV[VY] = _mm_add_ps(step_y, a.mV[VY]);
+
+    const __m128 step_z = _mm_mul_ps(_mm_sub_ps(b.mV[VZ], a.mV[VZ]), weight);
+    mV[VZ] = _mm_add_ps(step_z, a.mV[VZ]);
+}
+
+// Vectorized o = a * this: sums the matrix rows scaled by the components of a,
+// then hands the result to LLVector3::setVec() as a float array.
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLVector3& o) const
+{
+    const __m128 x_term = _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]);
+    const __m128 y_term = _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]);
+    const __m128 z_term = _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]);
+
+    LLV4Vector3 sum;
+    sum.v = _mm_add_ps(_mm_add_ps(x_term, y_term), z_term);
+    o.setVec(sum.mV);
+}
+
+// Vectorized o = a * this; only the VX, VY and VZ components of a are used.
+inline void LLV4Matrix3::multiply(const LLVector4 &a, LLV4Vector3& o) const
+{
+    __m128 acc = _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]);        // ax * row x
+    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY])); // + ay * row y
+    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ])); // + az * row z
+    o.v = acc;
+}
+
+// Vectorized o = a * this, written straight into the vector value of o.
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+    __m128 acc = _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]);        // ax * row x
+    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY])); // + ay * row y
+    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ])); // + az * row z
+    o.v = acc;
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#else
+
+// Scalar blend of the nine VX/VY/VZ elements: each becomes llv4lerp(a, b, w).
+inline void LLV4Matrix3::lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w)
+{
+    F32*       dst  = mMatrix[VX];
+    const F32* from = a.mMatrix[VX];
+    const F32* to   = b.mMatrix[VX];
+    dst[VX] = llv4lerp(from[VX], to[VX], w);
+    dst[VY] = llv4lerp(from[VY], to[VY], w);
+    dst[VZ] = llv4lerp(from[VZ], to[VZ], w);
+
+    dst  = mMatrix[VY];
+    from = a.mMatrix[VY];
+    to   = b.mMatrix[VY];
+    dst[VX] = llv4lerp(from[VX], to[VX], w);
+    dst[VY] = llv4lerp(from[VY], to[VY], w);
+    dst[VZ] = llv4lerp(from[VZ], to[VZ], w);
+
+    dst  = mMatrix[VZ];
+    from = a.mMatrix[VZ];
+    to   = b.mMatrix[VZ];
+    dst[VX] = llv4lerp(from[VX], to[VX], w);
+    dst[VY] = llv4lerp(from[VY], to[VY], w);
+    dst[VZ] = llv4lerp(from[VZ], to[VZ], w);
+}
+
+// Scalar o = a * this, with a treated as a row vector.
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLVector3& o) const
+{
+    const F32 out_x = a.mV[VX] * mMatrix[VX][VX] +
+                      a.mV[VY] * mMatrix[VY][VX] +
+                      a.mV[VZ] * mMatrix[VZ][VX];
+
+    const F32 out_y = a.mV[VX] * mMatrix[VX][VY] +
+                      a.mV[VY] * mMatrix[VY][VY] +
+                      a.mV[VZ] * mMatrix[VZ][VY];
+
+    const F32 out_z = a.mV[VX] * mMatrix[VX][VZ] +
+                      a.mV[VY] * mMatrix[VY][VZ] +
+                      a.mV[VZ] * mMatrix[VZ][VZ];
+
+    o.setVec(out_x, out_y, out_z);
+}
+
+// Scalar o = a * this; only the VX, VY and VZ components of a are used.
+inline void LLV4Matrix3::multiply(const LLVector4 &a, LLV4Vector3& o) const
+{
+    const F32 out_x = a.mV[VX] * mMatrix[VX][VX] +
+                      a.mV[VY] * mMatrix[VY][VX] +
+                      a.mV[VZ] * mMatrix[VZ][VX];
+
+    const F32 out_y = a.mV[VX] * mMatrix[VX][VY] +
+                      a.mV[VY] * mMatrix[VY][VY] +
+                      a.mV[VZ] * mMatrix[VZ][VY];
+
+    const F32 out_z = a.mV[VX] * mMatrix[VX][VZ] +
+                      a.mV[VY] * mMatrix[VY][VZ] +
+                      a.mV[VZ] * mMatrix[VZ][VZ];
+
+    o.setVec(out_x, out_y, out_z);
+}
+
+// Scalar o = a * this, storing the result in an LLV4Vector3.
+inline void LLV4Matrix3::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+    const F32 out_x = a.mV[VX] * mMatrix[VX][VX] +
+                      a.mV[VY] * mMatrix[VY][VX] +
+                      a.mV[VZ] * mMatrix[VZ][VX];
+
+    const F32 out_y = a.mV[VX] * mMatrix[VX][VY] +
+                      a.mV[VY] * mMatrix[VY][VY] +
+                      a.mV[VZ] * mMatrix[VZ][VY];
+
+    const F32 out_z = a.mV[VX] * mMatrix[VX][VZ] +
+                      a.mV[VY] * mMatrix[VY][VZ] +
+                      a.mV[VZ] * mMatrix[VZ][VZ];
+
+    o.setVec(out_x, out_y, out_z);
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#endif
+
+// Transposes this matrix in place and returns it.
+inline const LLV4Matrix3& LLV4Matrix3::transpose()
+{
+#if LL_VECTORIZE && defined(_MM_TRANSPOSE4_PS)
+    // The intrinsic macro transposes all four rows, the VW row included.
+    _MM_TRANSPOSE4_PS(mV[VX], mV[VY], mV[VZ], mV[VW]);
+#else
+    // Exchange the three off-diagonal pairs of the VX/VY/VZ block.
+    const F32 xy = mMatrix[VX][VY];
+    const F32 xz = mMatrix[VX][VZ];
+    const F32 yz = mMatrix[VY][VZ];
+
+    mMatrix[VX][VY] = mMatrix[VY][VX];
+    mMatrix[VX][VZ] = mMatrix[VZ][VX];
+    mMatrix[VY][VZ] = mMatrix[VZ][VY];
+
+    mMatrix[VY][VX] = xy;
+    mMatrix[VZ][VX] = xz;
+    mMatrix[VZ][VY] = yz;
+#endif
+    return *this;
+}
+
+// Copies three floats from each of the VX, VY and VZ rows of a; the remaining
+// elements of this matrix are not written.
+inline const LLV4Matrix3& LLV4Matrix3::operator=(const LLMatrix3& a)
+{
+    const size_t row_bytes = 3 * sizeof(F32);
+
+    memcpy(mMatrix[VX], a.mMatrix[VX], row_bytes);
+    memcpy(mMatrix[VY], a.mMatrix[VY], row_bytes);
+    memcpy(mMatrix[VZ], a.mMatrix[VZ], row_bytes);
+
+    return *this;
+}
+
+// Row vector times matrix, always computed with scalar arithmetic.
+inline LLVector3 operator*(const LLVector3& a, const LLV4Matrix3& b)
+{
+    const F32 out_x = a.mV[VX] * b.mMatrix[VX][VX] +
+                      a.mV[VY] * b.mMatrix[VY][VX] +
+                      a.mV[VZ] * b.mMatrix[VZ][VX];
+
+    const F32 out_y = a.mV[VX] * b.mMatrix[VX][VY] +
+                      a.mV[VY] * b.mMatrix[VY][VY] +
+                      a.mV[VZ] * b.mMatrix[VZ][VY];
+
+    const F32 out_z = a.mV[VX] * b.mMatrix[VX][VZ] +
+                      a.mV[VY] * b.mMatrix[VY][VZ] +
+                      a.mV[VZ] * b.mMatrix[VZ][VZ];
+
+    return LLVector3(out_x, out_y, out_z);
+}
+
+#endif
diff --git a/indra/llmath/llv4matrix4.h b/indra/llmath/llv4matrix4.h
new file mode 100644
index 0000000000..0673f6fa7d
--- /dev/null
+++ b/indra/llmath/llv4matrix4.h
@@ -0,0 +1,231 @@
+/**
+ * @file llv4matrix4.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef LL_LLV4MATRIX4_H
+#define LL_LLV4MATRIX4_H
+
+#include "llv4math.h"
+#include "llv4matrix3.h" // just for operator LLV4Matrix3()
+#include "llv4vector3.h"
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+LL_LLV4MATH_ALIGN_PREFIX
+
+// Matrix of LLV4_NUM_AXIS x LLV4_NUM_AXIS floats whose rows can also be read
+// and written as single V4F32 values.
+class LLV4Matrix4
+{
+public:
+    // Two views of the same storage.
+    union
+    {
+        F32   mMatrix[LLV4_NUM_AXIS][LLV4_NUM_AXIS];  // element access
+        V4F32 mV[LLV4_NUM_AXIS];                      // whole-row access
+    };
+
+    // Stores the blend ( b - a ) * w + a into this matrix.
+    void lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w);
+
+    // o = a * this plus the VW row, with a treated as a row vector.
+    void multiply(const LLVector3 &a, LLVector3& o) const;
+    void multiply(const LLVector3 &a, LLV4Vector3& o) const;
+
+    // In-place transpose; returns *this.
+    const LLV4Matrix4& transpose();
+
+    // Adds vec to the last row of the matrix; returns *this.
+    const LLV4Matrix4& translate(const LLVector3 &vec);
+    const LLV4Matrix4& translate(const LLV4Vector3 &vec);
+
+    // Copies all sixteen floats of a.
+    const LLV4Matrix4& operator=(const LLMatrix4& a);
+
+    // Both conversions reinterpret the float storage as the target type
+    // and return a copy of it.
+    operator LLMatrix4() const
+    {
+        const F32* first = &mMatrix[0][0];
+        return *(reinterpret_cast<const LLMatrix4*>(first));
+    }
+    operator LLV4Matrix3() const
+    {
+        const F32* first = &mMatrix[0][0];
+        return *(reinterpret_cast<const LLV4Matrix3*>(first));
+    }
+
+    friend LLVector3 operator*(const LLVector3 &a, const LLV4Matrix4 &b);
+}
+
+LL_LLV4MATH_ALIGN_POSTFIX;
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4 - SSE
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#if LL_VECTORIZE
+
+// Vectorized blend: each of the four rows becomes ( b - a ) * w + a.
+inline void LLV4Matrix4::lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w)
+{
+    // Broadcast the weight into all four lanes once.
+    const __m128 weight = _mm_set1_ps(w);
+
+    const __m128 step_x = _mm_mul_ps(_mm_sub_ps(b.mV[VX], a.mV[VX]), weight);
+    mV[VX] = _mm_add_ps(step_x, a.mV[VX]);
+
+    const __m128 step_y = _mm_mul_ps(_mm_sub_ps(b.mV[VY], a.mV[VY]), weight);
+    mV[VY] = _mm_add_ps(step_y, a.mV[VY]);
+
+    const __m128 step_z = _mm_mul_ps(_mm_sub_ps(b.mV[VZ], a.mV[VZ]), weight);
+    mV[VZ] = _mm_add_ps(step_z, a.mV[VZ]);
+
+    const __m128 step_w = _mm_mul_ps(_mm_sub_ps(b.mV[VW], a.mV[VW]), weight);
+    mV[VW] = _mm_add_ps(step_w, a.mV[VW]);
+}
+
+// Vectorized o = a * this + VW row, handed to LLVector3::setVec() as a float
+// array.
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLVector3& o) const
+{
+    const __m128 x_term = _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]);
+    const __m128 y_term = _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]);
+    const __m128 z_term = _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]);
+
+    // Same summation order as before: ((vw + x) + y) + z.
+    LLV4Vector3 sum;
+    sum.v = _mm_add_ps(_mm_add_ps(_mm_add_ps(mV[VW], x_term), y_term), z_term);
+    o.setVec(sum.mV);
+}
+
+// Vectorized o = a * this + VW row, written straight into the vector value
+// of o.
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+    __m128 acc = _mm_add_ps(mV[VW], _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX])); // vw + ax * row x
+    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY]));            // + ay * row y
+    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ]));            // + az * row z
+    o.v = acc;
+}
+
+// Vectorized translate: adds all four lanes of vec to the VW row.
+inline const LLV4Matrix4& LLV4Matrix4::translate(const LLV4Vector3 &vec)
+{
+    const __m128 moved = _mm_add_ps(mV[VW], vec.v);
+    mV[VW] = moved;
+    return *this;
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#else
+
+// Scalar blend of the VX, VY and VZ columns of all four rows. The VW column
+// of this matrix is not written here, whereas the vectorized version blends
+// whole rows.
+inline void LLV4Matrix4::lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w)
+{
+    F32*       dst  = mMatrix[VX];
+    const F32* from = a.mMatrix[VX];
+    const F32* to   = b.mMatrix[VX];
+    dst[VX] = llv4lerp(from[VX], to[VX], w);
+    dst[VY] = llv4lerp(from[VY], to[VY], w);
+    dst[VZ] = llv4lerp(from[VZ], to[VZ], w);
+
+    dst  = mMatrix[VY];
+    from = a.mMatrix[VY];
+    to   = b.mMatrix[VY];
+    dst[VX] = llv4lerp(from[VX], to[VX], w);
+    dst[VY] = llv4lerp(from[VY], to[VY], w);
+    dst[VZ] = llv4lerp(from[VZ], to[VZ], w);
+
+    dst  = mMatrix[VZ];
+    from = a.mMatrix[VZ];
+    to   = b.mMatrix[VZ];
+    dst[VX] = llv4lerp(from[VX], to[VX], w);
+    dst[VY] = llv4lerp(from[VY], to[VY], w);
+    dst[VZ] = llv4lerp(from[VZ], to[VZ], w);
+
+    dst  = mMatrix[VW];
+    from = a.mMatrix[VW];
+    to   = b.mMatrix[VW];
+    dst[VX] = llv4lerp(from[VX], to[VX], w);
+    dst[VY] = llv4lerp(from[VY], to[VY], w);
+    dst[VZ] = llv4lerp(from[VZ], to[VZ], w);
+}
+
+// Scalar o = a * this + VW row, with a treated as a row vector.
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLVector3& o) const
+{
+    const F32 out_x = a.mV[VX] * mMatrix[VX][VX] +
+                      a.mV[VY] * mMatrix[VY][VX] +
+                      a.mV[VZ] * mMatrix[VZ][VX] +
+                      mMatrix[VW][VX];
+
+    const F32 out_y = a.mV[VX] * mMatrix[VX][VY] +
+                      a.mV[VY] * mMatrix[VY][VY] +
+                      a.mV[VZ] * mMatrix[VZ][VY] +
+                      mMatrix[VW][VY];
+
+    const F32 out_z = a.mV[VX] * mMatrix[VX][VZ] +
+                      a.mV[VY] * mMatrix[VY][VZ] +
+                      a.mV[VZ] * mMatrix[VZ][VZ] +
+                      mMatrix[VW][VZ];
+
+    o.setVec(out_x, out_y, out_z);
+}
+
+// Scalar o = a * this + VW row, storing the result in an LLV4Vector3.
+inline void LLV4Matrix4::multiply(const LLVector3 &a, LLV4Vector3& o) const
+{
+    const F32 out_x = a.mV[VX] * mMatrix[VX][VX] +
+                      a.mV[VY] * mMatrix[VY][VX] +
+                      a.mV[VZ] * mMatrix[VZ][VX] +
+                      mMatrix[VW][VX];
+
+    const F32 out_y = a.mV[VX] * mMatrix[VX][VY] +
+                      a.mV[VY] * mMatrix[VY][VY] +
+                      a.mV[VZ] * mMatrix[VZ][VY] +
+                      mMatrix[VW][VY];
+
+    const F32 out_z = a.mV[VX] * mMatrix[VX][VZ] +
+                      a.mV[VY] * mMatrix[VY][VZ] +
+                      a.mV[VZ] * mMatrix[VZ][VZ] +
+                      mMatrix[VW][VZ];
+
+    o.setVec(out_x, out_y, out_z);
+}
+
+// Scalar translate: adds the first three components of vec to row 3.
+inline const LLV4Matrix4& LLV4Matrix4::translate(const LLV4Vector3 &vec)
+{
+    F32* last_row = mMatrix[3];
+    last_row[0] += vec.mV[0];
+    last_row[1] += vec.mV[1];
+    last_row[2] += vec.mV[2];
+    return *this;
+}
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Matrix4
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+#endif
+
+// Copies all sixteen floats of a into this matrix.
+inline const LLV4Matrix4& LLV4Matrix4::operator=(const LLMatrix4& a)
+{
+    const size_t total_bytes = sizeof(F32) * 16;
+    memcpy(mMatrix, a.mMatrix, total_bytes);
+    return *this;
+}
+
+// Transposes this matrix in place and returns it.
+inline const LLV4Matrix4& LLV4Matrix4::transpose()
+{
+#if LL_VECTORIZE && defined(_MM_TRANSPOSE4_PS)
+    _MM_TRANSPOSE4_PS(mV[VX], mV[VY], mV[VZ], mV[VW]);
+#else
+    // Exchange each of the six off-diagonal pairs; the diagonal stays put.
+    F32 held;
+
+    held = mMatrix[0][1]; mMatrix[0][1] = mMatrix[1][0]; mMatrix[1][0] = held;
+    held = mMatrix[0][2]; mMatrix[0][2] = mMatrix[2][0]; mMatrix[2][0] = held;
+    held = mMatrix[0][3]; mMatrix[0][3] = mMatrix[3][0]; mMatrix[3][0] = held;
+
+    held = mMatrix[1][2]; mMatrix[1][2] = mMatrix[2][1]; mMatrix[2][1] = held;
+    held = mMatrix[1][3]; mMatrix[1][3] = mMatrix[3][1]; mMatrix[3][1] = held;
+
+    held = mMatrix[2][3]; mMatrix[2][3] = mMatrix[3][2]; mMatrix[3][2] = held;
+#endif
+    return *this;
+}
+
+// Adds the three components of vec to row 3 of the matrix.
+inline const LLV4Matrix4& LLV4Matrix4::translate(const LLVector3 &vec)
+{
+    F32* last_row = mMatrix[3];
+    last_row[0] += vec.mV[0];
+    last_row[1] += vec.mV[1];
+    last_row[2] += vec.mV[2];
+    return *this;
+}
+
+// Row vector times matrix plus the VW row, always computed with scalar
+// arithmetic.
+inline LLVector3 operator*(const LLVector3 &a, const LLV4Matrix4 &b)
+{
+    const F32 out_x = a.mV[VX] * b.mMatrix[VX][VX] +
+                      a.mV[VY] * b.mMatrix[VY][VX] +
+                      a.mV[VZ] * b.mMatrix[VZ][VX] +
+                      b.mMatrix[VW][VX];
+
+    const F32 out_y = a.mV[VX] * b.mMatrix[VX][VY] +
+                      a.mV[VY] * b.mMatrix[VY][VY] +
+                      a.mV[VZ] * b.mMatrix[VZ][VY] +
+                      b.mMatrix[VW][VY];
+
+    const F32 out_z = a.mV[VX] * b.mMatrix[VX][VZ] +
+                      a.mV[VY] * b.mMatrix[VY][VZ] +
+                      a.mV[VZ] * b.mMatrix[VZ][VZ] +
+                      b.mMatrix[VW][VZ];
+
+    return LLVector3(out_x, out_y, out_z);
+}
+
+
+#endif
diff --git a/indra/llmath/llv4vector3.h b/indra/llmath/llv4vector3.h
new file mode 100644
index 0000000000..7bf8c5ce91
--- /dev/null
+++ b/indra/llmath/llv4vector3.h
@@ -0,0 +1,62 @@
+/**
+ * @file llv4vector3.h
+ * @brief LLV4* class header file - vector processor enabled math
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+#ifndef LL_LLV4VECTOR3_H
+#define LL_LLV4VECTOR3_H
+
+#include "llv4math.h"
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Vector3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+LL_LLV4MATH_ALIGN_PREFIX
+
+// Vector stored as LLV4_NUM_AXIS floats that can also be read and written as
+// one V4F32 value.
+class LLV4Vector3
+{
+public:
+    // Two views of the same storage.
+    union
+    {
+        F32   mV[LLV4_NUM_AXIS];  // element access
+        V4F32 v;                  // whole-vector access
+    };
+
+    enum
+    {
+        ALIGNMENT = 16
+    };
+
+    // Sets the VX, VY and VZ components.
+    void setVec(F32 x, F32 y, F32 z);
+
+    // Sets the components to the single value a.
+    void setVec(F32 a);
+}
+
+LL_LLV4MATH_ALIGN_POSTFIX;
+
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+// LLV4Vector3
+//-----------------------------------------------------------------------------
+//-----------------------------------------------------------------------------
+
+// Writes x, y and z into the VX, VY and VZ slots; the VW slot is untouched.
+inline void LLV4Vector3::setVec(F32 x, F32 y, F32 z)
+{
+    F32* dst = mV;
+    dst[VX] = x;
+    dst[VY] = y;
+    dst[VZ] = z;
+}
+
+// Fills the vector with a. The vectorized build writes all four lanes at
+// once; the scalar build writes only VX, VY and VZ.
+inline void LLV4Vector3::setVec(F32 a)
+{
+#if LL_VECTORIZE
+    const __m128 splat = _mm_set1_ps(a);
+    v = splat;
+#else
+    mV[VX] = a;
+    mV[VY] = a;
+    mV[VZ] = a;
+#endif
+}
+
+#endif
diff --git a/indra/newview/lldrawable.h b/indra/newview/lldrawable.h
index 48c58dbb4c..328a116f59 100644
--- a/indra/newview/lldrawable.h
+++ b/indra/newview/lldrawable.h
@@ -26,6 +26,7 @@
#include "llviewerobject.h"
#include "llrect.h"
+class LLCamera;
class LLDrawPool;
class LLDrawable;
class LLFace;
diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp
index 150943465d..ba4c7e1b20 100644
--- a/indra/newview/llviewerjointmesh.cpp
+++ b/indra/newview/llviewerjointmesh.cpp
@@ -19,6 +19,7 @@
#include "llfasttimer.h"
#include "llagent.h"
+#include "llapr.h"
#include "llbox.h"
#include "lldrawable.h"
#include "lldrawpoolavatar.h"
@@ -29,12 +30,18 @@
#include "llglheaders.h"
#include "lltexlayer.h"
#include "llviewercamera.h"
+#include "llviewercontrol.h"
#include "llviewerimagelist.h"
#include "llviewerjointmesh.h"
#include "llvoavatar.h"
#include "llsky.h"
#include "pipeline.h"
#include "llglslshader.h"
+#include "llmath.h"
+#include "v4math.h"
+#include "m3math.h"
+#include "m4math.h"
+
#if !LL_DARWIN && !LL_LINUX
extern PFNGLWEIGHTPOINTERARBPROC glWeightPointerARB;
@@ -48,6 +55,7 @@ static const U32 sRenderMask = LLVertexBuffer::MAP_VERTEX |
LLVertexBuffer::MAP_NORMAL |
LLVertexBuffer::MAP_TEXCOORD;
+
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// LLViewerJointMesh::LLSkinJoint
@@ -100,6 +108,7 @@ BOOL LLSkinJoint::setupSkinJoint( LLViewerJoint *joint)
return TRUE;
}
+
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// LLViewerJointMesh
@@ -394,9 +403,9 @@ const S32 NUM_AXES = 3;
// rotation Z 0-n
// pivot parent 0-n -- child = n+1
-static LLMatrix4 gJointMat[32];
-static LLMatrix3 gJointRot[32];
-static LLVector4 gJointPivot[32];
+static LLMatrix4 gJointMatUnaligned[32];
+static LLMatrix3 gJointRotUnaligned[32];
+static LLVector4 gJointPivot[32];
//-----------------------------------------------------------------------------
// uploadJointMatrices()
@@ -417,8 +426,8 @@ void LLViewerJointMesh::uploadJointMatrices()
{
joint_mat *= LLDrawPoolAvatar::getModelView();
}
- gJointMat[joint_num] = joint_mat;
- gJointRot[joint_num] = joint_mat.getMat3();
+ gJointMatUnaligned[joint_num] = joint_mat;
+ gJointRotUnaligned[joint_num] = joint_mat.getMat3();
}
BOOL last_pivot_uploaded = FALSE;
@@ -455,8 +464,8 @@ void LLViewerJointMesh::uploadJointMatrices()
{
LLVector3 pivot;
pivot = LLVector3(gJointPivot[i]);
- pivot = pivot * gJointRot[i];
- gJointMat[i].translate(pivot);
+ pivot = pivot * gJointRotUnaligned[i];
+ gJointMatUnaligned[i].translate(pivot);
}
// upload matrices
@@ -467,11 +476,11 @@ void LLViewerJointMesh::uploadJointMatrices()
for (joint_num = 0; joint_num < reference_mesh->mJointRenderData.count(); joint_num++)
{
- gJointMat[joint_num].transpose();
+ gJointMatUnaligned[joint_num].transpose();
for (S32 axis = 0; axis < NUM_AXES; axis++)
{
- F32* vector = gJointMat[joint_num].mMatrix[axis];
+ F32* vector = gJointMatUnaligned[joint_num].mMatrix[axis];
//glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, LL_CHARACTER_MAX_JOINTS_PER_MESH * axis + joint_num+5, (GLfloat*)vector);
U32 offset = LL_CHARACTER_MAX_JOINTS_PER_MESH*axis+joint_num;
memcpy(mat+offset*4, vector, sizeof(GLfloat)*4);
@@ -883,21 +892,9 @@ BOOL LLViewerJointMesh::updateLOD(F32 pixel_area, BOOL activate)
return (valid != activate);
}
-
-void LLViewerJointMesh::updateGeometry()
+// static
+void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh)
{
- if (!(mValid
- && mMesh
- && mFace
- && mMesh->hasWeights()
- && mFace->mVertexBuffer.notNull()
- && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
- {
- return;
- }
-
- uploadJointMatrices();
-
LLStrider<LLVector3> o_vertices;
LLStrider<LLVector3> o_normals;
@@ -938,9 +935,9 @@ void LLViewerJointMesh::updateGeometry()
// No lerp required in this case.
if (w == 1.0f)
{
- gBlendMat = gJointMat[joint+1];
+ gBlendMat = gJointMatUnaligned[joint+1];
o_vertices[bidx] = coords[index] * gBlendMat;
- gBlendRotMat = gJointRot[joint+1];
+ gBlendRotMat = gJointRotUnaligned[joint+1];
o_normals[bidx] = normals[index] * gBlendRotMat;
continue;
}
@@ -948,8 +945,8 @@ void LLViewerJointMesh::updateGeometry()
// Try to keep all the accesses to the matrix data as close
// together as possible. This function is a hot spot on the
// Mac. JC
- LLMatrix4 &m0 = gJointMat[joint+1];
- LLMatrix4 &m1 = gJointMat[joint+0];
+ LLMatrix4 &m0 = gJointMatUnaligned[joint+1];
+ LLMatrix4 &m1 = gJointMatUnaligned[joint+0];
gBlendMat.mMatrix[VX][VX] = lerp(m1.mMatrix[VX][VX], m0.mMatrix[VX][VX], w);
gBlendMat.mMatrix[VX][VY] = lerp(m1.mMatrix[VX][VY], m0.mMatrix[VX][VY], w);
@@ -969,8 +966,8 @@ void LLViewerJointMesh::updateGeometry()
o_vertices[bidx] = coords[index] * gBlendMat;
- LLMatrix3 &n0 = gJointRot[joint+1];
- LLMatrix3 &n1 = gJointRot[joint+0];
+ LLMatrix3 &n0 = gJointRotUnaligned[joint+1];
+ LLMatrix3 &n1 = gJointRotUnaligned[joint+0];
gBlendRotMat.mMatrix[VX][VX] = lerp(n1.mMatrix[VX][VX], n0.mMatrix[VX][VX], w);
gBlendRotMat.mMatrix[VX][VY] = lerp(n1.mMatrix[VX][VY], n0.mMatrix[VX][VY], w);
@@ -988,6 +985,161 @@ void LLViewerJointMesh::updateGeometry()
}
}
+const U32 UPDATE_GEOMETRY_CALL_MASK = 0x1FFF; // 8K samples before overflow
+const U32 UPDATE_GEOMETRY_CALL_OVERFLOW = ~UPDATE_GEOMETRY_CALL_MASK;
+static bool sUpdateGeometryCallPointer = false;
+static F64 sUpdateGeometryGlobalTime = 0.0 ;
+static F64 sUpdateGeometryElapsedTime = 0.0 ;
+static F64 sUpdateGeometryElapsedTimeOff = 0.0 ;
+static F64 sUpdateGeometryElapsedTimeOn = 0.0 ;
+static F64 sUpdateGeometryRunAvgOff[10];
+static F64 sUpdateGeometryRunAvgOn[10];
+static U32 sUpdateGeometryRunCount = 0 ;
+static U32 sUpdateGeometryCalls = 0 ;
+static U32 sUpdateGeometryLastProcessor = 0 ;
+void (*LLViewerJointMesh::sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);
+
+void LLViewerJointMesh::updateGeometry()
+{
+ extern BOOL gVectorizePerfTest;
+ extern U32 gVectorizeProcessor;
+
+ if (!(mValid
+ && mMesh
+ && mFace
+ && mMesh->hasWeights()
+ && mFace->mVertexBuffer.notNull()
+ && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0))
+ {
+ return;
+ }
+
+ if (!gVectorizePerfTest)
+ {
+ // Once we've measured performance, just run the specified
+ // code version.
+ if(sUpdateGeometryFunc == updateGeometryOriginal)
+ uploadJointMatrices();
+ sUpdateGeometryFunc(mFace, mMesh);
+ }
+ else
+ {
+ // At startup, measure the amount of time in skinning and choose
+ // the fastest one.
+ LLTimer ug_timer ;
+
+ if (sUpdateGeometryCallPointer)
+ {
+ if(sUpdateGeometryFunc == updateGeometryOriginal)
+ uploadJointMatrices();
+ // call accelerated version for this processor
+ sUpdateGeometryFunc(mFace, mMesh);
+ }
+ else
+ {
+ uploadJointMatrices();
+ updateGeometryOriginal(mFace, mMesh);
+ }
+
+ sUpdateGeometryElapsedTime += ug_timer.getElapsedTimeF64();
+ ++sUpdateGeometryCalls;
+ if(0 != (sUpdateGeometryCalls & UPDATE_GEOMETRY_CALL_OVERFLOW))
+ {
+ F64 time_since_app_start = ug_timer.getElapsedSeconds();
+ if(sUpdateGeometryGlobalTime == 0.0
+ || sUpdateGeometryLastProcessor != gVectorizeProcessor)
+ {
+ sUpdateGeometryGlobalTime = time_since_app_start;
+ sUpdateGeometryElapsedTime = 0;
+ sUpdateGeometryCalls = 0;
+ sUpdateGeometryRunCount = 0;
+ sUpdateGeometryLastProcessor = gVectorizeProcessor;
+ sUpdateGeometryCallPointer = false;
+ return;
+ }
+ F64 percent_time_in_function =
+ ( sUpdateGeometryElapsedTime * 100.0 ) / ( time_since_app_start - sUpdateGeometryGlobalTime ) ;
+ sUpdateGeometryGlobalTime = time_since_app_start;
+ if (!sUpdateGeometryCallPointer)
+ {
+ // First set of run data is with vectorization off.
+ sUpdateGeometryCallPointer = true;
+ llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+ << "vectorize off " << percent_time_in_function
+ << "% of time with "
+ << (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+ << " seconds per call "
+ << llendl;
+ sUpdateGeometryRunAvgOff[sUpdateGeometryRunCount] = percent_time_in_function;
+ sUpdateGeometryElapsedTimeOff += sUpdateGeometryElapsedTime;
+ sUpdateGeometryCalls = 0;
+ }
+ else
+ {
+ // Second set of run data is with vectorization on.
+ sUpdateGeometryCallPointer = false;
+ llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = "
+ << "VEC on " << percent_time_in_function
+ << "% of time with "
+ << (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls)
+ << " seconds per call "
+ << llendl;
+ sUpdateGeometryRunAvgOn[sUpdateGeometryRunCount] = percent_time_in_function ;
+ sUpdateGeometryElapsedTimeOn += sUpdateGeometryElapsedTime;
+
+ sUpdateGeometryCalls = 0;
+ sUpdateGeometryRunCount++;
+ F64 a = 0.0, b = 0.0;
+ for(U32 i = 0; i<sUpdateGeometryRunCount; i++)
+ {
+ a += sUpdateGeometryRunAvgOff[i];
+ b += sUpdateGeometryRunAvgOn[i];
+ }
+ a /= sUpdateGeometryRunCount;
+ b /= sUpdateGeometryRunCount;
+ F64 perf_boost = ( sUpdateGeometryElapsedTimeOff - sUpdateGeometryElapsedTimeOn ) / sUpdateGeometryElapsedTimeOn;
+ llinfos << "run averages (" << (F64)sUpdateGeometryRunCount
+ << "/10) vectorize off " << a
+ << "% : vectorize type " << gVectorizeProcessor
+ << " " << b
+ << "% : performance boost "
+ << perf_boost * 100.0
+ << "%"
+ << llendl ;
+ if(sUpdateGeometryRunCount == 10)
+ {
+ // In case user runs test again, force reset of data on
+ // next run.
+ sUpdateGeometryGlobalTime = 0.0;
+
+ // We have data now on which version is faster. Switch to that
+ // code and save the data for next run.
+ gVectorizePerfTest = FALSE;
+ gSavedSettings.setBOOL("VectorizePerfTest", FALSE);
+
+ if (perf_boost > 0.0)
+ {
+ llinfos << "Vectorization improves avatar skinning performance, "
+ << "keeping on for future runs."
+ << llendl;
+ gSavedSettings.setBOOL("VectorizeSkin", TRUE);
+ }
+ else
+ {
+ // SIMD decreases performance, fall back to original code
+ llinfos << "Vectorization decreases avatar skinning performance, "
+ << "switching back to original code."
+ << llendl;
+
+ gSavedSettings.setBOOL("VectorizeSkin", FALSE);
+ }
+ }
+ }
+ sUpdateGeometryElapsedTime = 0.0f;
+ }
+ }
+}
+
void LLViewerJointMesh::dump()
{
if (mValid)
diff --git a/indra/newview/llviewerjointmesh.h b/indra/newview/llviewerjointmesh.h
index b6fd8afcdb..992c3656a1 100644
--- a/indra/newview/llviewerjointmesh.h
+++ b/indra/newview/llviewerjointmesh.h
@@ -126,6 +126,22 @@ public:
/*virtual*/ BOOL isAnimatable() { return FALSE; }
void writeCAL3D(apr_file_t* fp, S32 material_num, LLCharacter* characterp);
+
+ // Avatar vertex skinning is a significant performance issue on computers
+ // with avatar vertex programs turned off (for example, most Macs). We
+ // therefore have custom versions that use SIMD instructions.
+ //
+ // These functions require compiler options for SSE2, SSE, or neither, and
+ // hence are contained in separate individual .cpp files. JC
+ static void updateGeometryOriginal(LLFace* face, LLPolyMesh* mesh);
+ // generic vector code, used for Altivec
+ static void updateGeometryVectorized(LLFace* face, LLPolyMesh* mesh);
+ static void updateGeometrySSE(LLFace* face, LLPolyMesh* mesh);
+ static void updateGeometrySSE2(LLFace* face, LLPolyMesh* mesh);
+
+ // Use a function pointer to indicate which version we are running.
+ static void (*sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh);
+
private:
// Allocate skin data
BOOL allocateSkinData( U32 numSkinJoints );
diff --git a/indra/newview/llviewerjointmesh_sse.cpp b/indra/newview/llviewerjointmesh_sse.cpp
new file mode 100644
index 0000000000..c4f8ff4fa8
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse.cpp
@@ -0,0 +1,94 @@
+/**
+ * @file llviewerjointmesh_sse.cpp
+ * @brief LLV4 class implementation with LLViewerJointMesh class
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llv4math.h" // for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "v3math.h"
+
+// *NOTE: SSE must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4 sJointMat[32];
+
+inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{
+ m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);
+ m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+ m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+ m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY]));
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ]));
+}
+
+// static
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)
+{
+ LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+ //upload joint pivots/matrices
+ for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )
+ {
+ matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix,
+ joint_data[j]->mSkinJoint ?
+ joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+ : joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset);
+ }
+
+ F32 weight = F32_MAX;
+ LLV4Matrix4 blend_mat;
+
+ LLStrider<LLVector3> o_vertices;
+ LLStrider<LLVector3> o_normals;
+
+ LLVertexBuffer *buffer = face->mVertexBuffer;
+ buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset);
+ buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset);
+
+ const F32* weights = mesh->getWeights();
+ const LLVector3* coords = mesh->getCoords();
+ const LLVector3* normals = mesh->getNormals();
+ for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+ {
+ if( weight != weights[index])
+ {
+ S32 joint = llfloor(weight = weights[index]);
+ blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);
+ }
+ blend_mat.multiply(coords[index], o_vertices[index]);
+ ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);
+ }
+}
+
+#else
+
+void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh)
+{
+ LLViewerJointMesh::updateGeometryVectorized(face, mesh);
+ return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_sse2.cpp b/indra/newview/llviewerjointmesh_sse2.cpp
new file mode 100644
index 0000000000..cae602ac14
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_sse2.cpp
@@ -0,0 +1,96 @@
+/**
+ * @file llviewerjointmesh_sse2.cpp
+ * @brief LLV4 class implementation with LLViewerJointMesh class
+ *
+ * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+
+// Do not use precompiled headers, because we need to build this file with
+// SSE2 support, but not the precompiled header file. JC
+#include "linden_common.h"
+
+#include "llviewerjointmesh.h"
+
+// project includes
+#include "llface.h"
+#include "llpolymesh.h"
+
+// library includes
+#include "lldarray.h"
+#include "llstrider.h"
+#include "llv4math.h" // for LL_VECTORIZE
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+#include "m4math.h"
+#include "v3math.h"
+
+// *NOTE: SSE2 must be enabled for this module
+
+#if LL_VECTORIZE
+
+static LLV4Matrix4 sJointMat[32];
+
+inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j)
+{
+ m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]);
+ m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]);
+ m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]);
+ m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]);
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY]));
+ m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ]));
+}
+
+// static
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)
+{
+ LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+
+ //upload joint pivots/matrices
+ for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j )
+ {
+ matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix,
+ joint_data[j]->mSkinJoint ?
+ joint_data[j]->mSkinJoint->mRootToJointSkinOffset
+ : joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset);
+ }
+
+ F32 weight = F32_MAX;
+ LLV4Matrix4 blend_mat;
+
+ LLStrider<LLVector3> o_vertices;
+ LLStrider<LLVector3> o_normals;
+
+ LLVertexBuffer *buffer = face->mVertexBuffer;
+ buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset);
+ buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset);
+
+ const F32* weights = mesh->getWeights();
+ const LLVector3* coords = mesh->getCoords();
+ const LLVector3* normals = mesh->getNormals();
+ for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+ {
+ if( weight != weights[index])
+ {
+ S32 joint = llfloor(weight = weights[index]);
+ blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);
+ }
+ blend_mat.multiply(coords[index], o_vertices[index]);
+ ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);
+ }
+}
+
+#else
+
+void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh)
+{
+ LLViewerJointMesh::updateGeometryVectorized(face, mesh);
+ return;
+}
+
+#endif
diff --git a/indra/newview/llviewerjointmesh_vec.cpp b/indra/newview/llviewerjointmesh_vec.cpp
new file mode 100644
index 0000000000..5b1e080435
--- /dev/null
+++ b/indra/newview/llviewerjointmesh_vec.cpp
@@ -0,0 +1,76 @@
+/**
+ * @file llviewerjointmesh_vec.cpp
+ * @brief LLV4 math class implementation with LLViewerJointMesh class
+ *
+ * Copyright (c) 2001-$CurrentYear$, Linden Research, Inc.
+ * $License$
+ */
+
+//-----------------------------------------------------------------------------
+// Header Files
+//-----------------------------------------------------------------------------
+#include "llviewerprecompiledheaders.h"
+
+#include "llviewerjointmesh.h"
+
+#include "llface.h"
+#include "llpolymesh.h"
+#include "llv4math.h"
+#include "llv4matrix3.h"
+#include "llv4matrix4.h"
+
+// *NOTE: SSE must be disabled for this module
+
+#if LL_VECTORIZE
+#error This module requires vectorization (i.e. SSE) mode to be disabled.
+#endif
+
+static LLV4Matrix4 sJointMat[32];
+
+// static
+void LLViewerJointMesh::updateGeometryVectorized(LLFace *face, LLPolyMesh *mesh)
+{
+ LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
+ S32 j, joint_num, joint_end = joint_data.count();
+ LLV4Vector3 pivot;
+
+ //upload joint pivots/matrices
+ for(j = joint_num = 0; joint_num < joint_end ; ++joint_num )
+ {
+ LLSkinJoint *sj;
+ const LLMatrix4 * wm = joint_data[joint_num]->mWorldMatrix;
+ if (NULL == (sj = joint_data[joint_num]->mSkinJoint))
+ {
+ sj = joint_data[++joint_num]->mSkinJoint;
+ ((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToParentJointSkinOffset, pivot);
+ sJointMat[j++].translate(pivot);
+ wm = joint_data[joint_num]->mWorldMatrix;
+ }
+ ((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToJointSkinOffset, pivot);
+ sJointMat[j++].translate(pivot);
+ }
+
+ F32 weight = F32_MAX;
+ LLV4Matrix4 blend_mat;
+
+ LLStrider<LLVector3> o_vertices;
+ LLStrider<LLVector3> o_normals;
+
+ LLVertexBuffer *buffer = face->mVertexBuffer;
+ buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset);
+ buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset);
+
+ const F32* weights = mesh->getWeights();
+ const LLVector3* coords = mesh->getCoords();
+ const LLVector3* normals = mesh->getNormals();
+ for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index)
+ {
+ if( weight != weights[index])
+ {
+ S32 joint = llfloor(weight = weights[index]);
+ blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);
+ }
+ blend_mat.multiply(coords[index], o_vertices[index]);
+ ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]);
+ }
+}
diff --git a/indra/newview/llviewermenu.cpp b/indra/newview/llviewermenu.cpp
index d18859e356..7ad4f1d70b 100644
--- a/indra/newview/llviewermenu.cpp
+++ b/indra/newview/llviewermenu.cpp
@@ -960,6 +960,7 @@ extern BOOL gDebugClicks;
extern BOOL gDebugWindowProc;
extern BOOL gDebugTextEditorTips;
extern BOOL gDebugSelectMgr;
+extern BOOL gVectorizePerfTest;
void init_debug_ui_menu(LLMenuGL* menu)
{
@@ -1169,6 +1170,8 @@ void init_debug_rendering_menu(LLMenuGL* menu)
(void*)"ShowDepthBuffer"));
sub_menu->append(new LLMenuItemToggleGL("Show Select Buffer", &gDebugSelect));
+ sub_menu->append(new LLMenuItemToggleGL("Vectorize Perf Test", &gVectorizePerfTest));
+
sub_menu = new LLMenuGL("Render Tests");
sub_menu->append(new LLMenuItemCheckGL("Camera Offset",