diff options
author | James Cook <james@lindenlab.com> | 2007-07-02 23:52:40 +0000 |
---|---|---|
committer | James Cook <james@lindenlab.com> | 2007-07-02 23:52:40 +0000 |
commit | 1a33bc19b4ce94ab210749911dff14409b4454dd (patch) | |
tree | b674d97d37240a29c0a6671adfe950a506ef0ea4 /indra | |
parent | e5124431b54d4342d4677371fccca5bc7250c079 (diff) |
svn merge -r 62595:62596 and 62598:63308 sse-skinning-3 for faster software avatar rendering. Visual Studio 2005 project file fixed pending.
Diffstat (limited to 'indra')
-rw-r--r-- | indra/llcommon/llpreprocessor.h | 22 | ||||
-rw-r--r-- | indra/llcommon/llprocessor.cpp | 8 | ||||
-rw-r--r-- | indra/llcommon/llprocessor.h | 1 | ||||
-rw-r--r-- | indra/llcommon/llskiplist.h | 12 | ||||
-rw-r--r-- | indra/llcommon/llsys.cpp | 25 | ||||
-rw-r--r-- | indra/llcommon/llsys.h | 12 | ||||
-rw-r--r-- | indra/llmath/llv4math.h | 101 | ||||
-rw-r--r-- | indra/llmath/llv4matrix3.h | 202 | ||||
-rw-r--r-- | indra/llmath/llv4matrix4.h | 231 | ||||
-rw-r--r-- | indra/llmath/llv4vector3.h | 62 | ||||
-rw-r--r-- | indra/newview/lldrawable.h | 1 | ||||
-rw-r--r-- | indra/newview/llviewerjointmesh.cpp | 210 | ||||
-rw-r--r-- | indra/newview/llviewerjointmesh.h | 16 | ||||
-rw-r--r-- | indra/newview/llviewerjointmesh_sse.cpp | 94 | ||||
-rw-r--r-- | indra/newview/llviewerjointmesh_sse2.cpp | 96 | ||||
-rw-r--r-- | indra/newview/llviewerjointmesh_vec.cpp | 76 | ||||
-rw-r--r-- | indra/newview/llviewermenu.cpp | 3 |
17 files changed, 1120 insertions, 52 deletions
diff --git a/indra/llcommon/llpreprocessor.h b/indra/llcommon/llpreprocessor.h index 0882472242..4389fd3e30 100644 --- a/indra/llcommon/llpreprocessor.h +++ b/indra/llcommon/llpreprocessor.h @@ -51,12 +51,22 @@ #define MOZILLA_INTERNAL_API 1 #endif -// Deal with minor differences on Unixy OSes. -#if LL_DARWIN || LL_LINUX +// Figure out differences between compilers +#if defined(__GNUC__) #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) + #ifndef LL_GNUC + #define LL_GNUC 1 + #endif +#elif defined(__MSVC_VER__) || defined(_MSC_VER) + #ifndef LL_MSVC + #define LL_MSVC 1 + #endif +#endif +// Deal with minor differences on Unixy OSes. +#if LL_DARWIN || LL_LINUX // Different name, same functionality. #define stricmp strcasecmp #define strnicmp strncasecmp @@ -69,9 +79,9 @@ #endif // Deal with the differeneces on Windows -#if LL_WINDOWS +#if LL_MSVC #define snprintf safe_snprintf /* Flawfinder: ignore */ -#endif // LL_WINDOWS +#endif // LL_MSVC // Static linking with apr on windows needs to be declared. #ifdef LL_WINDOWS @@ -90,7 +100,7 @@ // Deal with VC6 problems -#if defined(LL_WINDOWS) +#if LL_MSVC #pragma warning( 3 : 4701 ) // "local variable used without being initialized" Treat this as level 3, not level 4. #pragma warning( 3 : 4702 ) // "unreachable code" Treat this as level 3, not level 4. #pragma warning( 3 : 4189 ) // "local variable initialized but not referenced" Treat this as level 3, not level 4. @@ -101,6 +111,6 @@ #pragma warning( disable : 4503 ) // 'decorated name length exceeded, name was truncated'. Does not seem to affect compilation. 
#pragma warning( disable : 4800 ) // 'BOOL' : forcing value to bool 'true' or 'false' (performance warning) #pragma warning( disable : 4996 ) // warning: deprecated -#endif // LL_WINDOWS +#endif // LL_MSVC #endif // not LL_LINDEN_PREPROCESSOR_H diff --git a/indra/llcommon/llprocessor.cpp b/indra/llcommon/llprocessor.cpp index 00f4a13c39..bcabb47a66 100644 --- a/indra/llcommon/llprocessor.cpp +++ b/indra/llcommon/llprocessor.cpp @@ -1518,6 +1518,7 @@ void CProcessor::GetStandardProcessorExtensions() CPUInfo._Ext.FXSR_FastStreamingSIMD_ExtensionsSaveRestore = CheckBit(edxreg, 24); CPUInfo._Ext.SSE_StreamingSIMD_Extensions = CheckBit(edxreg, 25); CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = CheckBit(edxreg, 26); + CPUInfo._Ext.Altivec_Extensions = false; CPUInfo._Ext.SS_SelfSnoop = CheckBit(edxreg, 27); CPUInfo._Ext.HT_HyperThreading = CheckBit(edxreg, 28); CPUInfo._Ext.HT_HyterThreadingSiblings = (ebxreg >> 16) & 0xFF; @@ -1871,11 +1872,12 @@ const ProcessorInfo *CProcessor::GetCPUInfo() break; } - // It's kinda like MMX or SSE... 
CPUInfo._Ext.EMMX_MultimediaExtensions = CPUInfo._Ext.MMX_MultimediaExtensions = CPUInfo._Ext.SSE_StreamingSIMD_Extensions = - CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = hasFeature("hw.optional.altivec"); + CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = false; + + CPUInfo._Ext.Altivec_Extensions = hasFeature("hw.optional.altivec"); #endif @@ -1892,6 +1894,7 @@ const ProcessorInfo *CProcessor::GetCPUInfo() CPUInfo._Ext.MMX_MultimediaExtensions = hasFeature("hw.optional.mmx"); CPUInfo._Ext.SSE_StreamingSIMD_Extensions = hasFeature("hw.optional.sse"); CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions = hasFeature("hw.optional.sse2"); + CPUInfo._Ext.Altivec_Extensions = false; CPUInfo._Ext.AA64_AMD64BitArchitecture = hasFeature("hw.optional.x86_64"); #endif @@ -2045,6 +2048,7 @@ bool CProcessor::CPUInfoToText(char *strBuffer, unsigned int uiMaxLen) BOOLADD("SS Self Snoop: ", CPUInfo._Ext.SS_SelfSnoop); BOOLADD("SSE Streaming SIMD Extensions: ", CPUInfo._Ext.SSE_StreamingSIMD_Extensions); BOOLADD("SSE2 Streaming SIMD 2 Extensions: ", CPUInfo._Ext.SSE2_StreamingSIMD2_Extensions); + BOOLADD("ALTVEC Altivec Extensions: ", CPUInfo._Ext.Altivec_Extensions); BOOLADD("TM Thermal Monitor: ", CPUInfo._Ext.TM_ThermalMonitor); BOOLADD("TSC Time Stamp Counter: ", CPUInfo._Ext.TSC_TimeStampCounter); BOOLADD("VME Virtual 8086 Mode Enhancements: ", CPUInfo._Ext.VME_Virtual8086ModeEnhancements); diff --git a/indra/llcommon/llprocessor.h b/indra/llcommon/llprocessor.h index 8453263f9d..9060e8aa95 100644 --- a/indra/llcommon/llprocessor.h +++ b/indra/llcommon/llprocessor.h @@ -51,6 +51,7 @@ typedef struct ProcessorExtensions bool FXSR_FastStreamingSIMD_ExtensionsSaveRestore; bool SSE_StreamingSIMD_Extensions; bool SSE2_StreamingSIMD2_Extensions; + bool Altivec_Extensions; bool SS_SelfSnoop; bool HT_HyperThreading; unsigned int HT_HyterThreadingSiblings; diff --git a/indra/llcommon/llskiplist.h b/indra/llcommon/llskiplist.h index ed1aa1f0aa..4676fb8f18 100644 --- a/indra/llcommon/llskiplist.h 
+++ b/indra/llcommon/llskiplist.h @@ -8,11 +8,10 @@ #ifndef LL_LLSKIPLIST_H #define LL_LLSKIPLIST_H -#include "llerror.h" -//#include "vmath.h" +#include "llrand.h" // NOTA BENE: Insert first needs to be < NOT <= - +// Binary depth must be >= 2 template <class DATA_TYPE, S32 BINARY_DEPTH = 10> class LLSkipList { @@ -124,14 +123,11 @@ private: // Implementation // + +// Binary depth must be >= 2 template <class DATA_TYPE, S32 BINARY_DEPTH> inline void LLSkipList<DATA_TYPE, BINARY_DEPTH>::init() { - if (BINARY_DEPTH < 2) - { - llerrs << "Trying to create skip list with too little depth, " - "must be 2 or greater" << llendl; - } S32 i; for (i = 0; i < BINARY_DEPTH; i++) { diff --git a/indra/llcommon/llsys.cpp b/indra/llcommon/llsys.cpp index 6f0bda4b71..90cc374ade 100644 --- a/indra/llcommon/llsys.cpp +++ b/indra/llcommon/llsys.cpp @@ -284,12 +284,33 @@ LLCPUInfo::LLCPUInfo() { CProcessor proc; const ProcessorInfo* info = proc.GetCPUInfo(); - mHasSSE = (info->_Ext.SSE_StreamingSIMD_Extensions != 0); - mHasSSE2 = (info->_Ext.SSE2_StreamingSIMD2_Extensions != 0); + // proc.WriteInfoTextFile("procInfo.txt"); + mHasSSE = info->_Ext.SSE_StreamingSIMD_Extensions; + mHasSSE2 = info->_Ext.SSE2_StreamingSIMD2_Extensions; + mHasAltivec = info->_Ext.Altivec_Extensions; mCPUMhz = (S32)(proc.GetCPUFrequency(50)/1000000.0); mFamily.assign( info->strFamily ); } +bool LLCPUInfo::hasAltivec() const +{ + return mHasAltivec; +} + +bool LLCPUInfo::hasSSE() const +{ + return mHasSSE; +} + +bool LLCPUInfo::hasSSE2() const +{ + return mHasSSE2; +} + +S32 LLCPUInfo::getMhz() const +{ + return mCPUMhz; +} std::string LLCPUInfo::getCPUString() const { diff --git a/indra/llcommon/llsys.h b/indra/llcommon/llsys.h index 7808a97b80..4b6fbe149b 100644 --- a/indra/llcommon/llsys.h +++ b/indra/llcommon/llsys.h @@ -52,16 +52,18 @@ public: std::string getCPUString() const; - BOOL hasSSE() const { return mHasSSE; } - BOOL hasSSE2() const { return mHasSSE2; } - S32 getMhz() const { return mCPUMhz; } + 
bool hasAltivec() const; + bool hasSSE() const; + bool hasSSE2() const; + S32 getMhz() const; // Family is "AMD Duron" or "Intel Pentium Pro" const std::string& getFamily() const { return mFamily; } private: - BOOL mHasSSE; - BOOL mHasSSE2; + bool mHasSSE; + bool mHasSSE2; + bool mHasAltivec; S32 mCPUMhz; std::string mFamily; }; diff --git a/indra/llmath/llv4math.h b/indra/llmath/llv4math.h new file mode 100644 index 0000000000..4a299716b1 --- /dev/null +++ b/indra/llmath/llv4math.h @@ -0,0 +1,101 @@ +/** + * @file llviewerjointmesh.cpp + * @brief LLV4* class header file - vector processor enabled math + * + * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc. + * $License$ + */ + +#ifndef LL_LLV4MATH_H +#define LL_LLV4MATH_H + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4MATH - GNUC +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#if LL_GNUC && __GNUC__ >= 4 && __SSE__ + +#define LL_VECTORIZE 1 + +#if LL_DARWIN + +#include <Accelerate/Accelerate.h> +#include <xmmintrin.h> +typedef vFloat V4F32; + +#else + +#include <xmmintrin.h> +typedef float V4F32 __attribute__((vector_size(16))); + +#endif + +#endif +#if LL_GNUC + +#define LL_LLV4MATH_ALIGN_PREFIX +#define LL_LLV4MATH_ALIGN_POSTFIX __attribute__((aligned(16))) + +#endif + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4MATH - MSVC +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#if LL_MSVC && _M_IX86_FP + +#define LL_VECTORIZE 1 + +#include <xmmintrin.h> + +typedef __m128 V4F32; + +#endif +#if LL_MSVC + +#define 
LL_LLV4MATH_ALIGN_PREFIX __declspec(align(16)) +#define LL_LLV4MATH_ALIGN_POSTFIX + +#endif + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4MATH - default - no vectorization +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#if !LL_VECTORIZE + +#define LL_VECTORIZE 0 + +struct V4F32 { F32 __pad__[4]; }; + +inline F32 llv4lerp(F32 a, F32 b, F32 w) { return ( b - a ) * w + a; } + +#endif + +#ifndef LL_LLV4MATH_ALIGN_PREFIX +# define LL_LLV4MATH_ALIGN_PREFIX +#endif +#ifndef LL_LLV4MATH_ALIGN_POSTFIX +# define LL_LLV4MATH_ALIGN_POSTFIX +#endif + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4MATH +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + + +#define LLV4_NUM_AXIS 4 + +class LLV4Vector3; +class LLV4Matrix3; +class LLV4Matrix4; + +#endif diff --git a/indra/llmath/llv4matrix3.h b/indra/llmath/llv4matrix3.h new file mode 100644 index 0000000000..a273abe496 --- /dev/null +++ b/indra/llmath/llv4matrix3.h @@ -0,0 +1,202 @@ +/** + * @file llviewerjointmesh.cpp + * @brief LLV4* class header file - vector processor enabled math + * + * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc. 
+ * $License$ + */ + +#ifndef LL_LLV4MATRIX3_H +#define LL_LLV4MATRIX3_H + +#include "llv4math.h" +#include "llv4vector3.h" +#include "m3math.h" // for operator LLMatrix3() + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix3 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +LL_LLV4MATH_ALIGN_PREFIX + +class LLV4Matrix3 +{ +public: + union { + F32 mMatrix[LLV4_NUM_AXIS][LLV4_NUM_AXIS]; + V4F32 mV[LLV4_NUM_AXIS]; + }; + + void lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w); + void multiply(const LLVector3 &a, LLVector3& out) const; + void multiply(const LLVector4 &a, LLV4Vector3& out) const; + void multiply(const LLVector3 &a, LLV4Vector3& out) const; + + const LLV4Matrix3& transpose(); + const LLV4Matrix3& operator=(const LLMatrix3& a); + + operator LLMatrix3() const { return (reinterpret_cast<const LLMatrix4*>(const_cast<const F32*>(&mMatrix[0][0])))->getMat3(); } + + friend LLVector3 operator*(const LLVector3& a, const LLV4Matrix3& b); +} + +LL_LLV4MATH_ALIGN_POSTFIX; + + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix3 - SSE +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#if LL_VECTORIZE + +inline void LLV4Matrix3::lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w) +{ + __m128 vw = _mm_set1_ps(w); + mV[VX] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VX], a.mV[VX]), vw), a.mV[VX]); // ( b - a ) * w + a + mV[VY] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VY], a.mV[VY]), vw), a.mV[VY]); + mV[VZ] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VZ], a.mV[VZ]), vw), 
a.mV[VZ]); +} + +inline void LLV4Matrix3::multiply(const LLVector3 &a, LLVector3& o) const +{ + LLV4Vector3 j; + j.v = _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]); // ( ax * vx ) + ... + j.v = _mm_add_ps(j.v , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY])); + j.v = _mm_add_ps(j.v , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ])); + o.setVec(j.mV); +} + +inline void LLV4Matrix3::multiply(const LLVector4 &a, LLV4Vector3& o) const +{ + o.v = _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]); // ( ax * vx ) + ... + o.v = _mm_add_ps(o.v , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY])); + o.v = _mm_add_ps(o.v , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ])); +} + +inline void LLV4Matrix3::multiply(const LLVector3 &a, LLV4Vector3& o) const +{ + o.v = _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX]); // ( ax * vx ) + ... + o.v = _mm_add_ps(o.v , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY])); + o.v = _mm_add_ps(o.v , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ])); +} + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix3 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#else + +inline void LLV4Matrix3::lerp(const LLV4Matrix3 &a, const LLV4Matrix3 &b, const F32 &w) +{ + mMatrix[VX][VX] = llv4lerp(a.mMatrix[VX][VX], b.mMatrix[VX][VX], w); + mMatrix[VX][VY] = llv4lerp(a.mMatrix[VX][VY], b.mMatrix[VX][VY], w); + mMatrix[VX][VZ] = llv4lerp(a.mMatrix[VX][VZ], b.mMatrix[VX][VZ], w); + + mMatrix[VY][VX] = llv4lerp(a.mMatrix[VY][VX], b.mMatrix[VY][VX], w); + mMatrix[VY][VY] = llv4lerp(a.mMatrix[VY][VY], b.mMatrix[VY][VY], w); + mMatrix[VY][VZ] = llv4lerp(a.mMatrix[VY][VZ], b.mMatrix[VY][VZ], w); + + mMatrix[VZ][VX] = llv4lerp(a.mMatrix[VZ][VX], b.mMatrix[VZ][VX], w); + mMatrix[VZ][VY] = llv4lerp(a.mMatrix[VZ][VY], b.mMatrix[VZ][VY], w); + mMatrix[VZ][VZ] = llv4lerp(a.mMatrix[VZ][VZ], 
b.mMatrix[VZ][VZ], w); +} + +inline void LLV4Matrix3::multiply(const LLVector3 &a, LLVector3& o) const +{ + o.setVec( a.mV[VX] * mMatrix[VX][VX] + + a.mV[VY] * mMatrix[VY][VX] + + a.mV[VZ] * mMatrix[VZ][VX], + + a.mV[VX] * mMatrix[VX][VY] + + a.mV[VY] * mMatrix[VY][VY] + + a.mV[VZ] * mMatrix[VZ][VY], + + a.mV[VX] * mMatrix[VX][VZ] + + a.mV[VY] * mMatrix[VY][VZ] + + a.mV[VZ] * mMatrix[VZ][VZ]); +} + +inline void LLV4Matrix3::multiply(const LLVector4 &a, LLV4Vector3& o) const +{ + o.setVec( a.mV[VX] * mMatrix[VX][VX] + + a.mV[VY] * mMatrix[VY][VX] + + a.mV[VZ] * mMatrix[VZ][VX], + + a.mV[VX] * mMatrix[VX][VY] + + a.mV[VY] * mMatrix[VY][VY] + + a.mV[VZ] * mMatrix[VZ][VY], + + a.mV[VX] * mMatrix[VX][VZ] + + a.mV[VY] * mMatrix[VY][VZ] + + a.mV[VZ] * mMatrix[VZ][VZ]); +} + +inline void LLV4Matrix3::multiply(const LLVector3 &a, LLV4Vector3& o) const +{ + o.setVec( a.mV[VX] * mMatrix[VX][VX] + + a.mV[VY] * mMatrix[VY][VX] + + a.mV[VZ] * mMatrix[VZ][VX], + + a.mV[VX] * mMatrix[VX][VY] + + a.mV[VY] * mMatrix[VY][VY] + + a.mV[VZ] * mMatrix[VZ][VY], + + a.mV[VX] * mMatrix[VX][VZ] + + a.mV[VY] * mMatrix[VY][VZ] + + a.mV[VZ] * mMatrix[VZ][VZ]); +} + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix3 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#endif + +inline const LLV4Matrix3& LLV4Matrix3::transpose() +{ +#if LL_VECTORIZE && defined(_MM_TRANSPOSE4_PS) + _MM_TRANSPOSE4_PS(mV[VX], mV[VY], mV[VZ], mV[VW]); + return *this; +#else + F32 temp; + temp = mMatrix[VX][VY]; mMatrix[VX][VY] = mMatrix[VY][VX]; mMatrix[VY][VX] = temp; + temp = mMatrix[VX][VZ]; mMatrix[VX][VZ] = mMatrix[VZ][VX]; mMatrix[VZ][VX] = temp; + temp = mMatrix[VY][VZ]; mMatrix[VY][VZ] = mMatrix[VZ][VY]; mMatrix[VZ][VY] = temp; +#endif + return *this; +} + +inline const 
LLV4Matrix3& LLV4Matrix3::operator=(const LLMatrix3& a) +{ + memcpy(mMatrix[VX], a.mMatrix[VX], sizeof(F32) * 3 ); + memcpy(mMatrix[VY], a.mMatrix[VY], sizeof(F32) * 3 ); + memcpy(mMatrix[VZ], a.mMatrix[VZ], sizeof(F32) * 3 ); + return *this; +} + +inline LLVector3 operator*(const LLVector3& a, const LLV4Matrix3& b) +{ + return LLVector3( + a.mV[VX] * b.mMatrix[VX][VX] + + a.mV[VY] * b.mMatrix[VY][VX] + + a.mV[VZ] * b.mMatrix[VZ][VX], + + a.mV[VX] * b.mMatrix[VX][VY] + + a.mV[VY] * b.mMatrix[VY][VY] + + a.mV[VZ] * b.mMatrix[VZ][VY], + + a.mV[VX] * b.mMatrix[VX][VZ] + + a.mV[VY] * b.mMatrix[VY][VZ] + + a.mV[VZ] * b.mMatrix[VZ][VZ] ); +} + +#endif diff --git a/indra/llmath/llv4matrix4.h b/indra/llmath/llv4matrix4.h new file mode 100644 index 0000000000..0673f6fa7d --- /dev/null +++ b/indra/llmath/llv4matrix4.h @@ -0,0 +1,231 @@ +/** + * @file llviewerjointmesh.cpp + * @brief LLV4* class header file - vector processor enabled math + * + * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc. 
+ * $License$ + */ + +#ifndef LL_LLV4MATRIX4_H +#define LL_LLV4MATRIX4_H + +#include "llv4math.h" +#include "llv4matrix3.h" // just for operator LLV4Matrix3() +#include "llv4vector3.h" + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix4 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +LL_LLV4MATH_ALIGN_PREFIX + +class LLV4Matrix4 +{ +public: + union { + F32 mMatrix[LLV4_NUM_AXIS][LLV4_NUM_AXIS]; + V4F32 mV[LLV4_NUM_AXIS]; + }; + + void lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w); + void multiply(const LLVector3 &a, LLVector3& o) const; + void multiply(const LLVector3 &a, LLV4Vector3& o) const; + + const LLV4Matrix4& transpose(); + const LLV4Matrix4& translate(const LLVector3 &vec); + const LLV4Matrix4& translate(const LLV4Vector3 &vec); + const LLV4Matrix4& operator=(const LLMatrix4& a); + + operator LLMatrix4() const { return *(reinterpret_cast<const LLMatrix4*>(const_cast<const F32*>(&mMatrix[0][0]))); } + operator LLV4Matrix3() const { return *(reinterpret_cast<const LLV4Matrix3*>(const_cast<const F32*>(&mMatrix[0][0]))); } + + friend LLVector3 operator*(const LLVector3 &a, const LLV4Matrix4 &b); +} + +LL_LLV4MATH_ALIGN_POSTFIX; + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix4 - SSE +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#if LL_VECTORIZE + +inline void LLV4Matrix4::lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w) +{ + __m128 vw = _mm_set1_ps(w); + mV[VX] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VX], a.mV[VX]), vw), a.mV[VX]); // ( b - a ) 
* w + a + mV[VY] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VY], a.mV[VY]), vw), a.mV[VY]); + mV[VZ] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VZ], a.mV[VZ]), vw), a.mV[VZ]); + mV[VW] = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b.mV[VW], a.mV[VW]), vw), a.mV[VW]); +} + +inline void LLV4Matrix4::multiply(const LLVector3 &a, LLVector3& o) const +{ + LLV4Vector3 j; + j.v = _mm_add_ps(mV[VW], _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX])); // ( ax * vx ) + vw + j.v = _mm_add_ps(j.v , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY])); + j.v = _mm_add_ps(j.v , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ])); + o.setVec(j.mV); +} + +inline void LLV4Matrix4::multiply(const LLVector3 &a, LLV4Vector3& o) const +{ + o.v = _mm_add_ps(mV[VW], _mm_mul_ps(_mm_set1_ps(a.mV[VX]), mV[VX])); // ( ax * vx ) + vw + o.v = _mm_add_ps(o.v , _mm_mul_ps(_mm_set1_ps(a.mV[VY]), mV[VY])); + o.v = _mm_add_ps(o.v , _mm_mul_ps(_mm_set1_ps(a.mV[VZ]), mV[VZ])); +} + +inline const LLV4Matrix4& LLV4Matrix4::translate(const LLV4Vector3 &vec) +{ + mV[VW] = _mm_add_ps(mV[VW], vec.v); + return (*this); +} + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix4 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#else + +inline void LLV4Matrix4::lerp(const LLV4Matrix4 &a, const LLV4Matrix4 &b, const F32 &w) +{ + mMatrix[VX][VX] = llv4lerp(a.mMatrix[VX][VX], b.mMatrix[VX][VX], w); + mMatrix[VX][VY] = llv4lerp(a.mMatrix[VX][VY], b.mMatrix[VX][VY], w); + mMatrix[VX][VZ] = llv4lerp(a.mMatrix[VX][VZ], b.mMatrix[VX][VZ], w); + + mMatrix[VY][VX] = llv4lerp(a.mMatrix[VY][VX], b.mMatrix[VY][VX], w); + mMatrix[VY][VY] = llv4lerp(a.mMatrix[VY][VY], b.mMatrix[VY][VY], w); + mMatrix[VY][VZ] = llv4lerp(a.mMatrix[VY][VZ], b.mMatrix[VY][VZ], w); + + mMatrix[VZ][VX] = llv4lerp(a.mMatrix[VZ][VX], b.mMatrix[VZ][VX], w); 
+ mMatrix[VZ][VY] = llv4lerp(a.mMatrix[VZ][VY], b.mMatrix[VZ][VY], w); + mMatrix[VZ][VZ] = llv4lerp(a.mMatrix[VZ][VZ], b.mMatrix[VZ][VZ], w); + + mMatrix[VW][VX] = llv4lerp(a.mMatrix[VW][VX], b.mMatrix[VW][VX], w); + mMatrix[VW][VY] = llv4lerp(a.mMatrix[VW][VY], b.mMatrix[VW][VY], w); + mMatrix[VW][VZ] = llv4lerp(a.mMatrix[VW][VZ], b.mMatrix[VW][VZ], w); +} + +inline void LLV4Matrix4::multiply(const LLVector3 &a, LLVector3& o) const +{ + o.setVec( a.mV[VX] * mMatrix[VX][VX] + + a.mV[VY] * mMatrix[VY][VX] + + a.mV[VZ] * mMatrix[VZ][VX] + + mMatrix[VW][VX], + + a.mV[VX] * mMatrix[VX][VY] + + a.mV[VY] * mMatrix[VY][VY] + + a.mV[VZ] * mMatrix[VZ][VY] + + mMatrix[VW][VY], + + a.mV[VX] * mMatrix[VX][VZ] + + a.mV[VY] * mMatrix[VY][VZ] + + a.mV[VZ] * mMatrix[VZ][VZ] + + mMatrix[VW][VZ]); +} + +inline void LLV4Matrix4::multiply(const LLVector3 &a, LLV4Vector3& o) const +{ + o.setVec( a.mV[VX] * mMatrix[VX][VX] + + a.mV[VY] * mMatrix[VY][VX] + + a.mV[VZ] * mMatrix[VZ][VX] + + mMatrix[VW][VX], + + a.mV[VX] * mMatrix[VX][VY] + + a.mV[VY] * mMatrix[VY][VY] + + a.mV[VZ] * mMatrix[VZ][VY] + + mMatrix[VW][VY], + + a.mV[VX] * mMatrix[VX][VZ] + + a.mV[VY] * mMatrix[VY][VZ] + + a.mV[VZ] * mMatrix[VZ][VZ] + + mMatrix[VW][VZ]); +} + +inline const LLV4Matrix4& LLV4Matrix4::translate(const LLV4Vector3 &vec) +{ + mMatrix[3][0] += vec.mV[0]; + mMatrix[3][1] += vec.mV[1]; + mMatrix[3][2] += vec.mV[2]; + return (*this); +} + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Matrix4 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +#endif + +inline const LLV4Matrix4& LLV4Matrix4::operator=(const LLMatrix4& a) +{ + memcpy(mMatrix, a.mMatrix, sizeof(F32) * 16 ); + return *this; +} + +inline const LLV4Matrix4& LLV4Matrix4::transpose() +{ +#if LL_VECTORIZE && 
defined(_MM_TRANSPOSE4_PS) + _MM_TRANSPOSE4_PS(mV[VX], mV[VY], mV[VZ], mV[VW]); +#else + LLV4Matrix4 mat; + mat.mMatrix[0][0] = mMatrix[0][0]; + mat.mMatrix[1][0] = mMatrix[0][1]; + mat.mMatrix[2][0] = mMatrix[0][2]; + mat.mMatrix[3][0] = mMatrix[0][3]; + + mat.mMatrix[0][1] = mMatrix[1][0]; + mat.mMatrix[1][1] = mMatrix[1][1]; + mat.mMatrix[2][1] = mMatrix[1][2]; + mat.mMatrix[3][1] = mMatrix[1][3]; + + mat.mMatrix[0][2] = mMatrix[2][0]; + mat.mMatrix[1][2] = mMatrix[2][1]; + mat.mMatrix[2][2] = mMatrix[2][2]; + mat.mMatrix[3][2] = mMatrix[2][3]; + + mat.mMatrix[0][3] = mMatrix[3][0]; + mat.mMatrix[1][3] = mMatrix[3][1]; + mat.mMatrix[2][3] = mMatrix[3][2]; + mat.mMatrix[3][3] = mMatrix[3][3]; + + *this = mat; +#endif + return *this; +} + +inline const LLV4Matrix4& LLV4Matrix4::translate(const LLVector3 &vec) +{ + mMatrix[3][0] += vec.mV[0]; + mMatrix[3][1] += vec.mV[1]; + mMatrix[3][2] += vec.mV[2]; + return (*this); +} + +inline LLVector3 operator*(const LLVector3 &a, const LLV4Matrix4 &b) +{ + return LLVector3(a.mV[VX] * b.mMatrix[VX][VX] + + a.mV[VY] * b.mMatrix[VY][VX] + + a.mV[VZ] * b.mMatrix[VZ][VX] + + b.mMatrix[VW][VX], + + a.mV[VX] * b.mMatrix[VX][VY] + + a.mV[VY] * b.mMatrix[VY][VY] + + a.mV[VZ] * b.mMatrix[VZ][VY] + + b.mMatrix[VW][VY], + + a.mV[VX] * b.mMatrix[VX][VZ] + + a.mV[VY] * b.mMatrix[VY][VZ] + + a.mV[VZ] * b.mMatrix[VZ][VZ] + + b.mMatrix[VW][VZ]); +} + + +#endif diff --git a/indra/llmath/llv4vector3.h b/indra/llmath/llv4vector3.h new file mode 100644 index 0000000000..7bf8c5ce91 --- /dev/null +++ b/indra/llmath/llv4vector3.h @@ -0,0 +1,62 @@ +/** + * @file llviewerjointmesh.cpp + * @brief LLV4* class header file - vector processor enabled math + * + * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc. 
+ * $License$ + */ + +#ifndef LL_LLV4VECTOR3_H +#define LL_LLV4VECTOR3_H + +#include "llv4math.h" + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Vector3 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +LL_LLV4MATH_ALIGN_PREFIX + +class LLV4Vector3 +{ +public: + union { + F32 mV[LLV4_NUM_AXIS]; + V4F32 v; + }; + + enum { + ALIGNMENT = 16 + }; + + void setVec(F32 x, F32 y, F32 z); + void setVec(F32 a); +} + +LL_LLV4MATH_ALIGN_POSTFIX; + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +// LLV4Vector3 +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +inline void LLV4Vector3::setVec(F32 x, F32 y, F32 z) +{ + mV[VX] = x; + mV[VY] = y; + mV[VZ] = z; +} + +inline void LLV4Vector3::setVec(F32 a) +{ +#if LL_VECTORIZE + v = _mm_set1_ps(a); +#else + setVec(a, a, a); +#endif +} + +#endif diff --git a/indra/newview/lldrawable.h b/indra/newview/lldrawable.h index 48c58dbb4c..328a116f59 100644 --- a/indra/newview/lldrawable.h +++ b/indra/newview/lldrawable.h @@ -26,6 +26,7 @@ #include "llviewerobject.h" #include "llrect.h" +class LLCamera; class LLDrawPool; class LLDrawable; class LLFace; diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp index 150943465d..ba4c7e1b20 100644 --- a/indra/newview/llviewerjointmesh.cpp +++ b/indra/newview/llviewerjointmesh.cpp @@ -19,6 +19,7 @@ #include "llfasttimer.h" #include "llagent.h" +#include "llapr.h" #include "llbox.h" #include "lldrawable.h" #include "lldrawpoolavatar.h" @@ -29,12 +30,18 @@ #include "llglheaders.h" #include "lltexlayer.h" 
#include "llviewercamera.h" +#include "llviewercontrol.h" #include "llviewerimagelist.h" #include "llviewerjointmesh.h" #include "llvoavatar.h" #include "llsky.h" #include "pipeline.h" #include "llglslshader.h" +#include "llmath.h" +#include "v4math.h" +#include "m3math.h" +#include "m4math.h" + #if !LL_DARWIN && !LL_LINUX extern PFNGLWEIGHTPOINTERARBPROC glWeightPointerARB; @@ -48,6 +55,7 @@ static const U32 sRenderMask = LLVertexBuffer::MAP_VERTEX | LLVertexBuffer::MAP_NORMAL | LLVertexBuffer::MAP_TEXCOORD; + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- // LLViewerJointMesh::LLSkinJoint @@ -100,6 +108,7 @@ BOOL LLSkinJoint::setupSkinJoint( LLViewerJoint *joint) return TRUE; } + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- // LLViewerJointMesh @@ -394,9 +403,9 @@ const S32 NUM_AXES = 3; // rotation Z 0-n // pivot parent 0-n -- child = n+1 -static LLMatrix4 gJointMat[32]; -static LLMatrix3 gJointRot[32]; -static LLVector4 gJointPivot[32]; +static LLMatrix4 gJointMatUnaligned[32]; +static LLMatrix3 gJointRotUnaligned[32]; +static LLVector4 gJointPivot[32]; //----------------------------------------------------------------------------- // uploadJointMatrices() @@ -417,8 +426,8 @@ void LLViewerJointMesh::uploadJointMatrices() { joint_mat *= LLDrawPoolAvatar::getModelView(); } - gJointMat[joint_num] = joint_mat; - gJointRot[joint_num] = joint_mat.getMat3(); + gJointMatUnaligned[joint_num] = joint_mat; + gJointRotUnaligned[joint_num] = joint_mat.getMat3(); } BOOL last_pivot_uploaded = FALSE; @@ -455,8 +464,8 @@ void LLViewerJointMesh::uploadJointMatrices() { LLVector3 pivot; pivot = LLVector3(gJointPivot[i]); - pivot = pivot * gJointRot[i]; - gJointMat[i].translate(pivot); + pivot = pivot * gJointRotUnaligned[i]; + 
gJointMatUnaligned[i].translate(pivot); } // upload matrices @@ -467,11 +476,11 @@ void LLViewerJointMesh::uploadJointMatrices() for (joint_num = 0; joint_num < reference_mesh->mJointRenderData.count(); joint_num++) { - gJointMat[joint_num].transpose(); + gJointMatUnaligned[joint_num].transpose(); for (S32 axis = 0; axis < NUM_AXES; axis++) { - F32* vector = gJointMat[joint_num].mMatrix[axis]; + F32* vector = gJointMatUnaligned[joint_num].mMatrix[axis]; //glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, LL_CHARACTER_MAX_JOINTS_PER_MESH * axis + joint_num+5, (GLfloat*)vector); U32 offset = LL_CHARACTER_MAX_JOINTS_PER_MESH*axis+joint_num; memcpy(mat+offset*4, vector, sizeof(GLfloat)*4); @@ -883,21 +892,9 @@ BOOL LLViewerJointMesh::updateLOD(F32 pixel_area, BOOL activate) return (valid != activate); } - -void LLViewerJointMesh::updateGeometry() +// static +void LLViewerJointMesh::updateGeometryOriginal(LLFace *mFace, LLPolyMesh *mMesh) { - if (!(mValid - && mMesh - && mFace - && mMesh->hasWeights() - && mFace->mVertexBuffer.notNull() - && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0)) - { - return; - } - - uploadJointMatrices(); - LLStrider<LLVector3> o_vertices; LLStrider<LLVector3> o_normals; @@ -938,9 +935,9 @@ void LLViewerJointMesh::updateGeometry() // No lerp required in this case. if (w == 1.0f) { - gBlendMat = gJointMat[joint+1]; + gBlendMat = gJointMatUnaligned[joint+1]; o_vertices[bidx] = coords[index] * gBlendMat; - gBlendRotMat = gJointRot[joint+1]; + gBlendRotMat = gJointRotUnaligned[joint+1]; o_normals[bidx] = normals[index] * gBlendRotMat; continue; } @@ -948,8 +945,8 @@ void LLViewerJointMesh::updateGeometry() // Try to keep all the accesses to the matrix data as close // together as possible. This function is a hot spot on the // Mac. 
JC - LLMatrix4 &m0 = gJointMat[joint+1]; - LLMatrix4 &m1 = gJointMat[joint+0]; + LLMatrix4 &m0 = gJointMatUnaligned[joint+1]; + LLMatrix4 &m1 = gJointMatUnaligned[joint+0]; gBlendMat.mMatrix[VX][VX] = lerp(m1.mMatrix[VX][VX], m0.mMatrix[VX][VX], w); gBlendMat.mMatrix[VX][VY] = lerp(m1.mMatrix[VX][VY], m0.mMatrix[VX][VY], w); @@ -969,8 +966,8 @@ void LLViewerJointMesh::updateGeometry() o_vertices[bidx] = coords[index] * gBlendMat; - LLMatrix3 &n0 = gJointRot[joint+1]; - LLMatrix3 &n1 = gJointRot[joint+0]; + LLMatrix3 &n0 = gJointRotUnaligned[joint+1]; + LLMatrix3 &n1 = gJointRotUnaligned[joint+0]; gBlendRotMat.mMatrix[VX][VX] = lerp(n1.mMatrix[VX][VX], n0.mMatrix[VX][VX], w); gBlendRotMat.mMatrix[VX][VY] = lerp(n1.mMatrix[VX][VY], n0.mMatrix[VX][VY], w); @@ -988,6 +985,161 @@ void LLViewerJointMesh::updateGeometry() } } +const U32 UPDATE_GEOMETRY_CALL_MASK = 0x1FFF; // 8K samples before overflow +const U32 UPDATE_GEOMETRY_CALL_OVERFLOW = ~UPDATE_GEOMETRY_CALL_MASK; +static bool sUpdateGeometryCallPointer = false; +static F64 sUpdateGeometryGlobalTime = 0.0 ; +static F64 sUpdateGeometryElapsedTime = 0.0 ; +static F64 sUpdateGeometryElapsedTimeOff = 0.0 ; +static F64 sUpdateGeometryElapsedTimeOn = 0.0 ; +static F64 sUpdateGeometryRunAvgOff[10]; +static F64 sUpdateGeometryRunAvgOn[10]; +static U32 sUpdateGeometryRunCount = 0 ; +static U32 sUpdateGeometryCalls = 0 ; +static U32 sUpdateGeometryLastProcessor = 0 ; +void (*LLViewerJointMesh::sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh); + +void LLViewerJointMesh::updateGeometry() +{ + extern BOOL gVectorizePerfTest; + extern U32 gVectorizeProcessor; + + if (!(mValid + && mMesh + && mFace + && mMesh->hasWeights() + && mFace->mVertexBuffer.notNull() + && LLShaderMgr::getVertexShaderLevel(LLShaderMgr::SHADER_AVATAR) == 0)) + { + return; + } + + if (!gVectorizePerfTest) + { + // Once we've measured performance, just run the specified + // code version. 
+ if(sUpdateGeometryFunc == updateGeometryOriginal) + uploadJointMatrices(); + sUpdateGeometryFunc(mFace, mMesh); + } + else + { + // At startup, measure the amount of time in skinning and choose + // the fastest one. + LLTimer ug_timer ; + + if (sUpdateGeometryCallPointer) + { + if(sUpdateGeometryFunc == updateGeometryOriginal) + uploadJointMatrices(); + // call accelerated version for this processor + sUpdateGeometryFunc(mFace, mMesh); + } + else + { + uploadJointMatrices(); + updateGeometryOriginal(mFace, mMesh); + } + + sUpdateGeometryElapsedTime += ug_timer.getElapsedTimeF64(); + ++sUpdateGeometryCalls; + if(0 != (sUpdateGeometryCalls & UPDATE_GEOMETRY_CALL_OVERFLOW)) + { + F64 time_since_app_start = ug_timer.getElapsedSeconds(); + if(sUpdateGeometryGlobalTime == 0.0 + || sUpdateGeometryLastProcessor != gVectorizeProcessor) + { + sUpdateGeometryGlobalTime = time_since_app_start; + sUpdateGeometryElapsedTime = 0; + sUpdateGeometryCalls = 0; + sUpdateGeometryRunCount = 0; + sUpdateGeometryLastProcessor = gVectorizeProcessor; + sUpdateGeometryCallPointer = false; + return; + } + F64 percent_time_in_function = + ( sUpdateGeometryElapsedTime * 100.0 ) / ( time_since_app_start - sUpdateGeometryGlobalTime ) ; + sUpdateGeometryGlobalTime = time_since_app_start; + if (!sUpdateGeometryCallPointer) + { + // First set of run data is with vectorization off. + sUpdateGeometryCallPointer = true; + llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = " + << "vectorize off " << percent_time_in_function + << "% of time with " + << (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls) + << " seconds per call " + << llendl; + sUpdateGeometryRunAvgOff[sUpdateGeometryRunCount] = percent_time_in_function; + sUpdateGeometryElapsedTimeOff += sUpdateGeometryElapsedTime; + sUpdateGeometryCalls = 0; + } + else + { + // Second set of run data is with vectorization on. 
+ sUpdateGeometryCallPointer = false; + llinfos << "profile (avg of " << sUpdateGeometryCalls << " samples) = " + << "VEC on " << percent_time_in_function + << "% of time with " + << (sUpdateGeometryElapsedTime / (F64)sUpdateGeometryCalls) + << " seconds per call " + << llendl; + sUpdateGeometryRunAvgOn[sUpdateGeometryRunCount] = percent_time_in_function ; + sUpdateGeometryElapsedTimeOn += sUpdateGeometryElapsedTime; + + sUpdateGeometryCalls = 0; + sUpdateGeometryRunCount++; + F64 a = 0.0, b = 0.0; + for(U32 i = 0; i<sUpdateGeometryRunCount; i++) + { + a += sUpdateGeometryRunAvgOff[i]; + b += sUpdateGeometryRunAvgOn[i]; + } + a /= sUpdateGeometryRunCount; + b /= sUpdateGeometryRunCount; + F64 perf_boost = ( sUpdateGeometryElapsedTimeOff - sUpdateGeometryElapsedTimeOn ) / sUpdateGeometryElapsedTimeOn; + llinfos << "run averages (" << (F64)sUpdateGeometryRunCount + << "/10) vectorize off " << a + << "% : vectorize type " << gVectorizeProcessor + << " " << b + << "% : performance boost " + << perf_boost * 100.0 + << "%" + << llendl ; + if(sUpdateGeometryRunCount == 10) + { + // In case user runs test again, force reset of data on + // next run. + sUpdateGeometryGlobalTime = 0.0; + + // We have data now on which version is faster. Switch to that + // code and save the data for next run. + gVectorizePerfTest = FALSE; + gSavedSettings.setBOOL("VectorizePerfTest", FALSE); + + if (perf_boost > 0.0) + { + llinfos << "Vectorization improves avatar skinning performance, " + << "keeping on for future runs." + << llendl; + gSavedSettings.setBOOL("VectorizeSkin", TRUE); + } + else + { + // SIMD decreases performance, fall back to original code + llinfos << "Vectorization decreases avatar skinning performance, " + << "switching back to original code." 
+ << llendl; + + gSavedSettings.setBOOL("VectorizeSkin", FALSE); + } + } + } + sUpdateGeometryElapsedTime = 0.0f; + } + } +} + void LLViewerJointMesh::dump() { if (mValid) diff --git a/indra/newview/llviewerjointmesh.h b/indra/newview/llviewerjointmesh.h index b6fd8afcdb..992c3656a1 100644 --- a/indra/newview/llviewerjointmesh.h +++ b/indra/newview/llviewerjointmesh.h @@ -126,6 +126,22 @@ public: /*virtual*/ BOOL isAnimatable() { return FALSE; } void writeCAL3D(apr_file_t* fp, S32 material_num, LLCharacter* characterp); + + // Avatar vertex skinning is a significant performance issue on computers + // with avatar vertex programs turned off (for example, most Macs). We + // therefore have custom versions that use SIMD instructions. + // + // These functions require compiler options for SSE2, SSE, or neither, and + // hence are contained in separate individual .cpp files. JC + static void updateGeometryOriginal(LLFace* face, LLPolyMesh* mesh); + // generic vector code, used for Altivec + static void updateGeometryVectorized(LLFace* face, LLPolyMesh* mesh); + static void updateGeometrySSE(LLFace* face, LLPolyMesh* mesh); + static void updateGeometrySSE2(LLFace* face, LLPolyMesh* mesh); + + // Use a function pointer to indicate which version we are running. + static void (*sUpdateGeometryFunc)(LLFace* face, LLPolyMesh* mesh); + private: // Allocate skin data BOOL allocateSkinData( U32 numSkinJoints ); diff --git a/indra/newview/llviewerjointmesh_sse.cpp b/indra/newview/llviewerjointmesh_sse.cpp new file mode 100644 index 0000000000..c4f8ff4fa8 --- /dev/null +++ b/indra/newview/llviewerjointmesh_sse.cpp @@ -0,0 +1,94 @@ +/** + * @file llviewerjointmesh_sse.cpp + * @brief LLV4 class implementation with LLViewerJointMesh class + * + * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc. 
+ * $License$ + */ + +//----------------------------------------------------------------------------- +// Header Files +//----------------------------------------------------------------------------- + +// Do not use precompiled headers, because we need to build this file with +// SSE support, but not the precompiled header file. JC +#include "linden_common.h" + +#include "llviewerjointmesh.h" + +// project includes +#include "llface.h" +#include "llpolymesh.h" + +// library includes +#include "lldarray.h" +#include "llv4math.h" // for LL_VECTORIZE +#include "llv4matrix3.h" +#include "llv4matrix4.h" +#include "v3math.h" + +// *NOTE: SSE must be enabled for this module + +#if LL_VECTORIZE + +static LLV4Matrix4 sJointMat[32]; + +inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j) +{ + m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]); + m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]); + m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]); + m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]); + m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw + m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY])); + m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ])); +} + +// static +void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh) +{ + LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData; + + //upload joint pivots/matrices + for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j ) + { + matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix, + joint_data[j]->mSkinJoint ? 
+ joint_data[j]->mSkinJoint->mRootToJointSkinOffset + : joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset); + } + + F32 weight = F32_MAX; + LLV4Matrix4 blend_mat; + + LLStrider<LLVector3> o_vertices; + LLStrider<LLVector3> o_normals; + + LLVertexBuffer *buffer = face->mVertexBuffer; + buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset); + buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset); + + const F32* weights = mesh->getWeights(); + const LLVector3* coords = mesh->getCoords(); + const LLVector3* normals = mesh->getNormals(); + for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index) + { + if( weight != weights[index]) + { + S32 joint = llfloor(weight = weights[index]); + blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint); + } + blend_mat.multiply(coords[index], o_vertices[index]); + ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]); + } +} + +#else + +void LLViewerJointMesh::updateGeometrySSE(LLFace *face, LLPolyMesh *mesh) +{ + LLViewerJointMesh::updateGeometryVectorized(face, mesh); + return; +} + +#endif diff --git a/indra/newview/llviewerjointmesh_sse2.cpp b/indra/newview/llviewerjointmesh_sse2.cpp new file mode 100644 index 0000000000..cae602ac14 --- /dev/null +++ b/indra/newview/llviewerjointmesh_sse2.cpp @@ -0,0 +1,96 @@ +/** + * @file llviewerjointmesh_sse2.cpp + * @brief LLV4 class implementation with LLViewerJointMesh class + * + * Copyright (c) 2007-$CurrentYear$, Linden Research, Inc. + * $License$ + */ + +//----------------------------------------------------------------------------- +// Header Files +//----------------------------------------------------------------------------- + +// Do not use precompiled headers, because we need to build this file with +// SSE support, but not the precompiled header file. 
JC +#include "linden_common.h" + +#include "llviewerjointmesh.h" + +// project includes +#include "llface.h" +#include "llpolymesh.h" + +// library includes +#include "lldarray.h" +#include "llstrider.h" +#include "llv4math.h" // for LL_VECTORIZE +#include "llv4matrix3.h" +#include "llv4matrix4.h" +#include "m4math.h" +#include "v3math.h" + +// *NOTE: SSE2 must be enabled for this module + +#if LL_VECTORIZE + +static LLV4Matrix4 sJointMat[32]; + +inline void matrix_translate(LLV4Matrix4& m, const LLMatrix4* w, const LLVector3& j) +{ + m.mV[VX] = _mm_loadu_ps(w->mMatrix[VX]); + m.mV[VY] = _mm_loadu_ps(w->mMatrix[VY]); + m.mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ]); + m.mV[VW] = _mm_loadu_ps(w->mMatrix[VW]); + m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VX]), m.mV[VX])); // ( ax * vx ) + vw + m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VY]), m.mV[VY])); + m.mV[VW] = _mm_add_ps(m.mV[VW], _mm_mul_ps(_mm_set1_ps(j.mV[VZ]), m.mV[VZ])); +} + +// static +void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh) +{ + LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData; + + //upload joint pivots/matrices + for(S32 j = 0, jend = joint_data.count(); j < jend ; ++j ) + { + matrix_translate(sJointMat[j], joint_data[j]->mWorldMatrix, + joint_data[j]->mSkinJoint ? 
+ joint_data[j]->mSkinJoint->mRootToJointSkinOffset + : joint_data[j+1]->mSkinJoint->mRootToParentJointSkinOffset); + } + + F32 weight = F32_MAX; + LLV4Matrix4 blend_mat; + + LLStrider<LLVector3> o_vertices; + LLStrider<LLVector3> o_normals; + + LLVertexBuffer *buffer = face->mVertexBuffer; + buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset); + buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset); + + const F32* weights = mesh->getWeights(); + const LLVector3* coords = mesh->getCoords(); + const LLVector3* normals = mesh->getNormals(); + for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index) + { + if( weight != weights[index]) + { + S32 joint = llfloor(weight = weights[index]); + blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint); + } + blend_mat.multiply(coords[index], o_vertices[index]); + ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]); + } +} + +#else + +void LLViewerJointMesh::updateGeometrySSE2(LLFace *face, LLPolyMesh *mesh) +{ + LLViewerJointMesh::updateGeometryVectorized(face, mesh); + return; +} + +#endif diff --git a/indra/newview/llviewerjointmesh_vec.cpp b/indra/newview/llviewerjointmesh_vec.cpp new file mode 100644 index 0000000000..5b1e080435 --- /dev/null +++ b/indra/newview/llviewerjointmesh_vec.cpp @@ -0,0 +1,76 @@ +/** + * @file llviewerjointmesh_vec.cpp + * @brief LLV4 math class implementation with LLViewerJointMesh class + * + * Copyright (c) 2001-$CurrentYear$, Linden Research, Inc. 
+ * $License$ + */ + +//----------------------------------------------------------------------------- +// Header Files +//----------------------------------------------------------------------------- +#include "llviewerprecompiledheaders.h" + +#include "llviewerjointmesh.h" + +#include "llface.h" +#include "llpolymesh.h" +#include "llv4math.h" +#include "llv4matrix3.h" +#include "llv4matrix4.h" + +// *NOTE: SSE must be disabled for this module + +#if LL_VECTORIZE +#error This module requires vectorization (i.e. SSE) mode to be disabled. +#endif + +static LLV4Matrix4 sJointMat[32]; + +// static +void LLViewerJointMesh::updateGeometryVectorized(LLFace *face, LLPolyMesh *mesh) +{ + LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData; + S32 j, joint_num, joint_end = joint_data.count(); + LLV4Vector3 pivot; + + //upload joint pivots/matrices + for(j = joint_num = 0; joint_num < joint_end ; ++joint_num ) + { + LLSkinJoint *sj; + const LLMatrix4 * wm = joint_data[joint_num]->mWorldMatrix; + if (NULL == (sj = joint_data[joint_num]->mSkinJoint)) + { + sj = joint_data[++joint_num]->mSkinJoint; + ((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToParentJointSkinOffset, pivot); + sJointMat[j++].translate(pivot); + wm = joint_data[joint_num]->mWorldMatrix; + } + ((LLV4Matrix3)(sJointMat[j] = *wm)).multiply(sj->mRootToJointSkinOffset, pivot); + sJointMat[j++].translate(pivot); + } + + F32 weight = F32_MAX; + LLV4Matrix4 blend_mat; + + LLStrider<LLVector3> o_vertices; + LLStrider<LLVector3> o_normals; + + LLVertexBuffer *buffer = face->mVertexBuffer; + buffer->getVertexStrider(o_vertices, mesh->mFaceVertexOffset); + buffer->getNormalStrider(o_normals, mesh->mFaceVertexOffset); + + const F32* weights = mesh->getWeights(); + const LLVector3* coords = mesh->getCoords(); + const LLVector3* normals = mesh->getNormals(); + for (U32 index = 0, index_end = mesh->getNumVertices(); index < index_end; ++index) + { + if( weight != weights[index]) 
+ { + S32 joint = llfloor(weight = weights[index]); + blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint); + } + blend_mat.multiply(coords[index], o_vertices[index]); + ((LLV4Matrix3)blend_mat).multiply(normals[index], o_normals[index]); + } +} diff --git a/indra/newview/llviewermenu.cpp b/indra/newview/llviewermenu.cpp index d18859e356..7ad4f1d70b 100644 --- a/indra/newview/llviewermenu.cpp +++ b/indra/newview/llviewermenu.cpp @@ -960,6 +960,7 @@ extern BOOL gDebugClicks; extern BOOL gDebugWindowProc; extern BOOL gDebugTextEditorTips; extern BOOL gDebugSelectMgr; +extern BOOL gVectorizePerfTest; void init_debug_ui_menu(LLMenuGL* menu) { @@ -1169,6 +1170,8 @@ void init_debug_rendering_menu(LLMenuGL* menu) (void*)"ShowDepthBuffer")); sub_menu->append(new LLMenuItemToggleGL("Show Select Buffer", &gDebugSelect)); + sub_menu->append(new LLMenuItemToggleGL("Vectorize Perf Test", &gVectorizePerfTest)); + sub_menu = new LLMenuGL("Render Tests"); sub_menu->append(new LLMenuItemCheckGL("Camera Offset", |