Diffstat (limited to 'indra/llmath')
-rw-r--r--   indra/llmath/CMakeLists.txt          |  7
-rw-r--r--   indra/llmath/llmath.h                |  9
-rw-r--r--   indra/llmath/llmatrix4a.h            | 39
-rw-r--r--   indra/llmath/lloctree.cpp            | 29
-rw-r--r--   indra/llmath/llsimdmath.h            |  2
-rw-r--r--   indra/llmath/llvolume.cpp            | 42
-rw-r--r--   indra/llmath/llvolume.h              |  4
-rw-r--r--   indra/llmath/tests/v4math_test.cpp   |  3
-rw-r--r--   indra/llmath/v4coloru.h              | 32
9 files changed, 138 insertions, 29 deletions
diff --git a/indra/llmath/CMakeLists.txt b/indra/llmath/CMakeLists.txt
index 0614fd92ef..4c8bcdac91 100644
--- a/indra/llmath/CMakeLists.txt
+++ b/indra/llmath/CMakeLists.txt
@@ -4,6 +4,7 @@ project(llmath)
include(00-Common)
include(LLCommon)
+include(Boost)
include_directories(
${LLCOMMON_INCLUDE_DIRS}
@@ -20,6 +21,7 @@ set(llmath_SOURCE_FILES
llline.cpp
llmatrix3a.cpp
llmodularmath.cpp
+ lloctree.cpp
llperlin.cpp
llquaternion.cpp
llrect.cpp
@@ -117,6 +119,11 @@ if (LL_TESTS)
v4color.cpp
v4coloru.cpp
)
+ set_source_files_properties(
+ ${llmath_TEST_SOURCE_FILES}
+ PROPERTIES
+ LL_TEST_ADDITIONAL_LIBRARIES "${BOOST_THREAD_LIBRARY}"
+ )
LL_ADD_PROJECT_UNIT_TESTS(llmath "${llmath_TEST_SOURCE_FILES}")
# INTEGRATION TESTS
diff --git a/indra/llmath/llmath.h b/indra/llmath/llmath.h
index 93b9f22b25..e508c9a199 100644
--- a/indra/llmath/llmath.h
+++ b/indra/llmath/llmath.h
@@ -153,7 +153,7 @@ inline F64 llabs(const F64 a)
inline S32 lltrunc( F32 f )
{
-#if LL_WINDOWS && !defined( __INTEL_COMPILER )
+#if LL_WINDOWS && !defined( __INTEL_COMPILER ) && (ADDRESS_SIZE == 32)
// Avoids changing the floating point control word.
// Add or subtract 0.5 - epsilon and then round
const static U32 zpfp[] = { 0xBEFFFFFF, 0x3EFFFFFF };
@@ -179,7 +179,7 @@ inline S32 lltrunc( F64 f )
inline S32 llfloor( F32 f )
{
-#if LL_WINDOWS && !defined( __INTEL_COMPILER )
+#if LL_WINDOWS && !defined( __INTEL_COMPILER ) && (ADDRESS_SIZE == 32)
// Avoids changing the floating point control word.
// Accurate (unlike Stereopsis version) for all values between S32_MIN and S32_MAX and slightly faster than Stereopsis version.
// Add -(0.5 - epsilon) and then round
@@ -254,6 +254,11 @@ inline int round_int(double x)
}
#endif // BOGUS_ROUND
+inline F64 ll_round(const F64 val)
+{
+ return F64(floor(val + 0.5f));
+}
+
inline F32 ll_round( F32 val, F32 nearest )
{
return F32(floor(val * (1.0f / nearest) + 0.5f)) * nearest;
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
index d141298f69..216334752a 100644
--- a/indra/llmath/llmatrix4a.h
+++ b/indra/llmath/llmatrix4a.h
@@ -121,7 +121,7 @@ public:
res.add(z);
}
- inline void affineTransform(const LLVector4a& v, LLVector4a& res)
+ inline void affineTransformSSE(const LLVector4a& v, LLVector4a& res)
{
LLVector4a x,y,z;
@@ -137,6 +137,43 @@ public:
z.add(mMatrix[3]);
res.setAdd(x,z);
}
+
+ inline void affineTransformNonSSE(const LLVector4a& v, LLVector4a& res)
+ {
+ F32 x = v[0] * mMatrix[0][0] + v[1] * mMatrix[1][0] + v[2] * mMatrix[2][0] + mMatrix[3][0];
+ F32 y = v[0] * mMatrix[0][1] + v[1] * mMatrix[1][1] + v[2] * mMatrix[2][1] + mMatrix[3][1];
+ F32 z = v[0] * mMatrix[0][2] + v[1] * mMatrix[1][2] + v[2] * mMatrix[2][2] + mMatrix[3][2];
+ F32 w = 1.0f;
+ res.set(x,y,z,w);
+ }
+
+ inline void affineTransform(const LLVector4a& v, LLVector4a& res)
+ {
+ affineTransformSSE(v,res);
+ }
};
+inline LLVector4a rowMul(const LLVector4a &row, const LLMatrix4a &mat)
+{
+ LLVector4a result;
+ result = _mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(0, 0, 0, 0)), mat.mMatrix[0]);
+ result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(1, 1, 1, 1)), mat.mMatrix[1]));
+ result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(2, 2, 2, 2)), mat.mMatrix[2]));
+ result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(row, row, _MM_SHUFFLE(3, 3, 3, 3)), mat.mMatrix[3]));
+ return result;
+}
+
+inline void matMul(const LLMatrix4a &a, const LLMatrix4a &b, LLMatrix4a &res)
+{
+ LLVector4a row0 = rowMul(a.mMatrix[0], b);
+ LLVector4a row1 = rowMul(a.mMatrix[1], b);
+ LLVector4a row2 = rowMul(a.mMatrix[2], b);
+ LLVector4a row3 = rowMul(a.mMatrix[3], b);
+
+ res.mMatrix[0] = row0;
+ res.mMatrix[1] = row1;
+ res.mMatrix[2] = row2;
+ res.mMatrix[3] = row3;
+}
+
#endif
diff --git a/indra/llmath/lloctree.cpp b/indra/llmath/lloctree.cpp
new file mode 100644
index 0000000000..3fcb3a27d7
--- /dev/null
+++ b/indra/llmath/lloctree.cpp
@@ -0,0 +1,29 @@
+/**
+ * @file lloctree.cpp
+ *
+ * $LicenseInfo:firstyear=2005&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2010, Linden Research, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
+ * $/LicenseInfo$
+ */
+#include "stdtypes.h"
+
+U32 gOctreeMaxCapacity;
+F32 gOctreeMinSize;
+
diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h
index cebd2ace7d..54a275633f 100644
--- a/indra/llmath/llsimdmath.h
+++ b/indra/llmath/llsimdmath.h
@@ -31,7 +31,7 @@
#error "Please include llmath.h before this file."
#endif
-#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 ) )
+#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 && ADDRESS_SIZE == 32 ) )
#error SSE2 not enabled. LLVector4a and related class will not compile.
#endif
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index d932eb53a0..f63a721c35 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -89,7 +89,7 @@ const F32 SKEW_MAX = 0.95f;
const F32 SCULPT_MIN_AREA = 0.002f;
const S32 SCULPT_MIN_AREA_DETAIL = 1;
-extern BOOL gDebugGL;
+BOOL gDebugGL = FALSE;
BOOL check_same_clock_dir( const LLVector3& pt1, const LLVector3& pt2, const LLVector3& pt3, const LLVector3& norm)
{
@@ -2143,19 +2143,22 @@ BOOL LLVolume::generate()
F32 profile_detail = mDetail;
F32 path_detail = mDetail;
-
- U8 path_type = mParams.getPathParams().getCurveType();
- U8 profile_type = mParams.getProfileParams().getCurveType();
-
- if (path_type == LL_PCODE_PATH_LINE && profile_type == LL_PCODE_PROFILE_CIRCLE)
- { //cylinders don't care about Z-Axis
- mLODScaleBias.setVec(0.6f, 0.6f, 0.0f);
- }
- else if (path_type == LL_PCODE_PATH_CIRCLE)
- {
- mLODScaleBias.setVec(0.6f, 0.6f, 0.6f);
+
+ if ((mParams.getSculptType() & LL_SCULPT_TYPE_MASK) != LL_SCULPT_TYPE_MESH)
+ {
+ U8 path_type = mParams.getPathParams().getCurveType();
+ U8 profile_type = mParams.getProfileParams().getCurveType();
+ if (path_type == LL_PCODE_PATH_LINE && profile_type == LL_PCODE_PROFILE_CIRCLE)
+ {
+ //cylinders don't care about Z-Axis
+ mLODScaleBias.setVec(0.6f, 0.6f, 0.0f);
+ }
+ else if (path_type == LL_PCODE_PATH_CIRCLE)
+ {
+ mLODScaleBias.setVec(0.6f, 0.6f, 0.6f);
+ }
}
-
+
BOOL regenPath = mPathp->generate(mParams.getPathParams(), path_detail, split);
BOOL regenProf = mProfilep->generate(mParams.getProfileParams(), mPathp->isOpen(),profile_detail, split);
@@ -2544,7 +2547,7 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
U16 influence = weights[idx++];
influence |= ((U16) weights[idx++] << 8);
- F32 w = llclamp((F32) influence / 65535.f, 0.f, 0.99999f);
+ F32 w = llclamp((F32) influence / 65535.f, 0.001f, 0.999f);
wght.mV[cur_influence] = w;
joints[cur_influence] = joint;
cur_influence++;
@@ -2561,11 +2564,15 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
F32 wsum = wght.mV[VX] + wght.mV[VY] + wght.mV[VZ] + wght.mV[VW];
if (wsum <= 0.f)
{
- wght = LLVector4(0.99999f,0.f,0.f,0.f);
+ wght = LLVector4(0.999f,0.f,0.f,0.f);
}
for (U32 k=0; k<4; k++)
{
- joints_with_weights[k] = (F32) joints[k] + wght[k];
+ F32 f_combined = (F32) joints[k] + wght[k];
+ joints_with_weights[k] = f_combined;
+ // Any weights we added above should wind up non-zero and applied to a specific bone.
+ // A failure here would indicate a floating point precision error in the math.
+ llassert((k >= cur_influence) || (f_combined - S32(f_combined) > 0.0f));
}
face.mWeights[cur_vertex].loadua(joints_with_weights.mV);
@@ -4568,6 +4575,7 @@ LLVolumeFace::LLVolumeFace() :
mTexCoords(NULL),
mIndices(NULL),
mWeights(NULL),
+ mWeightsScrubbed(FALSE),
mOctree(NULL),
mOptimized(FALSE)
{
@@ -4593,6 +4601,7 @@ LLVolumeFace::LLVolumeFace(const LLVolumeFace& src)
mTexCoords(NULL),
mIndices(NULL),
mWeights(NULL),
+ mWeightsScrubbed(FALSE),
mOctree(NULL)
{
mExtents = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*3);
@@ -4664,6 +4673,7 @@ LLVolumeFace& LLVolumeFace::operator=(const LLVolumeFace& src)
ll_aligned_free_16(mWeights);
mWeights = NULL;
}
+ mWeightsScrubbed = src.mWeightsScrubbed;
}
if (mNumIndices)
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index 1da2d0c6b1..bf81c978a0 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -199,6 +199,8 @@ const U8 LL_SCULPT_FLAG_MASK = LL_SCULPT_FLAG_INVERT | LL_SCULPT_FLAG_MIRROR;
const S32 LL_SCULPT_MESH_MAX_FACES = 8;
+extern BOOL gDebugGL;
+
class LLProfileParams
{
public:
@@ -953,6 +955,8 @@ public:
// mWeights.size() should be empty or match mVertices.size()
LLVector4a* mWeights;
+ mutable BOOL mWeightsScrubbed;
+
LLOctreeNode<LLVolumeTriangle>* mOctree;
//whether or not face has been cache optimized
diff --git a/indra/llmath/tests/v4math_test.cpp b/indra/llmath/tests/v4math_test.cpp
index 191ac864df..9779dfded3 100644
--- a/indra/llmath/tests/v4math_test.cpp
+++ b/indra/llmath/tests/v4math_test.cpp
@@ -355,7 +355,8 @@ namespace tut
val3 = z1 + (z2 - z1)* val;
val4 = w1 + (w2 - w1)* val;
LLVector4 vec4b = lerp(vec4,vec4a,val);
- ensure("lerp failed", ((val1 ==vec4b.mV[VX])&& (val2 ==vec4b.mV[VY]) && (val3 ==vec4b.mV[VZ])&& (val4 ==vec4b.mV[VW])));
+ LLVector4 check(val1, val2, val3, val4);
+ ensure_equals("lerp failed", check, vec4b);
}
template<> template<>
diff --git a/indra/llmath/v4coloru.h b/indra/llmath/v4coloru.h
index fddad34978..704ce852d9 100644
--- a/indra/llmath/v4coloru.h
+++ b/indra/llmath/v4coloru.h
@@ -47,14 +47,7 @@
class LLColor4U
{
public:
- union
- {
- U8 mV[LENGTHOFCOLOR4U];
- U32 mAll;
- LLColor4* mSources;
- LLColor4U* mSourcesU;
- };
-
+ U8 mV[LENGTHOFCOLOR4U];
LLColor4U(); // Initializes LLColor4U to (0, 0, 0, 1)
LLColor4U(U8 r, U8 g, U8 b); // Initializes LLColor4U to (r, g, b, 1)
@@ -132,6 +125,9 @@ public:
return LLColor4(*this);
}
+ U32 asRGBA() const;
+ void fromRGBA( U32 aVal );
+
static LLColor4U white;
static LLColor4U black;
static LLColor4U red;
@@ -565,6 +561,26 @@ void LLColor4U::setVecScaleClamp(const LLColor3& color)
mV[3] = 255;
}
+inline U32 LLColor4U::asRGBA() const
+{
+ // Little endian: values are swapped in memory. The original code access the array like a U32, so we need to swap here
+
+ return (mV[3] << 24) | (mV[2] << 16) | (mV[1] << 8) | mV[0];
+}
+
+inline void LLColor4U::fromRGBA( U32 aVal )
+{
+ // Little endian: values are swapped in memory. The original code access the array like a U32, so we need to swap here
+
+ mV[ 0 ] = aVal & 0xFF;
+ aVal >>= 8;
+ mV[ 1 ] = aVal & 0xFF;
+ aVal >>= 8;
+ mV[ 2 ] = aVal & 0xFF;
+ aVal >>= 8;
+ mV[ 3 ] = aVal & 0xFF;
+}
+
#endif
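
The rowMul/matMul additions to llmatrix4a.h above build a 4x4 product one row at a time: each component of a row of `a` is broadcast across a register, multiplied against the matching row of `b`, and the four partial products are summed. Below is a minimal standalone sketch of the same technique using raw __m128 registers instead of the viewer's LLMatrix4a/LLVector4a types; the Mat4, row_mul, and mat_mul names are illustrative only, not part of the viewer API.

#include <xmmintrin.h>  // SSE intrinsics

// Illustrative stand-in for a row-major 4x4 matrix of packed floats.
struct Mat4 { __m128 row[4]; };

// Multiply one row vector by a matrix: splat each component of the row,
// scale the matching matrix row, and accumulate the four partial products.
static inline __m128 row_mul(__m128 r, const Mat4& m)
{
    __m128 out = _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0)), m.row[0]);
    out = _mm_add_ps(out, _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1)), m.row[1]));
    out = _mm_add_ps(out, _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2)), m.row[2]));
    out = _mm_add_ps(out, _mm_mul_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3)), m.row[3]));
    return out;
}

// res = a * b, computed row by row as matMul does. All four rows are
// computed before storing so the call stays correct if res aliases a or b.
static inline void mat_mul(const Mat4& a, const Mat4& b, Mat4& res)
{
    __m128 r0 = row_mul(a.row[0], b);
    __m128 r1 = row_mul(a.row[1], b);
    __m128 r2 = row_mul(a.row[2], b);
    __m128 r3 = row_mul(a.row[3], b);
    res.row[0] = r0; res.row[1] = r1; res.row[2] = r2; res.row[3] = r3;
}

int main()
{
    Mat4 identity = {{ _mm_setr_ps(1,0,0,0), _mm_setr_ps(0,1,0,0),
                       _mm_setr_ps(0,0,1,0), _mm_setr_ps(0,0,0,1) }};
    Mat4 out;
    mat_mul(identity, identity, out);   // identity * identity == identity
    return 0;
}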
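
In the unpackVolumeFaces change above, each 16-bit influence is converted to a float weight, clamped to [0.001, 0.999], and then stored as `(F32) joints[k] + wght[k]`, so the joint index lives in the float's integer part and the weight in its fractional part; the new llassert checks that the fraction stays non-zero. A standalone sketch of that packing scheme under the same clamping assumption (pack_influence/unpack_influence are illustrative helpers, not viewer functions):

#include <cassert>
#include <cmath>
#include <cstdint>

// Pack a joint index (integer part) and a weight (fractional part) into one float.
// Clamping the weight to [0.001, 0.999] keeps the fraction non-zero and below 1,
// so the joint index can always be recovered by truncation.
static float pack_influence(uint16_t raw_weight, int joint)
{
    float w = raw_weight / 65535.0f;
    if (w < 0.001f) w = 0.001f;
    if (w > 0.999f) w = 0.999f;
    return static_cast<float>(joint) + w;
}

static void unpack_influence(float packed, int& joint, float& weight)
{
    joint = static_cast<int>(packed);        // integer part: joint index
    weight = packed - std::floor(packed);    // fractional part: weight
}

int main()
{
    float packed = pack_influence(0, 17);    // even a raw weight of 0 yields a usable fraction
    int joint; float weight;
    unpack_influence(packed, joint, weight);
    assert(joint == 17 && weight > 0.0f);    // same invariant the new llassert enforces
    return 0;
}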
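
The new asRGBA/fromRGBA methods in v4coloru.h replace the removed union's U32 view of the color: on a little-endian host, reading mV through a U32 put mV[0] (red) in the least-significant byte, which is the byte order the shifts reproduce. A standalone round-trip sketch of the same packing, using plain uint8_t/uint32_t rather than the LLColor4U API (pack_rgba/unpack_rgba are illustrative names):

#include <cassert>
#include <cstdint>

// Pack R,G,B,A bytes into a 32-bit value with R in the least-significant byte,
// matching what reading the old union's U32 member produced on little-endian hosts.
static uint32_t pack_rgba(const uint8_t v[4])
{
    return (uint32_t(v[3]) << 24) | (uint32_t(v[2]) << 16) | (uint32_t(v[1]) << 8) | v[0];
}

static void unpack_rgba(uint32_t packed, uint8_t v[4])
{
    for (int i = 0; i < 4; ++i)
    {
        v[i] = packed & 0xFF;   // peel one byte at a time, low byte first
        packed >>= 8;
    }
}

int main()
{
    const uint8_t color[4] = { 0x12, 0x34, 0x56, 0xFF };    // R, G, B, A
    uint8_t out[4];
    unpack_rgba(pack_rgba(color), out);
    for (int i = 0; i < 4; ++i) assert(out[i] == color[i]); // lossless round trip
    return 0;
}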