42 files changed, 4663 insertions, 1054 deletions
diff --git a/indra/llmath/CMakeLists.txt b/indra/llmath/CMakeLists.txt
index e93fe90650..9dadad7dd3 100644
--- a/indra/llmath/CMakeLists.txt
+++ b/indra/llmath/CMakeLists.txt
@@ -15,13 +15,16 @@ set(llmath_SOURCE_FILES
     llcamera.cpp
     llcoordframe.cpp
     llline.cpp
+    llmatrix3a.cpp
     llmodularmath.cpp
     llperlin.cpp
     llquaternion.cpp
     llrect.cpp
     llsphere.cpp
+    llvector4a.cpp
     llvolume.cpp
     llvolumemgr.cpp
+    llvolumeoctree.cpp
     llsdutil_math.cpp
     m3math.cpp
     m4math.cpp
@@ -49,21 +52,32 @@ set(llmath_HEADER_FILES
     llinterp.h
     llline.h
     llmath.h
+    llmatrix3a.h
+    llmatrix3a.inl
     llmodularmath.h
     lloctree.h
     llperlin.h
     llplane.h
     llquantize.h
     llquaternion.h
+    llquaternion2.h
+    llquaternion2.inl
     llrect.h
+    llsimdmath.h
+    llsimdtypes.h
+    llsimdtypes.inl
     llsphere.h
     lltreenode.h
+    llvector4a.h
+    llvector4a.inl
+    llvector4logical.h
     llv4math.h
     llv4matrix3.h
     llv4matrix4.h
     llv4vector3.h
     llvolume.h
     llvolumemgr.h
+    llvolumeoctree.h
     llsdutil_math.h
     m3math.h
     m4math.h
diff --git a/indra/llmath/llcamera.cpp b/indra/llmath/llcamera.cpp
index 487ed6451f..beb5c48624 100644
--- a/indra/llmath/llcamera.cpp
+++ b/indra/llmath/llcamera.cpp
@@ -48,10 +48,10 @@ LLCamera::LLCamera() :
 	mPlaneCount(6),
 	mFrustumCornerDist(0.f)
 {
+	alignPlanes();
 	calculateFrustumPlanes();
 } 
 
-
 LLCamera::LLCamera(F32 vertical_fov_rads, F32 aspect_ratio, S32 view_height_in_pixels, F32 near_plane, F32 far_plane) :
 	LLCoordFrame(),
 	mViewHeightInPixels(view_height_in_pixels),
@@ -59,6 +59,7 @@ LLCamera::LLCamera(F32 vertical_fov_rads, F32 aspect_ratio, S32 view_height_in_p
 	mPlaneCount(6),
 	mFrustumCornerDist(0.f)
 {
+	alignPlanes();
 	mAspect = llclamp(aspect_ratio, MIN_ASPECT_RATIO, MAX_ASPECT_RATIO);
 	mNearPlane = llclamp(near_plane, MIN_NEAR_PLANE, MAX_NEAR_PLANE);
 	if(far_plane < 0) far_plane = DEFAULT_FAR_PLANE;
@@ -67,6 +68,23 @@ LLCamera::LLCamera(F32 vertical_fov_rads, F32 aspect_ratio, S32 view_height_in_p
 	setView(vertical_fov_rads);
 } 
 
+LLCamera::~LLCamera()
+{
+
+}
+
+const LLCamera& LLCamera::operator=(const LLCamera& rhs)
+{
+	memcpy(this, &rhs, sizeof(LLCamera));
+	alignPlanes();
+	LLVector4a::memcpyNonAliased16((F32*) mAgentPlanes, (F32*) rhs.mAgentPlanes, 4*7*sizeof(F32));
+	return *this;
+}
+
+void LLCamera::alignPlanes()
+{
+	mAgentPlanes = (LLPlane*) LL_NEXT_ALIGNED_ADDRESS<U8>(mAgentPlaneBuffer);
+}
 
 // ---------------- LLCamera::getFoo() member functions ----------------
 
@@ -91,8 +109,8 @@ F32 LLCamera::getMaxView() const
 void LLCamera::setUserClipPlane(LLPlane plane)
 {
 	mPlaneCount = 7;
-	mAgentPlanes[6].p = plane;
-	mAgentPlanes[6].mask = calcPlaneMask(plane);
+	mAgentPlanes[6] = plane;
+	mPlaneMask[6] = calcPlaneMask(plane);
 }
 
 void LLCamera::disableUserClipPlane()
@@ -164,129 +182,66 @@ size_t LLCamera::readFrustumFromBuffer(const char *buffer)
 
 // ---------------- test methods  ---------------- 
 
-S32 LLCamera::AABBInFrustum(const LLVector3 &center, const LLVector3& radius) 
-{
-	static const LLVector3 scaler[] = {
-		LLVector3(-1,-1,-1),
-		LLVector3( 1,-1,-1),
-		LLVector3(-1, 1,-1),
-		LLVector3( 1, 1,-1),
-		LLVector3(-1,-1, 1),
-		LLVector3( 1,-1, 1),
-		LLVector3(-1, 1, 1),
-		LLVector3( 1, 1, 1)
+S32 LLCamera::AABBInFrustum(const LLVector4a &center, const LLVector4a& radius) 
+{
+	static const LLVector4a scaler[] = {
+		LLVector4a(-1,-1,-1),
+		LLVector4a( 1,-1,-1),
+		LLVector4a(-1, 1,-1),
+		LLVector4a( 1, 1,-1),
+		LLVector4a(-1,-1, 1),
+		LLVector4a( 1,-1, 1),
+		LLVector4a(-1, 1, 1),
+		LLVector4a( 1, 1, 1)
 	};
 
 	U8 mask = 0;
 	S32 result = 2;
 
-	/*if (mFrustumCornerDist > 0.f && radius.magVecSquared() > mFrustumCornerDist * mFrustumCornerDist)
-	{ //box is larger than frustum, check frustum quads against box planes
-
-		static const LLVector3 dir[] = 
-		{
-			LLVector3(1, 0, 0),
-			LLVector3(-1, 0, 0),
-			LLVector3(0, 1, 0),
-			LLVector3(0, -1, 0),
-			LLVector3(0, 0, 1),
-			LLVector3(0, 0, -1)
-		};
-
-		U32 quads[] = 
+	for (U32 i = 0; i < mPlaneCount; i++)
+	{
+		mask = mPlaneMask[i];
+		if (mask == 0xff)
 		{
-			0, 1, 2, 3,
-			0, 1, 5, 4,
-			2, 3, 7, 6,
-			3, 0, 7, 4,
-			1, 2, 6, 4,
-			4, 5, 6, 7
-		};
-
-		result = 0;
-
-		BOOL total_inside = TRUE;
-		for (U32 i = 0; i < 6; i++)
-		{ 
-			LLVector3 p = center + radius.scaledVec(dir[i]);
-			F32 d = -p*dir[i];
-
-			for (U32 j = 0; j <	6; j++)
-			{ //for each quad
-				F32 dist = mAgentFrustum[quads[j*4+0]]*dir[i] + d;
-				if (dist > 0)
-				{ //at least one frustum point is outside the AABB
-					total_inside = FALSE;
-					for (U32 k = 1; k < 4; k++)
-					{ //for each other point on quad
-						if ( mAgentFrustum[quads[j*4+k]]*dir[i]+d  <= 0.f)
-						{ //quad is straddling some plane of AABB
-							return 1;
-						}
-					}
-				}
-				else
-				{
-					for (U32 k = 1; k < 4; k++)
-					{
-						if (mAgentFrustum[quads[j*4+k]]*dir[i]+d > 0.f)
-						{
-							return 1;
-						}
-					}
-				}
-			}
+			continue;
 		}
 
-		if (total_inside)
+		const LLPlane& p = mAgentPlanes[i];
+		const LLVector4a& n = reinterpret_cast<const LLVector4a&>(p);
+		float d = p.mV[3];
+		LLVector4a rscale;
+		rscale.setMul(radius, scaler[mask]);
+
+		LLVector4a minp, maxp;
+		minp.setSub(center, rscale);
+		maxp.setAdd(center, rscale);
+
+		if (n.dot3(minp) > -d) 
 		{
-			result = 1;
+			return 0;
 		}
-	}
-	else*/
-	{
-		for (U32 i = 0; i < mPlaneCount; i++)
+	
+		if (n.dot3(maxp) > -d)
 		{
-			mask = mAgentPlanes[i].mask;
-			if (mask == 0xff)
-			{
-				continue;
-			}
-			LLPlane p = mAgentPlanes[i].p;
-			LLVector3 n = LLVector3(p);
-			float d = p.mV[3];
-			LLVector3 rscale = radius.scaledVec(scaler[mask]);
-
-			LLVector3 minp = center - rscale;
-			LLVector3 maxp = center + rscale;
-
-			if (n * minp > -d) 
-			{
-				return 0;
-			}
-		
-			if (n * maxp > -d)
-			{
-				result = 1;
-			}
+			result = 1;
 		}
 	}
 
-	
 	return result;
 }
 
-S32 LLCamera::AABBInFrustumNoFarClip(const LLVector3 &center, const LLVector3& radius) 
-{
-	static const LLVector3 scaler[] = {
-		LLVector3(-1,-1,-1),
-		LLVector3( 1,-1,-1),
-		LLVector3(-1, 1,-1),
-		LLVector3( 1, 1,-1),
-		LLVector3(-1,-1, 1),
-		LLVector3( 1,-1, 1),
-		LLVector3(-1, 1, 1),
-		LLVector3( 1, 1, 1)
+
+S32 LLCamera::AABBInFrustumNoFarClip(const LLVector4a& center, const LLVector4a& radius) 
+{
+	static const LLVector4a scaler[] = {
+		LLVector4a(-1,-1,-1),
+		LLVector4a( 1,-1,-1),
+		LLVector4a(-1, 1,-1),
+		LLVector4a( 1, 1,-1),
+		LLVector4a(-1,-1, 1),
+		LLVector4a( 1,-1, 1),
+		LLVector4a(-1, 1, 1),
+		LLVector4a( 1, 1, 1)
 	};
 
 	U8 mask = 0;
@@ -299,25 +254,28 @@ S32 LLCamera::AABBInFrustumNoFarClip(const LLVector3 &center, const LLVector3& r
 			continue;
 		}
 
-		mask = mAgentPlanes[i].mask;
+		mask = mPlaneMask[i];
 		if (mask == 0xff)
 		{
 			continue;
 		}
-		LLPlane p = mAgentPlanes[i].p;
-		LLVector3 n = LLVector3(p);
+
+		const LLPlane& p = mAgentPlanes[i];
+		const LLVector4a& n = reinterpret_cast<const LLVector4a&>(p);
 		float d = p.mV[3];
-		LLVector3 rscale = radius.scaledVec(scaler[mask]);
+		LLVector4a rscale;
+		rscale.setMul(radius, scaler[mask]);
 
-		LLVector3 minp = center - rscale;
-		LLVector3 maxp = center + rscale;
+		LLVector4a minp, maxp;
+		minp.setSub(center, rscale);
+		maxp.setAdd(center, rscale);
 
-		if (n * minp > -d) 
+		if (n.dot3(minp) > -d) 
 		{
 			return 0;
 		}
 	
-		if (n * maxp > -d)
+		if (n.dot3(maxp) > -d)
 		{
 			result = 1;
 		}
@@ -447,12 +405,12 @@ int LLCamera::sphereInFrustum(const LLVector3 &sphere_center, const F32 radius)
 	int res = 2;
 	for (int i = 0; i < 6; i++)
 	{
-		if (mAgentPlanes[i].mask == 0xff)
+		if (mPlaneMask[i] == 0xff)
 		{
 			continue;
 		}
 
-		float d = mAgentPlanes[i].p.dist(sphere_center);
+		float d = mAgentPlanes[i].dist(sphere_center);
 
 		if (d > radius) 
 		{
@@ -644,12 +602,14 @@ void LLCamera::ignoreAgentFrustumPlane(S32 idx)
 		return;
 	}
 
-	mAgentPlanes[idx].mask = 0xff;
-	mAgentPlanes[idx].p.clearVec();
+	mPlaneMask[idx] = 0xff;
+	mAgentPlanes[idx].clearVec();
 }
 
 void LLCamera::calcAgentFrustumPlanes(LLVector3* frust)
 {
+	alignPlanes();
+
 	for (int i = 0; i < 8; i++)
 	{
 		mAgentFrustum[i] = frust[i];
@@ -662,27 +622,27 @@ void LLCamera::calcAgentFrustumPlanes(LLVector3* frust)
 	//order of planes is important, keep most likely to fail in the front of the list
 
 	//near - frust[0], frust[1], frust[2]
-	mAgentPlanes[2].p = planeFromPoints(frust[0], frust[1], frust[2]);
+	mAgentPlanes[2] = planeFromPoints(frust[0], frust[1], frust[2]);
 
 	//far  
-	mAgentPlanes[5].p = planeFromPoints(frust[5], frust[4], frust[6]);
+	mAgentPlanes[5] = planeFromPoints(frust[5], frust[4], frust[6]);
 
 	//left  
-	mAgentPlanes[0].p = planeFromPoints(frust[4], frust[0], frust[7]);
+	mAgentPlanes[0] = planeFromPoints(frust[4], frust[0], frust[7]);
 
 	//right  
-	mAgentPlanes[1].p = planeFromPoints(frust[1], frust[5], frust[6]);
+	mAgentPlanes[1] = planeFromPoints(frust[1], frust[5], frust[6]);
 
 	//top  
-	mAgentPlanes[4].p = planeFromPoints(frust[3], frust[2], frust[6]);
+	mAgentPlanes[4] = planeFromPoints(frust[3], frust[2], frust[6]);
 
 	//bottom  
-	mAgentPlanes[3].p = planeFromPoints(frust[1], frust[0], frust[4]);
+	mAgentPlanes[3] = planeFromPoints(frust[1], frust[0], frust[4]);
 
 	//cache plane octant facing mask for use in AABBInFrustum
 	for (U32 i = 0; i < mPlaneCount; i++)
 	{
-		mAgentPlanes[i].mask = calcPlaneMask(mAgentPlanes[i].p);
+		mPlaneMask[i] = calcPlaneMask(mAgentPlanes[i]);
 	}
 }
 
diff --git a/indra/llmath/llcamera.h b/indra/llmath/llcamera.h
index 0c81067919..c40e819dcf 100644
--- a/indra/llmath/llcamera.h
+++ b/indra/llmath/llcamera.h
@@ -37,6 +37,7 @@
 #include "llmath.h"
 #include "llcoordframe.h"
 #include "llplane.h"
+#include "llvector4a.h"
 
 const F32 DEFAULT_FIELD_OF_VIEW 	= 60.f * DEG_TO_RAD;
 const F32 DEFAULT_ASPECT_RATIO 		= 640.f / 480.f;
@@ -79,6 +80,14 @@ class LLCamera
 : 	public LLCoordFrame
 {
 public:
+	
+	LLCamera(const LLCamera& rhs)
+	{
+		*this = rhs;
+	}
+
+	const LLCamera& operator=(const LLCamera& rhs);
+	
 	enum {
 		PLANE_LEFT = 0,
 		PLANE_RIGHT = 1,
@@ -129,13 +138,9 @@ private:
 	LLPlane mWorldPlanes[PLANE_NUM];
 	LLPlane mHorizPlanes[HORIZ_PLANE_NUM];
 
-	struct frustum_plane
-	{
-		frustum_plane() : mask(0) {}
-		LLPlane p;
-		U8 mask;
-	};
-	frustum_plane mAgentPlanes[7];  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
+	LLPlane* mAgentPlanes;  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
+	U8 mAgentPlaneBuffer[sizeof(LLPlane)*8];
+	U8 mPlaneMask[7];
 
 	U32 mPlaneCount;  //defaults to 6, if setUserClipPlane is called, uses user supplied clip plane in
 
@@ -143,12 +148,14 @@ private:
 public:
 	LLVector3 mAgentFrustum[8];  //8 corners of 6-plane frustum
 	F32	mFrustumCornerDist;		//distance to corner of frustum against far clip plane
-	LLPlane getAgentPlane(U32 idx) { return mAgentPlanes[idx].p; }
+	LLPlane& getAgentPlane(U32 idx) { return mAgentPlanes[idx]; }
 
 public:
 	LLCamera();
 	LLCamera(F32 vertical_fov_rads, F32 aspect_ratio, S32 view_height_in_pixels, F32 near_plane, F32 far_plane);
-	virtual ~LLCamera(){} // no-op virtual destructor
+	virtual ~LLCamera();
+	
+	void alignPlanes();
 
 	void setUserClipPlane(LLPlane plane);
 	void disableUserClipPlane();
@@ -199,8 +206,8 @@ public:
 	S32 sphereInFrustum(const LLVector3 &center, const F32 radius) const;
 	S32 pointInFrustum(const LLVector3 &point) const { return sphereInFrustum(point, 0.0f); }
 	S32 sphereInFrustumFull(const LLVector3 &center, const F32 radius) const { return sphereInFrustum(center, radius); }
-	S32 AABBInFrustum(const LLVector3 &center, const LLVector3& radius);
-	S32 AABBInFrustumNoFarClip(const LLVector3 &center, const LLVector3& radius);
+	S32 AABBInFrustum(const LLVector4a& center, const LLVector4a& radius);
+	S32 AABBInFrustumNoFarClip(const LLVector4a& center, const LLVector4a& radius);
 
 	//does a quick 'n dirty sphere-sphere check
 	S32 sphereInFrustumQuick(const LLVector3 &sphere_center, const F32 radius); 
diff --git a/indra/llmath/llmath.h b/indra/llmath/llmath.h
index 209b506c30..e572381b1a 100644
--- a/indra/llmath/llmath.h
+++ b/indra/llmath/llmath.h
@@ -35,7 +35,6 @@
 
 #include <cmath>
 #include <cstdlib>
-#include <complex>
 #include "lldefs.h"
 //#include "llstl.h" // *TODO: Remove when LLString is gone
 //#include "llstring.h" // *TODO: Remove when LLString is gone
@@ -61,29 +60,11 @@
 #endif
 
 // Single Precision Floating Point Routines
-#ifndef fsqrtf
-#define fsqrtf(x)		((F32)sqrt((F64)(x)))
-#endif
-#ifndef sqrtf
-#define sqrtf(x)		((F32)sqrt((F64)(x)))
-#endif
-
-#ifndef cosf
-#define cosf(x)		((F32)cos((F64)(x)))
-#endif
-#ifndef sinf
-#define sinf(x)		((F32)sin((F64)(x)))
-#endif
-#ifndef tanf
+// (There used to be more defined here, but they appeared to be redundant and 
+// were breaking some other includes. Removed by Falcon, reviewed by Andrew, 11/25/09)
+/*#ifndef tanf
 #define tanf(x)		((F32)tan((F64)(x)))
-#endif
-#ifndef acosf
-#define acosf(x)		((F32)acos((F64)(x)))
-#endif
-
-#ifndef powf
-#define powf(x,y) ((F32)pow((F64)(x),(F64)(y)))
-#endif
+#endif*/
 
 const F32	GRAVITY			= -9.8f;
 
@@ -203,7 +184,7 @@ inline S32 llfloor( F32 f )
 		}
 		return result;
 #else
-		return (S32)floorf(f);
+		return (S32)floor(f);
 #endif
 }
 
@@ -381,11 +362,14 @@ inline F32 snap_to_sig_figs(F32 foo, S32 sig_figs)
 		bar *= 10.f;
 	}
 
-	foo = (F32)llround(foo * bar);
+	//F32 new_foo = (F32)llround(foo * bar);
+	// the llround() implementation sucks.  Don't use it.
 
-	// shift back
-	foo /= bar;
-	return foo;
+	F32 sign = (foo > 0.f) ? 1.f : -1.f;
+	F32 new_foo = F32( S64(foo * bar + sign * 0.5f));
+	new_foo /= bar;
+
+	return new_foo;
 }
 
 inline F32 lerp(F32 a, F32 b, F32 u) 
@@ -519,4 +503,7 @@ inline F32 llgaussian(F32 x, F32 o)
 	return 1.f/(F_SQRT_TWO_PI*o)*powf(F_E, -(x*x)/(2*o*o));
 }
 
+// Include simd math header
+#include "llsimdmath.h"
+
 #endif
diff --git a/indra/llmath/llmatrix3a.cpp b/indra/llmath/llmatrix3a.cpp
new file mode 100644
index 0000000000..b7468f4914
--- /dev/null
+++ b/indra/llmath/llmatrix3a.cpp
@@ -0,0 +1,140 @@
+/** 
+ * @file llmatrix3a.cpp
+ * @brief SIMD 3x3 matrix (LLMatrix3a) implementation
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include "llmath.h"
+
+static LL_ALIGN_16(const F32 M_IDENT_3A[12]) = 
+												{	1.f, 0.f, 0.f, 0.f, // Column 1
+													0.f, 1.f, 0.f, 0.f, // Column 2
+													0.f, 0.f, 1.f, 0.f }; // Column 3
+
+extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*> (M_IDENT_3A);
+
+void LLMatrix3a::setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs )
+{
+	const LLVector4a col0 = lhs.getColumn(0);
+	const LLVector4a col1 = lhs.getColumn(1);
+	const LLVector4a col2 = lhs.getColumn(2);
+
+	for ( int i = 0; i < 3; i++ )
+	{
+		LLVector4a xxxx = _mm_load_ss( rhs.mColumns[i].getF32ptr() );
+		xxxx.splat<0>( xxxx );
+		xxxx.mul( col0 );
+
+		{
+			LLVector4a yyyy = _mm_load_ss( rhs.mColumns[i].getF32ptr() +  1 );
+			yyyy.splat<0>( yyyy );
+			yyyy.mul( col1 ); 
+			xxxx.add( yyyy );
+		}
+
+		{
+			LLVector4a zzzz = _mm_load_ss( rhs.mColumns[i].getF32ptr() +  2 );
+			zzzz.splat<0>( zzzz );
+			zzzz.mul( col2 );
+			xxxx.add( zzzz );
+		}
+
+		xxxx.store4a( mColumns[i].getF32ptr() );
+	}
+	
+}
+
+/*static */void LLMatrix3a::batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst )
+{
+	const LLVector4a col0 = xform.getColumn(0);
+	const LLVector4a col1 = xform.getColumn(1);
+	const LLVector4a col2 = xform.getColumn(2);
+	const LLVector4a* maxAddr = src + numVectors;
+
+	if ( numVectors & 0x1 )
+	{
+		LLVector4a xxxx = _mm_load_ss( (const F32*)src );
+		LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
+		LLVector4a zzzz = _mm_load_ss( (const F32*)src + 2 );
+		xxxx.splat<0>( xxxx );
+		yyyy.splat<0>( yyyy );
+		zzzz.splat<0>( zzzz );
+		xxxx.mul( col0 );
+		yyyy.mul( col1 ); 
+		zzzz.mul( col2 );
+		xxxx.add( yyyy );
+		xxxx.add( zzzz );
+		xxxx.store4a( (F32*)dst );
+		src++;
+		dst++;
+	}
+
+
+	numVectors >>= 1;
+	while ( src < maxAddr )
+	{
+		_mm_prefetch( (const char*)(src + 32 ), _MM_HINT_NTA );
+		_mm_prefetch( (const char*)(dst + 32), _MM_HINT_NTA );
+		LLVector4a xxxx = _mm_load_ss( (const F32*)src );
+		LLVector4a xxxx1= _mm_load_ss( (const F32*)(src + 1) );
+
+		xxxx.splat<0>( xxxx );
+		xxxx1.splat<0>( xxxx1 );
+		xxxx.mul( col0 );
+		xxxx1.mul( col0 );
+
+		{
+			LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
+			LLVector4a yyyy1 = _mm_load_ss( (const F32*)(src + 1) + 1);
+			yyyy.splat<0>( yyyy );
+			yyyy1.splat<0>( yyyy1 );
+			yyyy.mul( col1 );
+			yyyy1.mul( col1 );
+			xxxx.add( yyyy );
+			xxxx1.add( yyyy1 );
+		}
+
+		{
+			LLVector4a zzzz = _mm_load_ss( (const F32*)(src) + 2 );
+			LLVector4a zzzz1 = _mm_load_ss( (const F32*)(++src) + 2 );
+			zzzz.splat<0>( zzzz );
+			zzzz1.splat<0>( zzzz1 );
+			zzzz.mul( col2 );
+			zzzz1.mul( col2 );
+			xxxx.add( zzzz );
+			xxxx1.add( zzzz1 );
+		}
+
+		xxxx.store4a(dst->getF32ptr());
+		src++;
+		dst++;
+
+		xxxx1.store4a((F32*)dst++);
+	}
+}
diff --git a/indra/llmath/llmatrix3a.h b/indra/llmath/llmatrix3a.h
new file mode 100644
index 0000000000..56327f9f6d
--- /dev/null
+++ b/indra/llmath/llmatrix3a.h
@@ -0,0 +1,134 @@
+/** 
+ * @file llmatrix3a.h
+ * @brief LLMatrix3a class header file - memory aligned and vectorized 3x3 matrix
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef	LL_LLMATRIX3A_H
+#define	LL_LLMATRIX3A_H
+
+/////////////////////////////
+// LLMatrix3a, LLRotation
+/////////////////////////////
+// This class stores a 3x3 (technically 4x3) matrix in column-major order
+/////////////////////////////
+/////////////////////////////
+// These classes are intentionally minimal right now. If you need additional
+// functionality, please contact someone with SSE experience (e.g., Falcon or
+// Huseby).
+/////////////////////////////
+
+// LLMatrix3a is the base class for LLRotation, which should be used instead any time you're dealing with a 
+// rotation matrix.
+class LLMatrix3a
+{
+public:
+
+	// Utility function for quickly transforming an array of LLVector4a's
+	// For transforming a single LLVector4a, see LLVector4a::setRotated
+	static void batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst );
+
+	// Utility function to obtain the identity matrix
+	static inline const LLMatrix3a& getIdentity();
+
+	//////////////////////////
+	// Ctors
+	//////////////////////////
+	
+	// Ctor
+	LLMatrix3a() {}
+
+	// Ctor for setting by columns
+	inline LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 );
+
+	//////////////////////////
+	// Get/Set
+	//////////////////////////
+
+	// Loads from an LLMatrix3
+	inline void loadu(const LLMatrix3& src);
+	
+	// Set rows
+	inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2);
+	
+	// Set columns
+	inline void setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2);
+
+	// Get the read-only access to a specified column. Valid columns are 0-2, but the 
+	// function is unchecked. You've been warned.
+	inline const LLVector4a& getColumn(const U32 column) const;
+
+	/////////////////////////
+	// Matrix modification
+	/////////////////////////
+	
+	// Set this matrix to the product of lhs and rhs ( this = lhs * rhs )
+	void setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs );
+
+	// Set this matrix to the transpose of src
+	inline void setTranspose(const LLMatrix3a& src);
+
+	// Set this matrix to a*w + b*(1-w)
+	inline void setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w);
+
+	/////////////////////////
+	// Matrix inspection
+	/////////////////////////
+
+	// Sets all 4 elements in 'dest' to the determinant of this matrix.
+	// If you will be using the determinant in subsequent ops with LLVector4a, use this version
+	inline void getDeterminant( LLVector4a& dest ) const;
+
+	// Returns the determinant as an LLSimdScalar. Use this if you will be using the determinant
+	// primarily for scalar operations.
+	inline LLSimdScalar getDeterminant() const;
+
+	// Returns nonzero if rows 0-2 and columns 0-2 contain no NaN or INF values. Row 3 is ignored
+	inline LLBool32 isFinite() const;
+
+	// Returns true if this matrix is equal to 'rhs' up to 'tolerance'
+	inline bool isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
+
+protected:
+
+	LLVector4a mColumns[3];
+
+};
+
+class LLRotation : public LLMatrix3a
+{
+public:
+	
+	LLRotation() {}
+	
+	// Returns true if this rotation is orthonormal with det ~= 1
+	inline bool isOkRotation() const;		
+};
+
+#endif
diff --git a/indra/llmath/llmatrix3a.inl b/indra/llmath/llmatrix3a.inl
new file mode 100644
index 0000000000..65fd949f78
--- /dev/null
+++ b/indra/llmath/llmatrix3a.inl
@@ -0,0 +1,125 @@
+/** 
+ * @file llmatrix3a.inl
+ * @brief LLMatrix3a inline definitions
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include "llmatrix3a.h"
+#include "m3math.h"
+
+inline LLMatrix3a::LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 )
+{
+	setColumns( c0, c1, c2 );
+}
+
+inline void LLMatrix3a::loadu(const LLMatrix3& src)
+{
+	mColumns[0].load3(src.mMatrix[0]);
+	mColumns[1].load3(src.mMatrix[1]);
+	mColumns[2].load3(src.mMatrix[2]);
+}
+
+inline void LLMatrix3a::setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2)
+{
+	mColumns[0] = r0;
+	mColumns[1] = r1;
+	mColumns[2] = r2;
+	setTranspose( *this );
+}
+
+inline void LLMatrix3a::setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2)
+{
+	mColumns[0] = c0;
+	mColumns[1] = c1;
+	mColumns[2] = c2;
+}
+
+inline void LLMatrix3a::setTranspose(const LLMatrix3a& src)
+{
+	const LLQuad srcCol0 = src.mColumns[0];
+	const LLQuad srcCol1 = src.mColumns[1];
+	const LLQuad unpacklo = _mm_unpacklo_ps( srcCol0, srcCol1 );
+	mColumns[0] = _mm_movelh_ps( unpacklo, src.mColumns[2] );
+	mColumns[1] = _mm_shuffle_ps( _mm_movehl_ps( srcCol0, unpacklo ), src.mColumns[2], _MM_SHUFFLE(0, 1, 1, 0) );
+	mColumns[2] = _mm_shuffle_ps( _mm_unpackhi_ps( srcCol0, srcCol1 ), src.mColumns[2], _MM_SHUFFLE(0, 2, 1, 0) );
+}
+
+inline const LLVector4a& LLMatrix3a::getColumn(const U32 column) const
+{
+	llassert( column < 3 );
+	return mColumns[column];
+}
+
+inline void LLMatrix3a::setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w)
+{
+	mColumns[0].setLerp( a.mColumns[0], b.mColumns[0], w );
+	mColumns[1].setLerp( a.mColumns[1], b.mColumns[1], w );
+	mColumns[2].setLerp( a.mColumns[2], b.mColumns[2], w );
+}
+
+inline LLBool32 LLMatrix3a::isFinite() const
+{
+	return mColumns[0].isFinite3() && mColumns[1].isFinite3() && mColumns[2].isFinite3();
+}
+
+inline void LLMatrix3a::getDeterminant( LLVector4a& dest ) const
+{
+	LLVector4a col1xcol2; col1xcol2.setCross3( mColumns[1], mColumns[2] );
+	dest.setAllDot3( col1xcol2, mColumns[0] );
+}
+
+inline LLSimdScalar LLMatrix3a::getDeterminant() const
+{
+	LLVector4a col1xcol2; col1xcol2.setCross3( mColumns[1], mColumns[2] );
+	return col1xcol2.dot3( mColumns[0] );
+}
+
+inline bool LLMatrix3a::isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance /*= F_APPROXIMATELY_ZERO*/ ) const
+{
+	return rhs.getColumn(0).equals3(mColumns[0], tolerance) 
+		&& rhs.getColumn(1).equals3(mColumns[1], tolerance) 
+		&& rhs.getColumn(2).equals3(mColumns[2], tolerance); 
+}
+
+inline const LLMatrix3a& LLMatrix3a::getIdentity()
+{
+	extern const LLMatrix3a LL_M3A_IDENTITY;
+	return LL_M3A_IDENTITY;
+}
+
+inline bool LLRotation::isOkRotation() const
+{
+	LLMatrix3a transpose; transpose.setTranspose( *this );
+	LLMatrix3a product; product.setMul( *this, transpose );
+
+	LLSimdScalar detMinusOne = getDeterminant() - 1.f;
+
+	return product.isApproximatelyEqual( LLMatrix3a::getIdentity() ) && (detMinusOne.getAbs() < F_APPROXIMATELY_ZERO);
+}
+
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
new file mode 100644
index 0000000000..0ead045d04
--- /dev/null
+++ b/indra/llmath/llmatrix4a.h
@@ -0,0 +1,149 @@
+/** 
+ * @file llmatrix4a.h
+ * @brief LLMatrix4a class header file - memory aligned and vectorized 4x4 matrix
+ *
+ * $LicenseInfo:firstyear=2007&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef	LL_LLMATRIX4A_H
+#define	LL_LLMATRIX4A_H
+
+#include "llvector4a.h"
+#include "m4math.h"
+#include "m3math.h"
+
+class LLMatrix4a
+{
+public:
+	LLVector4a mMatrix[4];
+
+	inline void clear()
+	{
+		mMatrix[0].clear();
+		mMatrix[1].clear();
+		mMatrix[2].clear();
+		mMatrix[3].clear();
+	}
+
+	inline void loadu(const LLMatrix4& src)
+	{
+		mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]);
+		mMatrix[1] = _mm_loadu_ps(src.mMatrix[1]);
+		mMatrix[2] = _mm_loadu_ps(src.mMatrix[2]);
+		mMatrix[3] = _mm_loadu_ps(src.mMatrix[3]);
+		
+	}
+
+	inline void loadu(const LLMatrix3& src)
+	{
+		mMatrix[0].load3(src.mMatrix[0]);
+		mMatrix[1].load3(src.mMatrix[1]);
+		mMatrix[2].load3(src.mMatrix[2]);
+		mMatrix[3].set(0,0,0,1.f);
+	}
+
+	inline void add(const LLMatrix4a& rhs)
+	{
+		mMatrix[0].add(rhs.mMatrix[0]);
+		mMatrix[1].add(rhs.mMatrix[1]);
+		mMatrix[2].add(rhs.mMatrix[2]);
+		mMatrix[3].add(rhs.mMatrix[3]);
+	}
+
+	inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2)
+	{
+		mMatrix[0] = r0;
+		mMatrix[1] = r1;
+		mMatrix[2] = r2;
+	}
+
+	inline void setMul(const LLMatrix4a& m, const F32 s)
+	{
+		mMatrix[0].setMul(m.mMatrix[0], s);
+		mMatrix[1].setMul(m.mMatrix[1], s);
+		mMatrix[2].setMul(m.mMatrix[2], s);
+		mMatrix[3].setMul(m.mMatrix[3], s);
+	}
+
+	inline void setLerp(const LLMatrix4a& a, const LLMatrix4a& b, F32 w)
+	{
+		LLVector4a d0,d1,d2,d3;
+		d0.setSub(b.mMatrix[0], a.mMatrix[0]);
+		d1.setSub(b.mMatrix[1], a.mMatrix[1]);
+		d2.setSub(b.mMatrix[2], a.mMatrix[2]);
+		d3.setSub(b.mMatrix[3], a.mMatrix[3]);
+
+		// this = a + d*w
+		
+		d0.mul(w);
+		d1.mul(w);
+		d2.mul(w);
+		d3.mul(w);
+
+		mMatrix[0].setAdd(a.mMatrix[0],d0);
+		mMatrix[1].setAdd(a.mMatrix[1],d1);
+		mMatrix[2].setAdd(a.mMatrix[2],d2);
+		mMatrix[3].setAdd(a.mMatrix[3],d3);
+	}
+
+	inline void rotate(const LLVector4a& v, LLVector4a& res)
+	{
+		res = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
+		res.mul(mMatrix[0]);
+		
+		LLVector4a y;
+		y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
+		y.mul(mMatrix[1]);
+
+		LLVector4a z;
+		z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
+		z.mul(mMatrix[2]);
+
+		res.add(y);
+		res.add(z);
+	}
+
+	inline void affineTransform(const LLVector4a& v, LLVector4a& res)
+	{
+		LLVector4a x,y,z;
+
+		x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
+		y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
+		z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
+		
+		x.mul(mMatrix[0]);
+		y.mul(mMatrix[1]);
+		z.mul(mMatrix[2]);
+
+		x.add(y);
+		z.add(mMatrix[3]);
+		res.setAdd(x,z);
+	}
+};
+
+#endif
diff --git a/indra/llmath/lloctree.h b/indra/llmath/lloctree.h
index 2f34fb1bb0..432e9fbcd8 100644
--- a/indra/llmath/lloctree.h
+++ b/indra/llmath/lloctree.h
@@ -35,6 +35,7 @@
 
 #include "lltreenode.h"
 #include "v3math.h"
+#include "llvector4a.h"
 #include <vector>
 #include <set>
 
@@ -73,6 +74,13 @@ public:
 };
 
 template <class T>
+class LLOctreeTravelerDepthFirst : public LLOctreeTraveler<T>	//traveler that visits a node's children before the node itself
+{
+public:
+	virtual void traverse(const LLOctreeNode<T>* node);	//recurses into every child of node, then calls node->accept(this)
+};
+
+template <class T>
 class LLOctreeNode : public LLTreeNode<T>
 {
 public:
@@ -87,23 +95,22 @@ public:
 	typedef LLOctreeNode<T>		oct_node;
 	typedef LLOctreeListener<T>	oct_listener;
 
-	static const U8 OCTANT_POSITIVE_X = 0x01;
-	static const U8 OCTANT_POSITIVE_Y = 0x02;
-	static const U8 OCTANT_POSITIVE_Z = 0x04;
-		
-	LLOctreeNode(	LLVector3d center, 
-					LLVector3d size, 
+	LLOctreeNode(	const LLVector4a& center, 
+					const LLVector4a& size, 
 					BaseType* parent, 
-					U8 octant = 255)
+					S32 octant = -1)
 	:	mParent((oct_node*)parent), 
-		mCenter(center), 
-		mSize(size), 
 		mOctant(octant) 
 	{ 
+		mD = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*4);
+
+		mD[CENTER] = center;
+		mD[SIZE] = size;
+
 		updateMinMax();
-		if ((mOctant == 255) && mParent)
+		if ((mOctant == -1) && mParent)
 		{
-			mOctant = ((oct_node*) mParent)->getOctant(mCenter.mdV);
+			mOctant = ((oct_node*) mParent)->getOctant(mD[CENTER]);
 		}
 
 		clearChildren();
@@ -117,43 +124,30 @@ public:
 		{
 			delete getChild(i);
 		} 
+
+		ll_aligned_free_16(mD);
 	}
 
 	inline const BaseType* getParent()	const			{ return mParent; }
-	inline void setParent(BaseType* parent)			{ mParent = (oct_node*) parent; }
-	inline const LLVector3d& getCenter() const			{ return mCenter; }
-	inline const LLVector3d& getSize() const			{ return mSize; }
-	inline void setCenter(LLVector3d center)			{ mCenter = center; }
-	inline void setSize(LLVector3d size)				{ mSize = size; }
-    inline oct_node* getNodeAt(T* data)				{ return getNodeAt(data->getPositionGroup(), data->getBinRadius()); }
-	inline U8 getOctant() const						{ return mOctant; }
-	inline void setOctant(U8 octant)					{ mOctant = octant; }
+	inline void setParent(BaseType* parent)				{ mParent = (oct_node*) parent; }
+	inline const LLVector4a& getCenter() const			{ return mD[CENTER]; }
+	inline const LLVector4a& getSize() const			{ return mD[SIZE]; }
+	inline void setCenter(const LLVector4a& center)		{ mD[CENTER] = center; }
+	inline void setSize(const LLVector4a& size)			{ mD[SIZE] = size; }
+    inline oct_node* getNodeAt(T* data)					{ return getNodeAt(data->getPositionGroup(), data->getBinRadius()); }
+	inline S32 getOctant() const						{ return mOctant; }
+	inline void setOctant(S32 octant)					{ mOctant = octant; }
 	inline const oct_node*	getOctParent() const		{ return (const oct_node*) getParent(); }
 	inline oct_node* getOctParent() 					{ return (oct_node*) getParent(); }
 	
-	U8 getOctant(const F64 pos[]) const	//get the octant pos is in
+	S32 getOctant(const LLVector4a& pos) const			//get the octant pos is in
 	{
-		U8 ret = 0;
-
-		if (pos[0] > mCenter.mdV[0])
-		{
-			ret |= OCTANT_POSITIVE_X;
-		}
-		if (pos[1] > mCenter.mdV[1])
-		{
-			ret |= OCTANT_POSITIVE_Y;
-		}
-		if (pos[2] > mCenter.mdV[2])
-		{
-			ret |= OCTANT_POSITIVE_Z;
-		}
-
-		return ret;
+		return pos.greaterThan(mD[CENTER]).getGatheredBits() & 0x7;
 	}
 	
-	inline bool isInside(const LLVector3d& pos, const F64& rad) const
+	inline bool isInside(const LLVector4a& pos, const F32& rad) const
 	{
-		return rad <= mSize.mdV[0]*2.0 && isInside(pos); 
+		return rad <= mD[SIZE][0]*2.f && isInside(pos); 
 	}
 
 	inline bool isInside(T* data) const			
@@ -161,29 +155,27 @@ public:
 		return isInside(data->getPositionGroup(), data->getBinRadius());
 	}
 
-	bool isInside(const LLVector3d& pos) const
+	bool isInside(const LLVector4a& pos) const
 	{
-		const F64& x = pos.mdV[0];
-		const F64& y = pos.mdV[1];
-		const F64& z = pos.mdV[2];
-			
-		if (x > mMax.mdV[0] || x <= mMin.mdV[0] ||
-			y > mMax.mdV[1] || y <= mMin.mdV[1] ||
-			z > mMax.mdV[2] || z <= mMin.mdV[2])
+		S32 gt = pos.greaterThan(mD[MAX]).getGatheredBits() & 0x7;
+		if (gt)
 		{
 			return false;
 		}
-		
+
+		S32 lt = pos.lessEqual(mD[MIN]).getGatheredBits() & 0x7;
+		if (lt)
+		{
+			return false;
+		}
+				
 		return true;
 	}
 	
 	void updateMinMax()
 	{
-		for (U32 i = 0; i < 3; i++)
-		{
-			mMax.mdV[i] = mCenter.mdV[i] + mSize.mdV[i];
-			mMin.mdV[i] = mCenter.mdV[i] - mSize.mdV[i];
-		}
+		mD[MAX].setAdd(mD[CENTER], mD[SIZE]);
+		mD[MIN].setSub(mD[CENTER], mD[SIZE]);
 	}
 
 	inline oct_listener* getOctListener(U32 index) 
@@ -196,34 +188,34 @@ public:
 		return contains(xform->getBinRadius());
 	}
 
-	bool contains(F64 radius)
+	bool contains(F32 radius)
 	{
 		if (mParent == NULL)
 		{	//root node contains nothing
 			return false;
 		}
 
-		F64 size = mSize.mdV[0];
-		F64 p_size = size * 2.0;
+		F32 size = mD[SIZE][0];
+		F32 p_size = size * 2.f;
 
-		return (radius <= 0.001 && size <= 0.001) ||
+		return (radius <= 0.001f && size <= 0.001f) ||
 				(radius <= p_size && radius > size);
 	}
 
-	static void pushCenter(LLVector3d &center, const LLVector3d &size, const T* data)
+	static void pushCenter(LLVector4a &center, const LLVector4a &size, const T* data)
 	{
-		const LLVector3d& pos = data->getPositionGroup();
-		for (U32 i = 0; i < 3; i++)
-		{
-			if (pos.mdV[i] > center.mdV[i])
-			{
-				center.mdV[i] += size.mdV[i];
-			}
-			else 
-			{
-				center.mdV[i] -= size.mdV[i];
-			}
-		}
+		const LLVector4a& pos = data->getPositionGroup();
+		//per-lane mask from pos > center; branch-free replacement for the removed per-axis if/else
+		LLVector4a gt = pos.greaterThan(center);
+		//up keeps size only in the lanes where the mask is set
+		LLVector4a up;
+		up = _mm_and_ps(size, gt);
+		//down keeps size only in the remaining lanes (_mm_andnot_ps computes ~gt & size)
+		LLVector4a down;
+		down = _mm_andnot_ps(gt, size);
+		//up and down are disjoint, so each lane of center moves by +size or by -size, never both
+		center.add(up);
+		center.sub(down);
 	}
 
 	void accept(oct_traveler* visitor)				{ visitor->visit(this); }
@@ -242,21 +234,21 @@ public:
 	void accept(tree_traveler* visitor) const		{ visitor->visit(this); }
 	void accept(oct_traveler* visitor) const		{ visitor->visit(this); }
 	
-	oct_node* getNodeAt(const LLVector3d& pos, const F64& rad)
+	oct_node* getNodeAt(const LLVector4a& pos, const F32& rad)
 	{ 
 		LLOctreeNode<T>* node = this;
 
 		if (node->isInside(pos, rad))
 		{		
 			//do a quick search by octant
-			U8 octant = node->getOctant(pos.mdV);
+			S32 octant = node->getOctant(pos);
 			BOOL keep_going = TRUE;
 
 			//traverse the tree until we find a node that has no node
 			//at the appropriate octant or is smaller than the object.  
 			//by definition, that node is the smallest node that contains 
 			// the data
-			while (keep_going && node->getSize().mdV[0] >= rad)
+			while (keep_going && node->getSize()[0] >= rad)
 			{	
 				keep_going = FALSE;
 				for (U32 i = 0; i < node->getChildCount() && !keep_going; i++)
@@ -264,7 +256,7 @@ public:
 					if (node->getChild(i)->getOctant() == octant)
 					{
 						node = node->getChild(i);
-						octant = node->getOctant(pos.mdV);
+						octant = node->getOctant(pos);
 						keep_going = TRUE;
 					}
 				}
@@ -282,7 +274,7 @@ public:
 	{
 		if (data == NULL)
 		{
-			//OCT_ERRS << "!!! INVALID ELEMENT ADDED TO OCTREE BRANCH !!!" << llendl;
+			OCT_ERRS << "!!! INVALID ELEMENT ADDED TO OCTREE BRANCH !!!" << llendl;
 			return false;
 		}
 		LLOctreeNode<T>* parent = getOctParent();
@@ -292,7 +284,7 @@ public:
 		{
 			if (getElementCount() < LL_OCTREE_MAX_CAPACITY &&
 				(contains(data->getBinRadius()) ||
-				(data->getBinRadius() > getSize().mdV[0] &&
+				(data->getBinRadius() > getSize()[0] &&
 				parent && parent->getElementCount() >= LL_OCTREE_MAX_CAPACITY))) 
 			{ //it belongs here
 #if LL_OCTREE_PARANOIA_CHECK
@@ -323,16 +315,21 @@ public:
 				}
 				
 				//it's here, but no kids are in the right place, make a new kid
-				LLVector3d center(getCenter());
-				LLVector3d size(getSize()*0.5);
+				LLVector4a center = getCenter();
+				LLVector4a size = getSize();
+				size.mul(0.5f);
 		        		
 				//push center in direction of data
 				LLOctreeNode<T>::pushCenter(center, size, data);
 
 				// handle case where floating point number gets too small
-				if( llabs(center.mdV[0] - getCenter().mdV[0]) < F_APPROXIMATELY_ZERO &&
-					llabs(center.mdV[1] - getCenter().mdV[1]) < F_APPROXIMATELY_ZERO &&
-					llabs(center.mdV[2] - getCenter().mdV[2]) < F_APPROXIMATELY_ZERO)
+				LLVector4a val;
+				val.setSub(center, getCenter());
+				val.setAbs(val);
+								
+				S32 lt = val.lessThan(LLVector4a::getEpsilon()).getGatheredBits() & 0x7;
+
+				if( lt == 0x7 )
 				{
 					mData.insert(data);
 					BaseType::insert(data);
@@ -350,7 +347,7 @@ public:
 				//make sure no existing node matches this position
 				for (U32 i = 0; i < getChildCount(); i++)
 				{
-					if (mChild[i]->getCenter() == center)
+					if (mChild[i]->getCenter().equal3(center))
 					{
 						OCT_ERRS << "Octree detected duplicate child center and gave up." << llendl;
 						return false;
@@ -368,7 +365,7 @@ public:
 		else 
 		{
 			//it's not in here, give it to the root
-			//OCT_ERRS << "Octree insertion failed, starting over from root!" << llendl;
+			OCT_ERRS << "Octree insertion failed, starting over from root!" << llendl;
 
 			oct_node* node = this;
 
@@ -475,13 +472,19 @@ public:
 	void addChild(oct_node* child, BOOL silent = FALSE) 
 	{
 #if LL_OCTREE_PARANOIA_CHECK
+
+		if (child->getSize().equal3(getSize()))
+		{
+			OCT_ERRS << "Child size is same as parent size!" << llendl;
+		}
+
 		for (U32 i = 0; i < getChildCount(); i++)
 		{
-			if(mChild[i]->getSize() != child->getSize()) 
+			if(!mChild[i]->getSize().equal3(child->getSize())) 
 			{
 				OCT_ERRS <<"Invalid octree child size." << llendl;
 			}
-			if (mChild[i]->getCenter() == child->getCenter())
+			if (mChild[i]->getCenter().equal3(child->getCenter()))
 			{
 				OCT_ERRS <<"Duplicate octree child position." << llendl;
 			}
@@ -506,7 +509,7 @@ public:
 		}
 	}
 
-	void removeChild(U8 index, BOOL destroy = FALSE)
+	void removeChild(S32 index, BOOL destroy = FALSE)
 	{
 		for (U32 i = 0; i < this->getListenerCount(); i++)
 		{
@@ -547,18 +550,26 @@ public:
 			}
 		}
 
-		//OCT_ERRS << "Octree failed to delete requested child." << llendl;
+		OCT_ERRS << "Octree failed to delete requested child." << llendl;
 	}
 
 protected:	
+	typedef enum
+	{
+		CENTER = 0,
+		SIZE = 1,
+		MAX = 2,
+		MIN = 3
+	} eDName;
+
+	LLVector4a* mD;
+	
+	oct_node* mParent;
+	S32 mOctant;
+
 	child_list mChild;
 	element_list mData;
-	oct_node* mParent;
-	LLVector3d mCenter;
-	LLVector3d mSize;
-	LLVector3d mMax;
-	LLVector3d mMin;
-	U8 mOctant;
+		
 };
 
 //just like a regular node, except it might expand on insert and compress on balance
@@ -569,9 +580,9 @@ public:
 	typedef LLOctreeNode<T>	BaseType;
 	typedef LLOctreeNode<T>		oct_node;
 
-	LLOctreeRoot(	LLVector3d center, 
-					LLVector3d size, 
-					BaseType* parent)
+	LLOctreeRoot(const LLVector4a& center, 
+				 const LLVector4a& size, 
+				 BaseType* parent)
 	:	BaseType(center, size, parent)
 	{
 	}
@@ -612,28 +623,33 @@ public:
 	{
 		if (data == NULL) 
 		{
-			//OCT_ERRS << "!!! INVALID ELEMENT ADDED TO OCTREE ROOT !!!" << llendl;
+			OCT_ERRS << "!!! INVALID ELEMENT ADDED TO OCTREE ROOT !!!" << llendl;
 			return false;
 		}
 		
 		if (data->getBinRadius() > 4096.0)
 		{
-			//OCT_ERRS << "!!! ELEMENT EXCEEDS MAXIMUM SIZE IN OCTREE ROOT !!!" << llendl;
+			OCT_ERRS << "!!! ELEMENT EXCEEDS MAXIMUM SIZE IN OCTREE ROOT !!!" << llendl;
 			return false;
 		}
 		
-		const F64 MAX_MAG = 1024.0*1024.0;
+		LLVector4a MAX_MAG;
+		MAX_MAG.splat(1024.f*1024.f);
+
+		const LLVector4a& v = data->getPositionGroup();
 
-		const LLVector3d& v = data->getPositionGroup();
-		if (!(fabs(v.mdV[0]-this->mCenter.mdV[0]) < MAX_MAG &&
-		      fabs(v.mdV[1]-this->mCenter.mdV[1]) < MAX_MAG &&
-		      fabs(v.mdV[2]-this->mCenter.mdV[2]) < MAX_MAG))
+		LLVector4a val;
+		val.setSub(v, BaseType::mD[BaseType::CENTER]);
+		val.setAbs(val);
+		S32 lt = val.lessThan(MAX_MAG).getGatheredBits() & 0x7;
+
+		if (lt != 0x7)
 		{
-			//OCT_ERRS << "!!! ELEMENT EXCEEDS RANGE OF SPATIAL PARTITION !!!" << llendl;
+			OCT_ERRS << "!!! ELEMENT EXCEEDS RANGE OF SPATIAL PARTITION !!!" << llendl;
 			return false;
 		}
 
-		if (this->getSize().mdV[0] > data->getBinRadius() && isInside(data->getPositionGroup()))
+		if (this->getSize()[0] > data->getBinRadius() && isInside(data->getPositionGroup()))
 		{
 			//we got it, just act like a branch
 			oct_node* node = getNodeAt(data);
@@ -649,31 +665,34 @@ public:
 		else if (this->getChildCount() == 0)
 		{
 			//first object being added, just wrap it up
-			while (!(this->getSize().mdV[0] > data->getBinRadius() && isInside(data->getPositionGroup())))
+			while (!(this->getSize()[0] > data->getBinRadius() && isInside(data->getPositionGroup())))
 			{
-				LLVector3d center, size;
+				LLVector4a center, size;
 				center = this->getCenter();
 				size = this->getSize();
 				LLOctreeNode<T>::pushCenter(center, size, data);
 				this->setCenter(center);
-				this->setSize(size*2);
+				size.mul(2.f);
+				this->setSize(size);
 				this->updateMinMax();
 			}
 			LLOctreeNode<T>::insert(data);
 		}
 		else
 		{
-			while (!(this->getSize().mdV[0] > data->getBinRadius() && isInside(data->getPositionGroup())))
+			while (!(this->getSize()[0] > data->getBinRadius() && isInside(data->getPositionGroup())))
 			{
 				//the data is outside the root node, we need to grow
-				LLVector3d center(this->getCenter());
-				LLVector3d size(this->getSize());
+				LLVector4a center(this->getCenter());
+				LLVector4a size(this->getSize());
 
 				//expand this node
-				LLVector3d newcenter(center);
+				LLVector4a newcenter(center);
 				LLOctreeNode<T>::pushCenter(newcenter, size, data);
 				this->setCenter(newcenter);
-				this->setSize(size*2);
+				LLVector4a size2 = size;
+				size2.mul(2.f);
+				this->setSize(size2);
 				this->updateMinMax();
 
 				//copy our children to a new branch
@@ -710,4 +729,15 @@ void LLOctreeTraveler<T>::traverse(const LLOctreeNode<T>* node)
 		traverse(node->getChild(i));
 	}
 }
+
+template <class T>
+void LLOctreeTravelerDepthFirst<T>::traverse(const LLOctreeNode<T>* node)
+{	//post-order walk: every child subtree is traversed before node itself is visited
+	U32 child = 0;
+	while (child < node->getChildCount())
+		traverse(node->getChild(child++));
+
+	node->accept(this);
+}
+
 #endif
diff --git a/indra/llmath/llquantize.h b/indra/llmath/llquantize.h
index 2192427f07..c043f7f752 100644
--- a/indra/llmath/llquantize.h
+++ b/indra/llmath/llquantize.h
@@ -35,10 +35,16 @@
 #define LL_LLQUANTIZE_H
 
 const U16 U16MAX = 65535;
+LL_ALIGN_16( const F32 F_U16MAX_4A[4] ) = { 65535.f, 65535.f, 65535.f, 65535.f };
+
 const F32 OOU16MAX = 1.f/(F32)(U16MAX);
+LL_ALIGN_16( const F32 F_OOU16MAX_4A[4] ) = { OOU16MAX, OOU16MAX, OOU16MAX, OOU16MAX };
 
 const U8 U8MAX = 255;
+LL_ALIGN_16( const F32 F_U8MAX_4A[4] ) = { 255.f, 255.f, 255.f, 255.f };
+
 const F32 OOU8MAX = 1.f/(F32)(U8MAX);
+LL_ALIGN_16( const F32 F_OOU8MAX_4A[4] ) = { OOU8MAX, OOU8MAX, OOU8MAX, OOU8MAX };
 
 const U8 FIRSTVALIDCHAR = 54;
 const U8 MAXSTRINGVAL = U8MAX - FIRSTVALIDCHAR; //we don't allow newline or null 
diff --git a/indra/llmath/llquaternion.cpp b/indra/llmath/llquaternion.cpp
index fdcc19d657..73c5f4505e 100644
--- a/indra/llmath/llquaternion.cpp
+++ b/indra/llmath/llquaternion.cpp
@@ -32,9 +32,10 @@
 
 #include "linden_common.h"
 
+#include "llmath.h"	// for F_PI
+
 #include "llquaternion.h"
 
-#include "llmath.h"	// for F_PI
 //#include "vmath.h"
 #include "v3math.h"
 #include "v3dmath.h"
diff --git a/indra/llmath/llquaternion.h b/indra/llmath/llquaternion.h
index 0769f29f23..a7bb09fae3 100644
--- a/indra/llmath/llquaternion.h
+++ b/indra/llmath/llquaternion.h
@@ -33,7 +33,11 @@
 #ifndef LLQUATERNION_H
 #define LLQUATERNION_H
 
-#include "llmath.h"
+#include <iostream>
+
+#ifndef LLMATH_H //enforce specific include order to avoid tangling inline dependencies
+#error "Please include llmath.h first."
+#endif
 
 class LLVector4;
 class LLVector3;
diff --git a/indra/llmath/llquaternion2.h b/indra/llmath/llquaternion2.h
new file mode 100644
index 0000000000..dbb4afe312
--- /dev/null
+++ b/indra/llmath/llquaternion2.h
@@ -0,0 +1,111 @@
+/** 
+ * @file llquaternion2.h
+ * @brief LLQuaternion2 class header file - SIMD-enabled quaternion class
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef	LL_QUATERNION2_H
+#define	LL_QUATERNION2_H
+
+/////////////////////////////
+// LLQuaternion2
+/////////////////////////////
+// This class stores a quaternion x*i + y*j + z*k + w in <x, y, z, w> order
+// (i.e., w in high order element of vector)
+/////////////////////////////
+/////////////////////////////
+// These classes are intentionally minimal right now. If you need additional
+// functionality, please contact someone with SSE experience (e.g., Falcon or
+// Huseby).
+/////////////////////////////
+#include "llquaternion.h"
+
+class LLQuaternion2
+{
+public:
+
+	//////////////////////////
+	// Ctors
+	//////////////////////////
+	
+	// Default ctor: empty body, mQ is not explicitly set here
+	LLQuaternion2() {}
+
+	// Ctor from LLQuaternion (explicit, so an LLQuaternion is never converted implicitly)
+	explicit LLQuaternion2( const class LLQuaternion& quat );
+
+	//////////////////////////
+	// Get/Set
+	//////////////////////////
+
+	// Load from an LLQuaternion via LLVector4a::loadua(). Returns void, so assignments cannot be chained.
+	inline void operator=( const LLQuaternion& quat )
+	{
+		mQ.loadua( quat.mQ );
+	}
+
+	// Return the internal LLVector4a representation of the quaternion (read-only, then writable "Rw" variant)
+	inline const LLVector4a& getVector4a() const;
+	inline LLVector4a& getVector4aRw();
+
+	/////////////////////////
+	// Quaternion modification
+	/////////////////////////
+	
+	// Set this quaternion to the conjugate of src
+	inline void setConjugate(const LLQuaternion2& src);
+
+	// Renormalizes the quaternion. Assumes it has nonzero length.
+	inline void normalize();
+
+	// Quantize this quaternion to 8 bit precision, then renormalize
+	inline void quantize8();
+
+	// Quantize this quaternion to 16 bit precision, then renormalize
+	inline void quantize16();
+
+	/////////////////////////
+	// Quaternion inspection
+	/////////////////////////
+
+	// Return true if this quaternion is equal to 'rhs' within 'tolerance'.
+	// Note! Quaternions exhibit "double-cover", so any rotation has two equally valid
+	// quaternion representations and they will NOT compare equal.
+	inline bool equals(const LLQuaternion2& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
+
+	// Return true if all components are finite and the quaternion is normalized
+	inline bool isOkRotation() const;
+
+protected:
+
+	LLVector4a mQ;	// components held in <x, y, z, w> order
+
+};
+
+#endif
diff --git a/indra/llmath/llquaternion2.inl b/indra/llmath/llquaternion2.inl
new file mode 100644
index 0000000000..9a4274d6a4
--- /dev/null
+++ b/indra/llmath/llquaternion2.inl
@@ -0,0 +1,108 @@
+/** 
+ * @file llquaternion2.inl
+ * @brief LLQuaternion2 inline definitions
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include "llquaternion2.h"
+
+static const LLQuad LL_V4A_PLUS_ONE = {1.f, 1.f, 1.f, 1.f};	// second argument to mQ.quantize8()/quantize16() below
+static const LLQuad LL_V4A_MINUS_ONE = {-1.f, -1.f, -1.f, -1.f};	// first argument to mQ.quantize8()/quantize16() below
+
+// Ctor from LLQuaternion: passes quat.mQ[VX], [VY], [VZ], [VW] to mQ.set() in that order
+inline LLQuaternion2::LLQuaternion2( const LLQuaternion& quat )
+{
+	mQ.set(quat.mQ[VX], quat.mQ[VY], quat.mQ[VZ], quat.mQ[VW]);
+}
+
+//////////////////////////
+// Get/Set
+//////////////////////////
+
+// Return a read-only reference to the internal LLVector4a (this object's own mQ, not a copy)
+inline const LLVector4a& LLQuaternion2::getVector4a() const
+{
+	return mQ;
+}
+
+inline LLVector4a& LLQuaternion2::getVector4aRw()	// writable reference to mQ; callers can change the quaternion through it
+{
+	return mQ;
+}
+
+/////////////////////////
+// Quaternion modification
+/////////////////////////
+
+// Set this quaternion to the conjugate of src: XOR with 0x80000000 flips the sign bit of x, y and z; the w mask is 0
+inline void LLQuaternion2::setConjugate(const LLQuaternion2& src)
+{
+	static LL_ALIGN_16( const U32 F_QUAT_INV_MASK_4A[4] ) = { 0x80000000, 0x80000000, 0x80000000, 0x00000000 };
+	mQ = _mm_xor_ps(src.mQ, *reinterpret_cast<const LLQuad*>(&F_QUAT_INV_MASK_4A));	
+}
+
+// Renormalizes the quaternion in place via LLVector4a::normalize4(). Assumes it has nonzero length.
+inline void LLQuaternion2::normalize()
+{
+	mQ.normalize4();
+}
+
+// Quantize this quaternion to 8 bit precision, passing -1 and +1 to LLVector4a::quantize8
+inline void LLQuaternion2::quantize8()
+{
+	mQ.quantize8( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE );
+	normalize();	// renormalize afterwards; presumably quantization perturbs the length
+}
+
+// Quantize this quaternion to 16 bit precision, passing -1 and +1 to LLVector4a::quantize16
+inline void LLQuaternion2::quantize16()
+{
+	mQ.quantize16( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE );
+	normalize();	// renormalize afterwards; presumably quantization perturbs the length
+}
+
+
+/////////////////////////
+// Quaternion inspection
+/////////////////////////
+
+// Return true if this quaternion is equal to 'rhs', as decided by LLVector4a::equals4 with 'tolerance'.
+// Note! Quaternions exhibit "double-cover", so any rotation has two equally valid
+// quaternion representations and they will NOT compare equal.
+inline bool LLQuaternion2::equals(const LLQuaternion2 &rhs, F32 tolerance/* = F_APPROXIMATELY_ZERO*/) const
+{
+	return mQ.equals4(rhs.mQ, tolerance);
+}
+
+// Return true if all components are finite and the quaternion is normalized
+inline bool LLQuaternion2::isOkRotation() const
+{
+	return mQ.isFinite4() && mQ.isNormalized4();	// isNormalized4() only runs when isFinite4() passed (short-circuit)
+}
+
diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h
new file mode 100644
index 0000000000..9377bfdb53
--- /dev/null
+++ b/indra/llmath/llsimdmath.h
@@ -0,0 +1,95 @@
+/** 
+ * @file llsimdmath.h
+ * @brief Common header for SIMD-based math library (llvector4a, llmatrix3a, etc.)
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef	LL_SIMD_MATH_H
+#define	LL_SIMD_MATH_H
+
+#ifndef LLMATH_H
+#error "Please include llmath.h before this file."
+#endif
+
+#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 ) )
+#error SSE2 not enabled. LLVector4a and related class will not compile.
+#endif
+
+// Round a pointer up to the next 16-byte boundary (returned unchanged if already aligned).
+// size_t, not U32: a 32-bit cast truncates the address (or fails to compile) on 64-bit targets.
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address) 
+{ 
+	return reinterpret_cast<T*>((reinterpret_cast<size_t>(address) + 0xF) & ~static_cast<size_t>(0xF));
+}
+// Same rounding for 64-byte boundaries.  NOTE(review): assumes size_t is visible via llmath.h - confirm.
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address) 
+{ 
+	return reinterpret_cast<T*>((reinterpret_cast<size_t>(address) + 0x3F) & ~static_cast<size_t>(0x3F));
+}
+
+#if LL_LINUX || LL_DARWIN
+// Linux/Darwin builds: GCC-style attribute, written after the declarator.
+#define			LL_ALIGN_PREFIX(x)
+#define			LL_ALIGN_POSTFIX(x)		__attribute__((aligned(x)))
+
+#elif LL_WINDOWS
+// Windows builds: __declspec specifier, written before the declaration.
+#define			LL_ALIGN_PREFIX(x)		__declspec(align(x))
+#define			LL_ALIGN_POSTFIX(x)
+
+#else
+#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
+#endif
+// LL_ALIGN_16 wraps the whole declarator, e.g. LL_ALIGN_16( const F32 v[4] ) = { ... };
+#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
+
+
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "llsimdtypes.h"
+#include "llsimdtypes.inl"
+
+class LLMatrix3a;
+class LLRotation;
+class LLMatrix3;
+
+#include "llquaternion.h"
+
+#include "llvector4logical.h"
+#include "llvector4a.h"
+#include "llmatrix3a.h"
+#include "llquaternion2.h"
+#include "llvector4a.inl"
+#include "llmatrix3a.inl"
+#include "llquaternion2.inl"
+
+
+#endif //LL_SIMD_MATH_H
diff --git a/indra/llmath/llsimdtypes.h b/indra/llmath/llsimdtypes.h
new file mode 100644
index 0000000000..82e318c8bf
--- /dev/null
+++ b/indra/llmath/llsimdtypes.h
@@ -0,0 +1,130 @@
+/** 
+ * @file llsimdtypes.h
+ * @brief Declaration of basic SIMD math related types
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_SIMD_TYPES_H
+#define LL_SIMD_TYPES_H
+
+#ifndef LL_SIMD_MATH_H
+#error "Please include llmath.h before this file."
+#endif
+
+typedef __m128	LLQuad;
+
+
+#if LL_WINDOWS
+#pragma warning(push)
+#pragma warning( disable : 4800 3 ) // Disable warning about casting int to bool for this class.
+#if defined(_MSC_VER) && (_MSC_VER < 1500)
+// VC++ 2005 is missing these intrinsics
+// __forceinline is MSVC specific and attempts to override compiler inlining judgment. This is so
+// even in debug builds this call is a NOP.
+__forceinline const __m128 _mm_castsi128_ps( const __m128i a ) { return reinterpret_cast<const __m128&>(a); }
+__forceinline const __m128i _mm_castps_si128( const __m128 a ) { return reinterpret_cast<const __m128i&>(a); }
+#endif // _MSC_VER
+
+#endif // LL_WINDOWS
+
+class LLBool32	// boolean stored in an int; any nonzero value reads back as true
+{
+public:
+	inline LLBool32() {}	// m_bool is left uninitialized
+	inline LLBool32(int rhs) : m_bool(rhs) {}
+	inline LLBool32(unsigned int rhs) : m_bool(rhs) {}
+	inline LLBool32(bool rhs) : m_bool(rhs ? 1 : 0) {}
+	inline LLBool32& operator= (bool rhs) { m_bool = rhs ? 1 : 0; return *this; }
+	inline bool operator== (bool rhs) const { return (m_bool != 0) == rhs; }
+	inline bool operator!= (bool rhs) const { return (m_bool != 0) != rhs; }
+	inline operator bool() const { return m_bool != 0; }
+
+private:
+	int m_bool;	// int/unsigned inputs are stored as given; bool inputs are stored as 0 or 1
+};
+
+#if LL_WINDOWS
+#pragma warning(pop)
+#endif
+
+class LLSimdScalar	// wraps a single LLQuad (__m128) used as a scalar
+{
+public:
+	inline LLSimdScalar() {}
+	inline LLSimdScalar(LLQuad q) 
+	{ 
+		mQ = q; 
+	}
+	// Implicit from F32: _mm_set_ss puts f in the lowest lane and zeroes the other three.
+	inline LLSimdScalar(F32 f) 
+	{ 
+		mQ = _mm_set_ss(f); 
+	}
+	// Views the externally defined F_ZERO_4A quad as an LLSimdScalar (relies on mQ being the only member).
+	static inline const LLSimdScalar& getZero()
+	{
+		extern const LLQuad F_ZERO_4A;
+		return reinterpret_cast<const LLSimdScalar&>(F_ZERO_4A);
+	}
+	// The members below are only declared here.  NOTE(review): definitions presumably live in llsimdtypes.inl - confirm.
+	inline F32 getF32() const;
+
+	inline LLBool32 isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance = F_APPROXIMATELY_ZERO) const;
+
+	inline LLSimdScalar getAbs() const;
+
+	inline void setMax( const LLSimdScalar& a, const LLSimdScalar& b );
+	
+	inline void setMin( const LLSimdScalar& a, const LLSimdScalar& b );
+
+	inline LLSimdScalar& operator=(F32 rhs);
+
+	inline LLSimdScalar& operator+=(const LLSimdScalar& rhs);
+
+	inline LLSimdScalar& operator-=(const LLSimdScalar& rhs);
+
+	inline LLSimdScalar& operator*=(const LLSimdScalar& rhs);
+
+	inline LLSimdScalar& operator/=(const LLSimdScalar& rhs);
+	// Implicit conversion to the raw quad, returned by value.
+	inline operator LLQuad() const
+	{ 
+		return mQ; 
+	}
+	// Read-only reference to the raw quad held by this object.
+	inline const LLQuad& getQuad() const 
+	{ 
+		return mQ; 
+	}
+
+private:
+	LLQuad mQ;
+};
+
+#endif //LL_SIMD_TYPES_H
diff --git a/indra/llmath/llsimdtypes.inl b/indra/llmath/llsimdtypes.inl
new file mode 100644
index 0000000000..69c858e310
--- /dev/null
+++ b/indra/llmath/llsimdtypes.inl
@@ -0,0 +1,163 @@
+/** 
+ * @file llsimdtypes.inl
+ * @brief Inlined definitions of basic SIMD math related types
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+
+
+
+//////////////////
+// LLSimdScalar
+//////////////////
+
+inline LLSimdScalar operator+(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	// Work on a copy of 'a' and defer to the compound operator.
+	LLSimdScalar sum(a);
+	return sum += b;
+}
+
+inline LLSimdScalar operator-(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	// Work on a copy of 'a' and defer to the compound operator.
+	LLSimdScalar difference(a);
+	return difference -= b;
+}
+
+inline LLSimdScalar operator*(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	// Work on a copy of 'a' and defer to the compound operator.
+	LLSimdScalar product(a);
+	return product *= b;
+}
+
+inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	// Work on a copy of 'a' and defer to the compound operator.
+	LLSimdScalar quotient(a);
+	return quotient /= b;
+}
+
+inline LLSimdScalar operator-(const LLSimdScalar& a)
+{
+	static LL_ALIGN_16(const U32 signBits[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+	return LLSimdScalar(_mm_xor_ps(*reinterpret_cast<const LLQuad*>(signBits), a.getQuad())); // XOR flips the sign bit of every lane
+}
+
+inline LLBool32 operator==(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	return LLBool32(_mm_comieq_ss(a.getQuad(), b.getQuad())); // scalar compare: low lane only
+}
+
+inline LLBool32 operator!=(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	return LLBool32(_mm_comineq_ss(a.getQuad(), b.getQuad())); // scalar compare: low lane only
+}
+
+inline LLBool32 operator<(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	return LLBool32(_mm_comilt_ss(a.getQuad(), b.getQuad())); // scalar compare: low lane only
+}
+
+inline LLBool32 operator<=(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	return LLBool32(_mm_comile_ss(a.getQuad(), b.getQuad())); // scalar compare: low lane only
+}
+
+inline LLBool32 operator>(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	return LLBool32(_mm_comigt_ss(a.getQuad(), b.getQuad())); // scalar compare: low lane only
+}
+
+inline LLBool32 operator>=(const LLSimdScalar& a, const LLSimdScalar& b)
+{
+	return LLBool32(_mm_comige_ss(a.getQuad(), b.getQuad())); // scalar compare: low lane only
+}
+
+inline LLBool32 LLSimdScalar::isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance /* = F_APPROXIMATELY_ZERO */) const
+{
+	// |this - rhs| in the low lane, compared against the tolerance with operator<=.
+	const LLSimdScalar delta( _mm_sub_ss( mQ, rhs.mQ ) );
+	const LLSimdScalar limit( tolerance );
+	return delta.getAbs() <= limit;
+}
+
+inline void LLSimdScalar::setMax( const LLSimdScalar& a, const LLSimdScalar& b )
+{
+	mQ = _mm_max_ss( a.getQuad(), b.getQuad() ); // larger of the two low lanes
+}
+
+inline void LLSimdScalar::setMin( const LLSimdScalar& a, const LLSimdScalar& b )
+{
+	mQ = _mm_min_ss( a.getQuad(), b.getQuad() ); // smaller of the two low lanes
+}
+
+inline LLSimdScalar& LLSimdScalar::operator=(F32 rhs) 
+{ 
+	mQ = _mm_set_ss(rhs); // same construction as the F32 constructor: rhs goes into the low lane
+	return *this; 
+}
+
+inline LLSimdScalar& LLSimdScalar::operator+=(const LLSimdScalar& rhs)
+{
+	mQ = _mm_add_ss( mQ, rhs.mQ ); // scalar add: only the low lane is combined
+	return *this;
+}
+
+inline LLSimdScalar& LLSimdScalar::operator-=(const LLSimdScalar& rhs)
+{
+	mQ = _mm_sub_ss( mQ, rhs.mQ ); // scalar subtract: only the low lane is combined
+	return *this;
+}
+
+inline LLSimdScalar& LLSimdScalar::operator*=(const LLSimdScalar& rhs)
+{
+	mQ = _mm_mul_ss( mQ, rhs.mQ ); // scalar multiply: only the low lane is combined
+	return *this;
+}
+
+inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs)
+{
+	mQ = _mm_div_ss( mQ, rhs.mQ ); // scalar divide: only the low lane is combined
+	return *this;
+}
+
+inline LLSimdScalar LLSimdScalar::getAbs() const
+{
+	static const LL_ALIGN_16(U32 absMask[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
+	return LLSimdScalar(_mm_and_ps( mQ, *reinterpret_cast<const LLQuad*>(absMask))); // AND clears the sign bit of every lane
+}
+
+inline F32 LLSimdScalar::getF32() const
+{
+	F32 lowLane;	// receives the low lane of mQ
+	_mm_store_ss(&lowLane, mQ);
+	return lowLane;
+}
diff --git a/indra/llmath/lltreenode.h b/indra/llmath/lltreenode.h
index ee9836241a..e6d2521b2a 100644
--- a/indra/llmath/lltreenode.h
+++ b/indra/llmath/lltreenode.h
@@ -34,6 +34,9 @@
 
 #include "stdtypes.h"
 #include "xform.h"
+#include "llpointer.h"
+#include "llrefcount.h"
+
 #include <vector>
 
 template <class T> class LLTreeNode;
diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp
new file mode 100644
index 0000000000..b62c17302f
--- /dev/null
+++ b/indra/llmath/llvector4a.cpp
@@ -0,0 +1,228 @@
+/** 
+ * @file llvector4a.cpp
+ * @brief SIMD vector implementation
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include "llmath.h"
+#include "llquantize.h"
+
+extern const LLQuad F_ZERO_4A		= { 0, 0, 0, 0 }; // all four lanes zero; also referenced by LLSimdScalar::getZero()
+extern const LLQuad F_APPROXIMATELY_ZERO_4A = { 
+	F_APPROXIMATELY_ZERO,
+	F_APPROXIMATELY_ZERO,
+	F_APPROXIMATELY_ZERO,
+	F_APPROXIMATELY_ZERO
+};
+
+extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A ); // returned by LLVector4a::getZero()
+extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A ); // returned by LLVector4a::getEpsilon()
+
+/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes)
+{ // Copy 'bytes' bytes from src to dst in 16-byte quads; buffers must be 16-byte aligned and must not overlap.
+	assert(src != NULL);
+	assert(dst != NULL);
+	assert(bytes > 0);
+	assert((bytes % 16) == 0); // every copy below moves a whole 16-byte quad; a partial tail would run past 'end'
+	assert((reinterpret_cast<size_t>(dst) % 16) == 0 && (reinterpret_cast<size_t>(src) % 16) == 0); // copy4a uses aligned load/store
+	F32* end = dst + (bytes / sizeof(F32) );
+
+	if (bytes > 64)
+	{
+		F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
+		
+		// last address from which a full 64-byte (16 float) block still fits before 'end'
+		F32* end_64 = end-16;
+		
+		_mm_prefetch((char*)begin_64, _MM_HINT_NTA);
+		_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
+		_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
+		_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
+		
+		// single quads until dst reaches begin_64
+		while (dst < begin_64)
+		{
+			copy4a(dst, src);
+			dst += 4;
+			src += 4;
+		}
+		// bulk phase: four quads (64 bytes) per iteration, prefetching 512 bytes ahead
+		while (dst < end_64)
+		{
+			_mm_prefetch((char*)src + 512, _MM_HINT_NTA);
+			_mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
+			copy4a(dst, src);
+			copy4a(dst+4, src+4);
+			copy4a(dst+8, src+8);
+			copy4a(dst+12, src+12);
+			
+			dst += 16;
+			src += 16;
+		}
+	}
+	// remaining quads (and the whole copy when bytes <= 64)
+	while (dst < end)
+	{
+		copy4a(dst, src);
+		dst += 4;
+		src += 4;
+	}
+}
+
+void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec )
+{
+	// Forms x*col0 + y*col1 + z*col2 from the three columns of 'rot'.
+	const LLVector4a col0 = rot.getColumn(0);
+	const LLVector4a col1 = rot.getColumn(1);
+	const LLVector4a col2 = rot.getColumn(2);
+	const F32* const xyz = vec.getF32ptr();
+
+	// x term seeds the accumulator
+	LLVector4a accum = _mm_load_ss( xyz );
+	accum.splat<0>( accum );
+	accum.mul( col0 );
+
+	LLVector4a yTerm = _mm_load_ss( xyz + 1 );	// y in lane 0
+	yTerm.splat<0>( yTerm );
+	yTerm.mul( col1 );
+	accum.add( yTerm );
+
+	LLVector4a zTerm = _mm_load_ss( xyz + 2 );	// z in lane 0
+	zTerm.splat<0>( zTerm );
+	zTerm.mul( col2 );
+	accum.add( zTerm );
+
+	// *this is written only after every read of 'vec'
+	*this = accum;
+}
+
+void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec )
+{
+	// t = 2 * cross(q, vec); result = vec + real * t + cross(q, t), where real is element 3 of q
+	const LLVector4a& q = quat.getVector4a();
+	LLVector4a twiceCross;
+	twiceCross.setCross3( q, vec );
+	twiceCross.add( twiceCross );
+	const LLVector4a realSplat( q.getScalarAt<3>() );	// element 3 replicated to all lanes
+	LLVector4a scaled;
+	scaled.setMul( twiceCross, realSplat );
+	mQ = vec;
+	add( scaled );
+	LLVector4a crossed; crossed.setCross3( q, twiceCross );
+	add( crossed );
+}
+
+void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high )
+{ // Snap each component of this vector to a step of the [low, high] range scaled by F_U8MAX_4A.
+	LLVector4a val(mQ);	// working copy; *this is only written at the very end
+	LLVector4a delta; delta.setSub( high, low );	// per-component width of the range
+
+	{
+		val.clamp(low, high);
+		val.sub(low);	// val is now an offset from 'low'
+		// NOTE(review): a zero component in delta makes the reciprocal below infinite -- confirm callers pass high > low
+		// 8-bit quantization means we can do with just 12 bits of reciprocal accuracy
+		const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ);
+// 		{
+// 			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
+// 			LLVector4a two; two.load4a( F_TWO_4A );
+// 
+// 			// Here we use _mm_rcp_ps plus one round of newton-raphson
+// 			// We wish to find 'x' such that x = 1/delta
+// 			// As a first approximation, we take x0 = _mm_rcp_ps(delta)
+// 			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
+// 			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
+// 			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
+// 			oneOverDelta.setMul( delta, recipApprox );
+// 			oneOverDelta.setSub( two, oneOverDelta );
+// 			oneOverDelta.mul( recipApprox );
+// 		}
+
+		val.mul(oneOverDelta);	// offset as a fraction of the range
+		val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A));	// scale to step units
+	}
+
+	val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));	// float -> int32 -> float drops the fractional part
+
+	{
+		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));	// presumably 1 / F_U8MAX -- TODO confirm in llquantize.h
+		val.mul(delta);
+		val.add(low);	// back in the caller's [low, high] range
+	}
+
+	{
+		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
+		LLVector4a absVal; absVal.setAbs( val );
+		setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );	// components with |val| < maxError become zero
+	}	
+}
+
+void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
+{ // Snap each component of this vector to a step of the [low, high] range scaled by F_U16MAX_4A.
+	LLVector4a val(mQ);	// working copy; *this is only written at the very end
+	LLVector4a delta; delta.setSub( high, low );	// per-component width of the range
+
+	{
+		val.clamp(low, high);
+		val.sub(low);	// val is now an offset from 'low'
+		// NOTE(review): a zero component in delta makes the reciprocal below infinite -- confirm callers pass high > low
+		// 16-bit quantization means we need a round of Newton-Raphson
+		LLVector4a oneOverDelta;
+		{
+			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
+			LLVector4a two; two.load4a( F_TWO_4A );
+
+			// Here we use _mm_rcp_ps plus one round of newton-raphson
+			// We wish to find 'x' such that x = 1/delta
+			// As a first approximation, we take x0 = _mm_rcp_ps(delta)
+			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
+			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
+			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
+			oneOverDelta.setMul( delta, recipApprox );	// a * x0
+			oneOverDelta.setSub( two, oneOverDelta );	// 2 - a * x0
+			oneOverDelta.mul( recipApprox );	// x0 * ( 2 - a * x0 )
+		}
+
+		val.mul(oneOverDelta);	// offset as a fraction of the range
+		val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A));	// scale to step units
+	}
+
+	val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));	// float -> int32 -> float drops the fractional part
+
+	{
+		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));	// presumably 1 / F_U16MAX -- TODO confirm in llquantize.h
+		val.mul(delta);
+		val.add(low);	// back in the caller's [low, high] range
+	}
+
+	{
+		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
+		LLVector4a absVal; absVal.setAbs( val );
+		setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );	// components with |val| < maxError become zero
+	}	
+}
diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h
new file mode 100644
index 0000000000..76a3e999ce
--- /dev/null
+++ b/indra/llmath/llvector4a.h
@@ -0,0 +1,331 @@
+/** 
+ * @file llvector4a.h
+ * @brief LLVector4a class header file - memory aligned and vectorized 4 component vector
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef	LL_LLVECTOR4A_H
+#define	LL_LLVECTOR4A_H
+
+
+class LLRotation;
+
+#include <assert.h>
+#include "llpreprocessor.h"
+
+///////////////////////////////////
+// FIRST TIME USERS PLEASE READ
+//////////////////////////////////
+// This is just the beginning of LLVector4a. There are many more useful functions
+// yet to be implemented. For example, setNeg to negate a vector, rotate() to apply
+// a matrix rotation, various functions to manipulate only the X, Y, and Z elements
+// and many others (including a whole variety of accessors). So if you don't see a 
+// function here that you need, please contact Falcon or someone else with SSE 
+// experience (Richard, I think, has some and davep has a little as of the time 
+// of this writing, July 08, 2010) about getting it implemented before you resort to
+// LLVector3/LLVector4. 
+/////////////////////////////////
+
+class LLVector4a
+{
+public:
+
+	///////////////////////////////////
+	// STATIC METHODS
+	///////////////////////////////////
+	
+	// Call initClass() at startup to avoid 15,000+ cycle penalties from denormalized numbers
+	static void initClass()
+	{
+		_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+	}
+
+	// Return a vector of all zeros
+	static inline const LLVector4a& getZero()
+	{
+		extern const LLVector4a LL_V4A_ZERO;
+		return LL_V4A_ZERO;
+	}
+
+	// Return a vector of all epsilon, where epsilon is a small float suitable for approximate equality checks
+	static inline const LLVector4a& getEpsilon()
+	{
+		extern const LLVector4a LL_V4A_EPSILON;
+		return LL_V4A_EPSILON;
+	}
+
+	// Copy 16 bytes from src to dst. Source and destination must be 16-byte aligned
+	static inline void copy4a(F32* dst, const F32* src)
+	{
+		_mm_store_ps(dst, _mm_load_ps(src));
+	}
+
+	// Copy 'bytes' bytes from src to dst in 16-byte blocks. Source and destination must be 16-byte aligned and must not overlap.
+	static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes);
+
+	////////////////////////////////////
+	// CONSTRUCTORS 
+	////////////////////////////////////
+	
+	LLVector4a()
+	{ //DO NOT INITIALIZE -- The overhead is completely unnecessary
+	}
+	
+	LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
+	{
+		set(x,y,z,w);
+	}
+	
+	LLVector4a(F32 x)
+	{
+		splat(x);
+	}
+	
+	LLVector4a(const LLSimdScalar& x)
+	{
+		splat(x);
+	}
+
+	LLVector4a(LLQuad q)
+	{
+		mQ = q;
+	}
+
+	////////////////////////////////////
+	// LOAD/STORE
+	////////////////////////////////////
+	
+	// Load from 16-byte aligned src array (preferred method of loading)
+	inline void load4a(const F32* src);
+	
+	// Load from unaligned src array (NB: Significantly slower than load4a)
+	inline void loadua(const F32* src);
+	
+	// Load only three floats beginning at address 'src'. Slowest method.
+	inline void load3(const F32* src);
+	
+	// Store to a 16-byte aligned memory address
+	inline void store4a(F32* dst) const;
+	
+	////////////////////////////////////
+	// BASIC GET/SET 
+	////////////////////////////////////
+	
+	// Return "this" as an F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+	inline F32* getF32ptr();
+	
+	// Return "this" as a const F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+	inline const F32* const getF32ptr() const;
+	
+	// Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates
+	// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
+	inline F32 operator[](const S32 idx) const;
+
+	// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
+	inline LLSimdScalar getScalarAt(const S32 idx) const;
+
+	// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
+	template <int N> LL_FORCE_INLINE LLSimdScalar getScalarAt() const;
+	template <> LL_FORCE_INLINE LLSimdScalar getScalarAt<0>() const; // NOTE(review): explicit specialization at class scope is not accepted by every compiler -- confirm non-MSVC builds
+
+	// Set to an x, y, z and optional w provided
+	inline void set(F32 x, F32 y, F32 z, F32 w = 0.f);
+	
+	// Set to all zeros. This is preferred to using ::getZero()
+	inline void clear();
+	
+	// Set all elements to 'x'
+	inline void splat(const F32 x);
+
+	// Set all elements to 'x'
+	inline void splat(const LLSimdScalar& x);
+	
+	// Set all 4 elements to element N of src, with N known at compile time
+	template <int N> void splat(const LLVector4a& src);
+	
+	// Set all 4 elements to element i of v, with i NOT known at compile time
+	inline void splat(const LLVector4a& v, U32 i);
+	
+	// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask
+	inline void setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse );
+	
+	////////////////////////////////////
+	// ALGEBRAIC
+	////////////////////////////////////
+	
+	// Set this to the element-wise (a + b)
+	inline void setAdd(const LLVector4a& a, const LLVector4a& b);
+	
+	// Set this to element-wise (a - b)
+	inline void setSub(const LLVector4a& a, const LLVector4a& b);
+	
+	// Set this to element-wise multiply (a * b)
+	inline void setMul(const LLVector4a& a, const LLVector4a& b);
+	
+	// Set this to element-wise quotient (a / b)
+	inline void setDiv(const LLVector4a& a, const LLVector4a& b);
+	
+	// Set this to the element-wise absolute value of src
+	inline void setAbs(const LLVector4a& src);
+	
+	// Add to each component in this vector the corresponding component in rhs
+	inline void add(const LLVector4a& rhs);
+	
+	// Subtract from each component in this vector the corresponding component in rhs
+	inline void sub(const LLVector4a& rhs);
+	
+	// Multiply each component in this vector by the corresponding component in rhs
+	inline void mul(const LLVector4a& rhs);
+	
+	// Divide each component in this vector by the corresponding component in rhs
+	inline void div(const LLVector4a& rhs);
+	
+	// Multiply this vector by x in a scalar fashion
+	inline void mul(const F32 x);
+
+	// Set this to (a x b) (geometric cross-product)
+	inline void setCross3(const LLVector4a& a, const LLVector4a& b);
+	
+	// Set all elements to the dot product of the x, y, and z elements in a and b
+	inline void setAllDot3(const LLVector4a& a, const LLVector4a& b);
+
+	// Set all elements to the dot product of the x, y, z, and w elements in a and b
+	inline void setAllDot4(const LLVector4a& a, const LLVector4a& b);
+
+	// Return the 3D dot product of this vector and b
+	inline LLSimdScalar dot3(const LLVector4a& b) const;
+
+	// Return the 4D dot product of this vector and b
+	inline LLSimdScalar dot4(const LLVector4a& b) const;
+
+	// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
+	// Note that this does not consider zero length vectors!
+	inline void normalize3();
+
+	// Same as normalize3() but with respect to all 4 components
+	inline void normalize4();
+
+	// Same as normalize3(), but returns length as a SIMD scalar
+	inline LLSimdScalar normalize3withLength();
+
+	// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
+	// Note that this does not consider zero length vectors!
+	inline void normalize3fast();
+
+	// Return true if this vector is normalized with respect to x,y,z up to tolerance
+	inline LLBool32 isNormalized3( F32 tolerance = 1e-3 ) const;
+
+	// Return true if this vector is normalized with respect to all components up to tolerance
+	inline LLBool32 isNormalized4( F32 tolerance = 1e-3 ) const;
+
+	// Set all elements to the length of vector 'v' 
+	inline void setAllLength3( const LLVector4a& v );
+
+	// Get this vector's length
+	inline LLSimdScalar getLength3() const;
+	
+	// Set the components of this vector to the minimum of the corresponding components of lhs and rhs
+	inline void setMin(const LLVector4a& lhs, const LLVector4a& rhs);
+	
+	// Set the components of this vector to the maximum of the corresponding components of lhs and rhs
+	inline void setMax(const LLVector4a& lhs, const LLVector4a& rhs);
+	
+	// Clamps this vector to be within the component-wise range low to high (inclusive)
+	inline void clamp( const LLVector4a& low, const LLVector4a& high );
+
+	// Set this to  (c * lhs) + rhs * ( 1 - c)  (NOTE(review): confirm the operand order against the definition in llvector4a.inl)
+	inline void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c);
+	
+	// Return true (nonzero) if x, y, z (and w for Finite4) are all finite floats
+	inline LLBool32 isFinite3() const;	
+	inline LLBool32 isFinite4() const;
+
+	// Set this vector to 'vec' rotated by the LLRotation or LLQuaternion2 provided
+	void setRotated( const LLRotation& rot, const LLVector4a& vec );
+	void setRotated( const class LLQuaternion2& quat, const LLVector4a& vec );
+
+	// Set this vector to 'vec' rotated by the INVERSE of the LLRotation or LLQuaternion2 provided
+	inline void setRotatedInv( const LLRotation& rot, const LLVector4a& vec );
+	inline void setRotatedInv( const class LLQuaternion2& quat, const LLVector4a& vec );
+
+	// Quantize this vector to 8 or 16 bit precision
+	void quantize8( const LLVector4a& low, const LLVector4a& high );
+	void quantize16( const LLVector4a& low, const LLVector4a& high );
+
+	////////////////////////////////////
+	// LOGICAL
+	////////////////////////////////////	
+	// The functions in this section will compare the elements in this vector
+	// to those in rhs and return an LLVector4Logical with all bits set in elements
+	// where the comparison was true and all bits unset in elements where the comparison
+	// was false. See llvector4logical.h
+	////////////////////////////////////
+	// WARNING: Other than equals3 and equals4, these functions do NOT account
+	// for floating point tolerance. You should include the appropriate tolerance
+	// in the inputs.
+	////////////////////////////////////
+	
+	inline LLVector4Logical greaterThan(const LLVector4a& rhs) const;
+
+	inline LLVector4Logical lessThan(const LLVector4a& rhs) const;
+	
+	inline LLVector4Logical greaterEqual(const LLVector4a& rhs) const;
+
+	inline LLVector4Logical lessEqual(const LLVector4a& rhs) const;
+	
+	inline LLVector4Logical equal(const LLVector4a& rhs) const;
+
+	// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
+	inline bool equals4(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
+
+	inline bool equals3(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
+
+	////////////////////////////////////
+	// OPERATORS
+	////////////////////////////////////	
+	
+	// Do NOT add additional operators without consulting someone with SSE experience
+	inline const LLVector4a& operator= ( const LLVector4a& rhs );
+	
+	inline const LLVector4a& operator= ( const LLQuad& rhs );
+
+	inline operator LLQuad() const;	
+
+private:
+	LLQuad mQ;
+};
+
+inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)
+{
+	min.setMin(min, p);	// component-wise: min = minimum(min, p)
+	max.setMax(max, p);	// component-wise: max = maximum(max, p)
+}
+
+#endif
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
new file mode 100644
index 0000000000..e52b550883
--- /dev/null
+++ b/indra/llmath/llvector4a.inl
@@ -0,0 +1,599 @@
+/** 
+ * @file llvector4a.inl
+ * @brief LLVector4a inline function implementations
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+////////////////////////////////////
+// LOAD/STORE
+////////////////////////////////////
+
+// Load four floats from a 16-byte aligned src array (preferred method of loading)
+inline void LLVector4a::load4a(const F32* src)
+{
+	mQ = _mm_load_ps(src);	// aligned load: src must be 16-byte aligned
+}
+
+// Load four floats from an unaligned src array (NB: Significantly slower than load4a)
+inline void LLVector4a::loadua(const F32* src)
+{
+	mQ = _mm_loadu_ps(src);	// unaligned load: no alignment requirement on src
+}
+
+// Load only three floats (src[0..2]) into X, Y, Z and set W to 0.f. Slowest method.
+inline void LLVector4a::load3(const F32* src)
+{
+	// mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }  (src[3] is never read)
+	// NB: This differs from the convention of { Z, Y, X, W }
+	mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
+}	
+
+// Store all four floats to a 16-byte aligned memory address
+inline void LLVector4a::store4a(F32* dst) const
+{
+	_mm_store_ps(dst, mQ);	// aligned store: dst must be 16-byte aligned
+}
+
+////////////////////////////////////
+// BASIC GET/SET 
+////////////////////////////////////
+
+// Return "this" as an F32 pointer to element 0. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+inline F32* LLVector4a::getF32ptr()
+{
+	return reinterpret_cast<F32*>(&mQ);
+}
+
+// Return "this" as a const F32 pointer to element 0. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+inline const F32* const LLVector4a::getF32ptr() const
+{
+	return reinterpret_cast<const F32*>(&mQ);
+}
+
+// Read-only access to a single float in this vector. Do not use in proximity to any function call that manipulates
+// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
+inline F32 LLVector4a::operator[](const S32 idx) const
+{
+	return reinterpret_cast<const F32*>(&mQ)[idx];
+}	
+
+// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
+inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const
+{
+	// Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
+	switch (idx)
+	{
+		case 0:
+			return mQ;	// returned unshuffled: element 0 is already in the lowest lane
+		case 1:
+			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));	// element 1 in every lane
+		case 2:
+			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));	// element 2 in every lane
+		case 3:
+		default:	// any idx outside 0..2 is treated as 3
+			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));	// element 3 in every lane
+	}
+}
+
+// Compile-time-index variant of getScalarAt: returns a quad holding element N of this vector in every lane.
+template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const
+{
+	return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
+}
+
+template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const
+{
+	return mQ;	// N == 0: no shuffle needed, element 0 is already in the lowest lane
+}
+
+// Set to an x, y, z and optional w provided
+inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
+{
+	mQ = _mm_set_ps(w, z, y, x);	// arguments run from the highest element to the lowest, so x lands in element 0
+}
+
+// Set to all zeros
+inline void LLVector4a::clear()
+{
+	mQ = LLVector4a::getZero().mQ;	// copy the quad of the vector returned by getZero()
+}
+
+inline void LLVector4a::splat(const F32 x)
+{
+	mQ = _mm_set1_ps(x);	// set all 4 elements to x
+}
+
+inline void LLVector4a::splat(const LLSimdScalar& x)
+{
+	mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) );	// set all 4 elements to element 0 of x's quad
+}
+
+// Set all 4 elements to element N of src, with N known at compile time
+template <int N> void LLVector4a::splat(const LLVector4a& src)
+{
+	mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) );	// N is used directly as the lane selector for all four lanes
+}
+
+// Set all 4 elements to element i of v, with i NOT known at compile time
+inline void LLVector4a::splat(const LLVector4a& v, U32 i)
+{
+	switch (i)
+	{
+		case 0:
+			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
+			break;
+		case 1:
+			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
+			break;
+		case 2:
+			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
+			break;
+		case 3:
+			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
+			break;
+	}	// no default case: for i > 3 this vector is left unchanged
+}
+
+// Bitwise select: each result bit comes from sourceIfTrue where the mask bit is 1 and from sourceIfFalse where it is 0
+inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse )
+{
+	// ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse )
+	// E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b
+	// (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b, 
+	// as expected (upper bits 01 from sourceIfTrue, lower bits 10 from sourceIfFalse)
+	// Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
+	mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) );
+}
+
+////////////////////////////////////
+// ALGEBRAIC
+////////////////////////////////////
+
+// Set this to the element-wise sum (a + b), over all four elements
+inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b)
+{
+	mQ = _mm_add_ps(a.mQ, b.mQ);
+}
+
+// Set this to the element-wise difference (a - b), over all four elements
+inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b)
+{
+	mQ = _mm_sub_ps(a.mQ, b.mQ);
+}
+
+// Set this to the element-wise product (a * b), over all four elements
+inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b)
+{
+	mQ = _mm_mul_ps(a.mQ, b.mQ);
+}
+
+// Set this to the element-wise quotient (a / b), over all four elements. Elements of b are not checked for zero.
+inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b)
+{
+	mQ = _mm_div_ps( a.mQ, b.mQ );
+}
+
+// Set this to the element-wise absolute value of src
+inline void LLVector4a::setAbs(const LLVector4a& src)
+{
+	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };	// every bit except the sign bit
+	mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));	// clear the sign bit of each element
+}
+
+// Add to each component in this vector the corresponding component in rhs (all four elements)
+inline void LLVector4a::add(const LLVector4a& rhs)
+{
+	mQ = _mm_add_ps(mQ, rhs.mQ);	
+}
+
+// Subtract from each component in this vector the corresponding component in rhs (all four elements)
+inline void LLVector4a::sub(const LLVector4a& rhs)
+{
+	mQ = _mm_sub_ps(mQ, rhs.mQ);
+}
+
+// Multiply each component in this vector by the corresponding component in rhs (all four elements)
+inline void LLVector4a::mul(const LLVector4a& rhs)
+{
+	mQ = _mm_mul_ps(mQ, rhs.mQ);	
+}
+
+// Divide each component in this vector by the corresponding component in rhs (all four elements; rhs is not checked for zeros)
+inline void LLVector4a::div(const LLVector4a& rhs)
+{
+	// TODO: Check accuracy, maybe add divFast
+	mQ = _mm_div_ps(mQ, rhs.mQ);
+}
+
+// Scale this vector: multiply all four elements by the scalar x
+inline void LLVector4a::mul(const F32 x) 
+{
+	// Copy x into every element of a temporary, then do one packed multiply
+	LLVector4a scale;
+	scale.splat(x);
+	mQ = _mm_mul_ps(mQ, scale.mQ);
+}
+
+// Set this to (a x b) (geometric cross-product). Safe to call when 'this' aliases a or b.
+inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
+{
+	// Vectors are stored in memory in w, z, y, x order from high to low. All four shuffles read a/b before mQ is written.
+	// aYZX = { a[W], a[X], a[Z], a[Y] }
+	const LLQuad aYZX = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+	// bZXY = { b[W], b[Y], b[X], b[Z] }
+	const LLQuad bZXY = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+	// aZXY = { a[W], a[Y], a[X], a[Z] }
+	const LLQuad aZXY = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+	// bYZX = { b[W], b[X], b[Z], b[Y] }
+	const LLQuad bYZX = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+	// firstTerms = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
+	const LLQuad firstTerms = _mm_mul_ps( aYZX, bZXY );
+	// mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
+	mQ = _mm_sub_ps( firstTerms, _mm_mul_ps( aZXY, bYZX ));
+}
+
+/* This function works, but may be slightly slower than the one below on older machines
+ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
+ {
+ // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+ const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+ // wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
+ const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
+ // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+ const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
+ // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
+ const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
+ // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
+ const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
+ // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+ mQ = _mm_add_ps(zSplat, xPlusYSplat);
+ }*/
+
+// Set all elements to the dot product of the x, y, and z elements in a and b
+inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
+{
+	// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+	const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+	// wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }  (the two lowest elements of ab swapped)
+	const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
+	// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+	const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
+	// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
+	const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
+	// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
+	const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
+	// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+	mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
+}
+
+// Set all elements to the dot product of the x, y, z, and w elements in a and b
+inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
+{
+	// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+	const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+	// zwxy = { a[Z]*b[Z], a[W]*b[W], a[X]*b[X], a[Y]*b[Y] }  (neighbouring elements of ab swapped pairwise)
+	const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
+	// zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+	const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
+	// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
+	const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
+	// zPlusWSplat = { a[W]*b[W] + a[Z]*b[Z], a[Z]*b[Z] + a[W]*b[W], a[W]*b[W] + a[Z]*b[Z], a[Z]*b[Z] + a[W]*b[W] }
+	const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
+	// mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+	mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
+}
+
+// Return the 3D dot product of this vector and b (the sum ends up in element 0 of the returned quad)
+inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
+{
+	const LLQuad products = _mm_mul_ps( mQ, b.mQ );
+	const LLQuad yyyy = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(products), _MM_SHUFFLE(1, 1, 1, 1) ) );
+	const LLQuad zzzz = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(products), _MM_SHUFFLE(2, 2, 2, 2) ) );
+	const LLQuad xy = _mm_add_ps( products, yyyy );
+	return _mm_add_ps( xy, zzzz );	
+}
+
+// Return the 4D dot product of this vector and b (the sum ends up in element 0 of the returned quad)
+inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
+{
+	// ab = { w, z, y, x }  (element-wise products, highest element first)
+ 	const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
+ 	// upperProdsInLowerElems = { w, z, w, z }
+	const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
+	// sumOfPairs = { 2w, 2z, y+w, x+z }
+ 	const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
+	// shuffled = { y+w, y+w, y+w, y+w }
+	const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
+	return _mm_add_ss( sumOfPairs, shuffled );	// element 0 = (x+z) + (y+w)
+}
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize3()
+{
+	// lenSqrd = a dot a
+	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+	// rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+	// Now we do one round of Newton-Raphson approximation to get full accuracy
+	// According to the Newton-Raphson method, given a first guess 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
+	// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+	// = 0.5 * w * (3 - a*w^2)
+	// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula,
+	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+	mQ = _mm_mul_ps( mQ, nrApprox );
+}
+
+// Normalize this vector with respect to all components. Accurate to 22 bits of precision.
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize4()
+{
+	// lenSqrd = a dot a (4D dot product, so W contributes to the length)
+	LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
+	// rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+	// Now we do one round of Newton-Raphson approximation to get full accuracy
+	// According to the Newton-Raphson method, given a first guess 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
+	// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+	// = 0.5 * w * (3 - a*w^2)
+	// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula,
+	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+	mQ = _mm_mul_ps( mQ, nrApprox );
+}
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
+// Returns the length this vector had before normalization. Note that this does not consider zero length vectors!
+inline LLSimdScalar LLVector4a::normalize3withLength()
+{
+	// lenSqrd = a dot a
+	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+	// rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+	// Now we do one round of Newton-Raphson approximation to get full accuracy
+	// According to the Newton-Raphson method, given a first guess 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
+	// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+	// = 0.5 * w * (3 - a*w^2)
+	// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula,
+	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+	mQ = _mm_mul_ps( mQ, nrApprox );
+	return _mm_sqrt_ss(lenSqrd);	// exact (not approximated) square root of element 0 of lenSqrd
+}
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize3fast()
+{
+	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );	// squared 3D length in every element
+	const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);	// ~1/len in every element, used as-is (no Newton-Raphson refinement)
+	mQ = _mm_mul_ps( mQ, approxRsqrt );
+}
+
+// Return true if this vector is normalized with respect to x,y,z up to tolerance (the test is |len^2 - 1| <= tolerance^2)
+inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
+{
+	static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };	// bit pattern of 1.0f in each element
+	LLSimdScalar tol = _mm_load_ss( &tolerance );
+	tol = _mm_mul_ss( tol, tol );	// the tolerance is squared before the comparison
+	LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
+	lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );	// lenSquared - 1
+	lenSquared.setAbs(lenSquared);	// |lenSquared - 1|
+	return _mm_comile_ss( lenSquared, tol );	// compares element 0 only
+}
+
+// Return true if this vector is normalized with respect to all components up to tolerance (the test is |len^2 - 1| <= tolerance^2)
+inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
+{
+	static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };	// bit pattern of 1.0f in each element
+	LLSimdScalar tol = _mm_load_ss( &tolerance );
+	tol = _mm_mul_ss( tol, tol );	// the tolerance is squared before the comparison
+	LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
+	lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );	// lenSquared - 1
+	lenSquared.setAbs(lenSquared);	// |lenSquared - 1|
+	return _mm_comile_ss( lenSquared, tol );	// compares element 0 only
+}
+
+// Set all elements to the length of vector 'v' (x, y, and z components only)
+inline void LLVector4a::setAllLength3( const LLVector4a& v )
+{
+	LLVector4a lenSqrd;
+	lenSqrd.setAllDot3(v, v);	// squared 3D length of v in every element
+	
+	mQ = _mm_sqrt_ps(lenSqrd.mQ);
+}
+
+// Get this vector's length, measured over the x, y, and z components only
+inline LLSimdScalar LLVector4a::getLength3() const
+{
+	return _mm_sqrt_ss( dot3( *this ) );
+}
+
+// Set each of the four components of this vector to the minimum of the corresponding components of lhs and rhs
+inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs)
+{
+	mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
+}
+
+// Set each of the four components of this vector to the maximum of the corresponding components of lhs and rhs
+inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
+{
+	mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
+}
+
+// Set this to (c * lhs) + ((1 - c) * rhs). NB: c is the weight applied to lhs, and (1 - c) the weight applied to rhs
+inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
+{
+	// Scale a copy of each input by its weight...
+	LLVector4a lhsWeighted = lhs;
+	lhsWeighted.mul(c);
+	LLVector4a rhsWeighted = rhs;
+	rhsWeighted.mul(1.f-c);
+	// ...then sum the two weighted copies
+	setAdd(lhsWeighted, rhsWeighted);
+}
+
+inline LLBool32 LLVector4a::isFinite3() const
+{
+	static LL_ALIGN_16(const U32 exponentBits[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };	// float exponent field; all ones only for Inf/NaN
+	const __m128i exponentMask = *reinterpret_cast<const __m128i*> (exponentBits);
+	const __m128i exponents = _mm_and_si128( _mm_castps_si128(mQ), exponentMask );
+	const LLVector4Logical isNanOrInf = _mm_castsi128_ps(_mm_cmpeq_epi32( exponents, exponentMask ));
+	return !isNanOrInf.areAnySet( LLVector4Logical::MASK_XYZ );	// W is ignored
+}
+	
+inline LLBool32 LLVector4a::isFinite4() const
+{
+	static LL_ALIGN_16(const U32 exponentBits[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };	// float exponent field; all ones only for Inf/NaN
+	const __m128i exponentMask = *reinterpret_cast<const __m128i*> (exponentBits);
+	const __m128i exponents = _mm_and_si128( _mm_castps_si128(mQ), exponentMask );
+	const LLVector4Logical isNanOrInf = _mm_castsi128_ps(_mm_cmpeq_epi32( exponents, exponentMask ));
+	return !isNanOrInf.areAnySet( LLVector4Logical::MASK_XYZW );	// all four elements are checked
+}
+
+inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec )
+{
+	LLRotation inv; inv.setTranspose( rot );	// the transpose is used as the inverse (assumes rot is a pure rotation -- TODO confirm)
+	setRotated( inv, vec );
+}
+
+inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec )
+{
+	LLQuaternion2 invRot; invRot.setConjugate( quat );	// the conjugate is used as the inverse (assumes quat is unit length -- TODO confirm)
+	setRotated(invRot, vec);
+}
+
+inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
+{
+	// Both masks are computed from the unmodified vector before either bound is applied
+	const LLVector4Logical aboveHigh = greaterThan( high );
+	const LLVector4Logical belowLow = lessThan( low );
+	setSelectWithMask( aboveHigh, high, *this );
+	setSelectWithMask( belowLow, low, *this );
+}
+
+
+////////////////////////////////////
+// LOGICAL
+////////////////////////////////////	
+// The functions in this section will compare the elements in this vector
+// to those in rhs and return an LLVector4Logical with all bits set in elements
+// where the comparison was true and all bits unset in elements where the comparison
+// was false. See llvector4logical.h
+////////////////////////////////////
+// WARNING: Other than equals3 and equals4, these functions do NOT account
+// for floating point tolerance. You should include the appropriate tolerance
+// in the inputs.
+////////////////////////////////////
+
+inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const
+{	
+	return _mm_cmpgt_ps(mQ, rhs.mQ);	// per element: this > rhs
+}
+
+inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const
+{
+	return _mm_cmplt_ps(mQ, rhs.mQ);	// per element: this < rhs
+}
+
+inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const
+{
+	return _mm_cmpge_ps(mQ, rhs.mQ);	// per element: this >= rhs
+}
+
+inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const
+{
+	return _mm_cmple_ps(mQ, rhs.mQ);	// per element: this <= rhs
+}
+
+inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const
+{
+	return _mm_cmpeq_ps(mQ, rhs.mQ);	// per element: this == rhs (exact comparison, no tolerance)
+}
+
+// Returns true if every one of the four components of this and rhs differs by strictly less than the absolute tolerance
+inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const
+{
+	LLVector4a absDiff; absDiff.setSub( *this, rhs );
+	absDiff.setAbs( absDiff );
+	const LLQuad tolerances = _mm_set1_ps( tolerance );
+	const LLQuad withinTol = _mm_cmplt_ps( absDiff, tolerances );
+	return (_mm_movemask_ps( withinTol ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
+}
+
+// Returns true if the x, y, and z components of this and rhs each differ by strictly less than the absolute tolerance
+inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
+{
+	LLVector4a absDiff; absDiff.setSub( *this, rhs );
+	absDiff.setAbs( absDiff );
+	const LLQuad tolerances = _mm_set1_ps( tolerance );
+	const LLQuad withinTol = _mm_cmplt_ps( absDiff, tolerances ); 
+	return (_mm_movemask_ps( withinTol ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
+}
+
+////////////////////////////////////
+// OPERATORS
+////////////////////////////////////	
+
+// Do NOT add additional operators without consulting someone with SSE experience
+inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
+{
+	mQ = rhs.mQ;	// copy all four elements
+	return *this;	// NB: returned as a const reference
+}
+
+inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
+{
+	mQ = rhs;	// take over the raw quad as-is
+	return *this;	// NB: returned as a const reference
+}
+
+inline LLVector4a::operator LLQuad() const
+{
+	return mQ;	// implicit conversion to the raw quad, by value
+}
diff --git a/indra/llmath/llvector4logical.h b/indra/llmath/llvector4logical.h
new file mode 100644
index 0000000000..1c7ee1d79f
--- /dev/null
+++ b/indra/llmath/llvector4logical.h
@@ -0,0 +1,130 @@
+/** 
+ * @file llvector4logical.h
+ * @brief LLVector4Logical class header file - Companion class to LLVector4a for logical and bit-twiddling operations
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ * 
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ *
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef	LL_VECTOR4LOGICAL_H
+#define	LL_VECTOR4LOGICAL_H
+
+
+////////////////////////////
+// LLVector4Logical
+////////////////////////////
+// This class is incomplete. If you need additional functionality,
+// for example setting/unsetting particular elements or performing
+// other boolean operations, feel free to implement. If you need
+// assistance in determining the most optimal implementation,
+// contact someone with SSE experience (Falcon, Richard, Davep, e.g.)
+////////////////////////////
+
+static LL_ALIGN_16(const U32 S_V4LOGICAL_MASK_TABLE[4*4]) =
+{
+	0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,	// row 0: all bits of element 0
+	0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000,	// row 1: all bits of element 1
+	0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000,	// row 2: all bits of element 2
+	0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF	// row 3: all bits of element 3
+};
+
+class LLVector4Logical
+{
+public:
+	// Bit values match the per-element bits returned by getGatheredBits()
+	enum {
+		MASK_X = 1,
+		MASK_Y = 1 << 1,
+		MASK_Z = 1 << 2,
+		MASK_W = 1 << 3,
+		MASK_XYZ = MASK_X | MASK_Y | MASK_Z,
+		MASK_XYZW = MASK_XYZ | MASK_W
+	};
+	
+	// Empty default ctor: mQ is left uninitialized
+	LLVector4Logical() {}
+	// Wrap an existing quad. Intentionally implicit so the result of a comparison intrinsic can be returned directly.
+	LLVector4Logical( const LLQuad& quad )
+	{
+		mQ = quad;
+	}
+	
+	// Create and return a mask consisting of the highest order (sign) bit of each element; element 0 maps to bit 0
+	inline U32 getGatheredBits() const
+	{
+		return _mm_movemask_ps(mQ);
+	}
+	
+	// Invert this mask (bitwise NOT of all 128 bits)
+	inline LLVector4Logical& invert()
+	{
+		static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+		mQ = _mm_andnot_ps( mQ, *reinterpret_cast<const LLQuad*>(allOnes) );
+		return *this;
+	}
+	// Return true if every element selected by 'mask' (a combination of MASK_* values) has its gathered bit set
+	inline LLBool32 areAllSet( U32 mask ) const
+	{
+		return ( getGatheredBits() & mask) == mask;
+	}
+	// Return true if all four elements have their gathered bit set
+	inline LLBool32 areAllSet() const
+	{
+		return areAllSet( MASK_XYZW );
+	}
+	// Return non-zero (the matching MASK_* bits, not necessarily 1) if any element selected by 'mask' is set
+	inline LLBool32 areAnySet( U32 mask ) const
+	{
+		return getGatheredBits() & mask;
+	}
+	// Return non-zero if any of the four elements is set
+	inline LLBool32 areAnySet() const
+	{
+		return areAnySet( MASK_XYZW );
+	}
+	// Implicit conversion to the raw quad, by value
+	inline operator LLQuad() const
+	{
+		return mQ;
+	}
+	// Unset all bits of all four elements
+	inline void clear() 
+	{
+		mQ = _mm_setzero_ps();
+	}
+	// Set all bits of element N, leaving the other elements unchanged. N must be 0..3 (row N of S_V4LOGICAL_MASK_TABLE)
+	template<int N> void setElement()
+	{
+		mQ = _mm_or_ps( mQ, *reinterpret_cast<const LLQuad*>(S_V4LOGICAL_MASK_TABLE + 4*N) );
+	}
+	
+private:
+	
+	LLQuad mQ;
+};
+
+#endif //LL_VECTOR4LOGICAL_H
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index c563af592f..ab9f8c4c24 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -1,4 +1,5 @@
 /** 
+
  * @file llvolume.cpp
  *
  * $LicenseInfo:firstyear=2002&license=viewergpl$
@@ -30,6 +31,7 @@
  */
 
 #include "linden_common.h"
+#include "llmemory.h"
 #include "llmath.h"
 
 #include <set>
@@ -43,10 +45,15 @@
 #include "v4math.h"
 #include "m4math.h"
 #include "m3math.h"
+#include "llmatrix3a.h"
+#include "lloctree.h"
 #include "lldarray.h"
 #include "llvolume.h"
+#include "llvolumeoctree.h"
 #include "llstl.h"
 #include "llsdserialize.h"
+#include "llvector4a.h"
+#include "llmatrix4a.h"
 
 #define DEBUG_SILHOUETTE_BINORMALS 0
 #define DEBUG_SILHOUETTE_NORMALS 0 // TomY: Use this to display normals using the silhouette
@@ -104,127 +111,264 @@ BOOL check_same_clock_dir( const LLVector3& pt1, const LLVector3& pt2, const LLV
 
 BOOL LLLineSegmentBoxIntersect(const LLVector3& start, const LLVector3& end, const LLVector3& center, const LLVector3& size)
 {
-	float fAWdU[3];
-	LLVector3 dir;
-	LLVector3 diff;
+	return LLLineSegmentBoxIntersect(start.mV, end.mV, center.mV, size.mV);	// forward to the raw F32* overload, which does the actual test
+}
+
+BOOL LLLineSegmentBoxIntersect(const F32* start, const F32* end, const F32* center, const F32* size)	// each pointer is read at indices 0..2
+{
+	F32 fAWdU[3];	// per-axis absolute value of dir
+	F32 dir[3];	// half of the segment vector (end - start)
+	F32 diff[3];	// segment midpoint relative to the box center
 
 	for (U32 i = 0; i < 3; i++)
 	{
-		dir.mV[i] = 0.5f * (end.mV[i] - start.mV[i]);
-		diff.mV[i] = (0.5f * (end.mV[i] + start.mV[i])) - center.mV[i];
-		fAWdU[i] = fabsf(dir.mV[i]);
-		if(fabsf(diff.mV[i])>size.mV[i] + fAWdU[i]) return false;
+		dir[i] = 0.5f * (end[i] - start[i]);
+		diff[i] = (0.5f * (end[i] + start[i])) - center[i];
+		fAWdU[i] = fabsf(dir[i]);
+		if(fabsf(diff[i])>size[i] + fAWdU[i]) return false;	// separated along axis i; size[i] is used as the extent measured from center
 	}
 
 	float f;
-	f = dir.mV[1] * diff.mV[2] - dir.mV[2] * diff.mV[1];    if(fabsf(f)>size.mV[1]*fAWdU[2] + size.mV[2]*fAWdU[1])  return false;
-	f = dir.mV[2] * diff.mV[0] - dir.mV[0] * diff.mV[2];    if(fabsf(f)>size.mV[0]*fAWdU[2] + size.mV[2]*fAWdU[0])  return false;
-	f = dir.mV[0] * diff.mV[1] - dir.mV[1] * diff.mV[0];    if(fabsf(f)>size.mV[0]*fAWdU[1] + size.mV[1]*fAWdU[0])  return false;
+	f = dir[1] * diff[2] - dir[2] * diff[1];    if(fabsf(f)>size[1]*fAWdU[2] + size[2]*fAWdU[1])  return false;	// x component of dir x diff
+	f = dir[2] * diff[0] - dir[0] * diff[2];    if(fabsf(f)>size[0]*fAWdU[2] + size[2]*fAWdU[0])  return false;	// y component of dir x diff
+	f = dir[0] * diff[1] - dir[1] * diff[0];    if(fabsf(f)>size[0]*fAWdU[1] + size[1]*fAWdU[0])  return false;	// z component of dir x diff
 	
 	return true;
 }
 
 
+
 // intersect test between triangle vert0, vert1, vert2 and a ray from orig in direction dir.
 // returns TRUE if intersecting and returns barycentric coordinates in intersection_a, intersection_b,
 // and returns the intersection point along dir in intersection_t.
 
 // Moller-Trumbore algorithm
-BOOL LLTriangleRayIntersect(const LLVector3& vert0, const LLVector3& vert1, const LLVector3& vert2, const LLVector3& orig, const LLVector3& dir,
-							F32* intersection_a, F32* intersection_b, F32* intersection_t, BOOL two_sided)
+BOOL LLTriangleRayIntersect(const LLVector4a& vert0, const LLVector4a& vert1, const LLVector4a& vert2, const LLVector4a& orig, const LLVector4a& dir,
+							F32& intersection_a, F32& intersection_b, F32& intersection_t)	// outputs are written only when TRUE is returned
 {
-	F32 u, v, t;
 	
 	/* find vectors for two edges sharing vert0 */
-	LLVector3 edge1 = vert1 - vert0;
+	LLVector4a edge1;
+	edge1.setSub(vert1, vert0);
 	
-	LLVector3 edge2 = vert2 - vert0;;
+	LLVector4a edge2;
+	edge2.setSub(vert2, vert0);
 
 	/* begin calculating determinant - also used to calculate U parameter */
-	LLVector3 pvec = dir % edge2;
-	
-	/* if determinant is near zero, ray lies in plane of triangle */
-	F32 det = edge1 * pvec;
+	LLVector4a pvec;
+	pvec.setCross3(dir, edge2);
 
-	if (!two_sided)
+	/* if determinant is below epsilon (near zero or negative), the triangle is rejected */
+	LLVector4a det;
+	det.setAllDot3(edge1, pvec);
+	
+	if (det.greaterEqual(LLVector4a::getEpsilon()).getGatheredBits() & 0x7)	// 0x7 keeps only the x/y/z comparison bits
 	{
-		if (det < F_APPROXIMATELY_ZERO)
-		{
-			return FALSE;
-		}
-
 		/* calculate distance from vert0 to ray origin */
-		LLVector3 tvec = orig - vert0;
+		LLVector4a tvec;
+		tvec.setSub(orig, vert0);
 
 		/* calculate U parameter and test bounds */
-		u = tvec * pvec;	
+		LLVector4a u;
+		u.setAllDot3(tvec,pvec);
 
-		if (u < 0.f || u > det)
+		if ((u.greaterEqual(LLVector4a::getZero()).getGatheredBits() & 0x7) &&
+			(u.lessEqual(det).getGatheredBits() & 0x7))	// u is still unscaled here, so it is compared against det rather than 1
 		{
-			return FALSE;
+			/* prepare to test V parameter */
+			LLVector4a qvec;
+			qvec.setCross3(tvec, edge1);
+			
+			/* calculate V parameter and test bounds */
+			LLVector4a v;
+			v.setAllDot3(dir, qvec);
+
+			
+			// scalar form of the test below: if (!(v < 0.f || u + v > det))
+
+			LLVector4a sum_uv;
+			sum_uv.setAdd(u, v);
+
+			S32 v_gequal = v.greaterEqual(LLVector4a::getZero()).getGatheredBits() & 0x7;
+			S32 sum_lequal = sum_uv.lessEqual(det).getGatheredBits() & 0x7;
+
+			if (v_gequal  && sum_lequal)
+			{
+				/* calculate t, scale parameters, ray intersects triangle */
+				LLVector4a t;
+				t.setAllDot3(edge2,qvec);
+
+				t.div(det);
+				u.div(det);
+				v.div(det);
+				
+				intersection_a = u[0];	// element 0 of each result vector is reported
+				intersection_b = v[0];
+				intersection_t = t[0];
+				return TRUE;
+			}
 		}
-	
-		/* prepare to test V parameter */
-		LLVector3 qvec = tvec % edge1;
+	}
 		
-		/* calculate V parameter and test bounds */
-		v = dir * qvec;
-		if (v < 0.f || u + v > det)
-		{
-			return FALSE;
-		}
+	return FALSE;	// some bounds test failed; the output references were not touched
+} 
+
+// Two-sided Moller-Trumbore test: hits on either side of the triangle are accepted, only a near-zero determinant is rejected.
+// On TRUE, intersection_a/intersection_b receive the barycentric u/v and intersection_t the parameter along dir; on FALSE they are untouched.
+BOOL LLTriangleRayIntersectTwoSided(const LLVector4a& vert0, const LLVector4a& vert1, const LLVector4a& vert2, const LLVector4a& orig, const LLVector4a& dir,
+							F32& intersection_a, F32& intersection_b, F32& intersection_t)
+{
+	F32 u, v, t;
+	
+	/* find vectors for two edges sharing vert0 */
+	LLVector4a edge1;
+	edge1.setSub(vert1, vert0);
+	
+	LLVector4a edge2;
+	edge2.setSub(vert2, vert0);
+
+	/* begin calculating determinant - also used to calculate U parameter */
+	LLVector4a pvec;
+	pvec.setCross3(dir, edge2);
+
+	/* if determinant is near zero, ray lies in plane of triangle */
+	F32 det = edge1.dot3(pvec).getF32();
 
-		/* calculate t, scale parameters, ray intersects triangle */
-		t = edge2 * qvec;
-		F32 inv_det = 1.0 / det;
-		t *= inv_det;
-		u *= inv_det;
-		v *= inv_det;
+	
+	if (det > -F_APPROXIMATELY_ZERO && det < F_APPROXIMATELY_ZERO)
+	{
+		return FALSE;
 	}
+
+	F32 inv_det = 1.f / det;
+
+	/* calculate distance from vert0 to ray origin */
+	LLVector4a tvec;
+	tvec.setSub(orig, vert0);
 	
-	else // two sided
-			{
-		if (det > -F_APPROXIMATELY_ZERO && det < F_APPROXIMATELY_ZERO)
-				{
-			return FALSE;
-				}
-		F32 inv_det = 1.0 / det;
+	/* calculate U parameter and test bounds */
+	u = (tvec.dot3(pvec).getF32()) * inv_det;
+	if (u < 0.f || u > 1.f)
+	{
+		return FALSE;
+	}
 
-		/* calculate distance from vert0 to ray origin */
-		LLVector3 tvec = orig - vert0;
+	/* prepare to test V parameter: qvec is the cross product tvec x edge1 (not the difference), matching the one-sided test */
+	LLVector4a qvec;
+	qvec.setCross3(tvec, edge1);
 		
-		/* calculate U parameter and test bounds */
-		u = (tvec * pvec) * inv_det;
-		if (u < 0.f || u > 1.f)
+	/* calculate V parameter and test bounds */
+	v = (dir.dot3(qvec).getF32()) * inv_det;
+	
+	if (v < 0.f || u + v > 1.f)
+	{
+		return FALSE;
+	}
+
+	/* calculate t, ray intersects triangle */
+	t = (edge2.dot3(qvec).getF32()) * inv_det;
+	
+	intersection_a = u;
+	intersection_b = v;
+	intersection_t = t;
+	return TRUE;
+} 
+
+// Convenience overload for unaligned LLVector3 input: each vector is copied into an
+// LLVector4a and the call is forwarded to the one-sided or two-sided LLVector4a test.
+BOOL LLTriangleRayIntersect(const LLVector3& vert0, const LLVector3& vert1, const LLVector3& vert2, const LLVector3& orig, const LLVector3& dir,
+							F32& intersection_a, F32& intersection_b, F32& intersection_t, BOOL two_sided)
+{
+	LLVector4a tri_a, tri_b, tri_c;
+	tri_a.load3(vert0.mV);
+	tri_b.load3(vert1.mV);
+	tri_c.load3(vert2.mV);
+	LLVector4a ray_origin, ray_dir;
+	ray_origin.load3(orig.mV);
+	ray_dir.load3(dir.mV);
+
+	if (!two_sided)
+	{ // front-facing hits only
+		return LLTriangleRayIntersect(tri_a, tri_b, tri_c, ray_origin, ray_dir, 
+				intersection_a, intersection_b, intersection_t);
+	}
+
+	return LLTriangleRayIntersectTwoSided(tri_a, tri_b, tri_c, ray_origin, ray_dir, 
+			intersection_a, intersection_b, intersection_t);
+}
+
+// Octree traveler that rebuilds each node's listener extents/bounds from the triangles stored in the node and the extents of its children.
+class LLVolumeOctreeRebound : public LLOctreeTravelerDepthFirst<LLVolumeTriangle>
+{
+public:
+	const LLVolumeFace* mFace;
+
+	LLVolumeOctreeRebound(const LLVolumeFace* face)
+	{
+		mFace = face;
+	}
+
+	virtual void visit(const LLOctreeNode<LLVolumeTriangle>* branch)	// reads the children's mExtents, so those must already be up to date
+	{
+		LLVolumeOctreeListener* node = (LLVolumeOctreeListener*) branch->getListener(0);
+
+		LLVector4a& min = node->mExtents[0];
+		LLVector4a& max = node->mExtents[1];
+
+		if (branch->getElementCount() != 0)
 		{
-			return FALSE;
+			const LLVolumeTriangle* tri = *(branch->getData().begin());
+			min = *(tri->mV[0]);
+			max = *(tri->mV[0]);
+			
+			for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter = 
+				branch->getData().begin(); iter != branch->getData().end(); ++iter)
+			{
+				//stretch by triangles in node
+				tri = *iter;
+				
+				min.setMin(min, *tri->mV[0]);
+				min.setMin(min, *tri->mV[1]);
+				min.setMin(min, *tri->mV[2]);
+
+				max.setMax(max, *tri->mV[0]);
+				max.setMax(max, *tri->mV[1]);
+				max.setMax(max, *tri->mV[2]);
 			}
 
-		/* prepare to test V parameter */
-		LLVector3 qvec = tvec - edge1;
-		
-		/* calculate V parameter and test bounds */
-		v = (dir * qvec) * inv_det;
-		
-		if (v < 0.f || u + v > 1.f)
+			for (S32 i = 0; i < branch->getChildCount(); ++i)
+			{  //stretch by child extents
+				LLVolumeOctreeListener* child = (LLVolumeOctreeListener*) branch->getChild(i)->getListener(0);
+				min.setMin(min, child->mExtents[0]);
+				max.setMax(max, child->mExtents[1]);	// accumulate into max; setMax(min, ...) would discard the triangle-derived max
+			}
+		}
+		else if (branch->getChildCount() != 0)
 		{
-			return FALSE;
+			LLVolumeOctreeListener* child = (LLVolumeOctreeListener*) branch->getChild(0)->getListener(0);
+
+			min = child->mExtents[0];
+			max = child->mExtents[1];
+
+			for (S32 i = 1; i < branch->getChildCount(); ++i)
+			{  //stretch by child extents
+				child = (LLVolumeOctreeListener*) branch->getChild(i)->getListener(0);
+				min.setMin(min, child->mExtents[0]);
+				max.setMax(max, child->mExtents[1]);
+			}
+		}
+		else
+		{
+			llerrs << "WTF? Empty leaf" << llendl;
 		}
+		
+		node->mBounds[0].setAdd(min, max);	// mBounds[0] = (min + max) / 2, the box center
+		node->mBounds[0].mul(0.5f);
 
-		/* calculate t, ray intersects triangle */
-		t = (edge2 * qvec) * inv_det;
+		node->mBounds[1].setSub(max,min);	// mBounds[1] = (max - min) / 2, the half size
+		node->mBounds[1].mul(0.5f);
 	}
-	
-	if (intersection_a != NULL)
-		*intersection_a = u;
-	if (intersection_b != NULL)
-		*intersection_b = v;
-	if (intersection_t != NULL)
-		*intersection_t = t;
-	
-	
-	return TRUE;
-} 
+};
 
 
 //-------------------------------------------------------------------
@@ -1843,50 +1987,138 @@ BOOL LLVolume::generate()
 	return FALSE;
 }
 
-bool LLVolumeFace::VertexData::operator<(const LLVolumeFace::VertexData& rhs)const
+void LLVolumeFace::VertexData::init()	// allocates mData on first use; does nothing when mData is already set
 {
-	const U8* l = (const U8*) this;
-	const U8* r = (const U8*) &rhs;
-
-	for (U32 i = 0; i < sizeof(VertexData); ++i)
+	if (!mData)
 	{
-		if (l[i] != r[i])
-		{
-			return r[i] < l[i];
-		}
+		mData = (LLVector4a*) ll_aligned_malloc_16(32);	// 32 bytes, accessed elsewhere as mData[POSITION] and mData[NORMAL]; the result is not checked
 	}
+}
+
+LLVolumeFace::VertexData::VertexData()
+{
+	mData = NULL;	// must be NULL first: init() only allocates when mData is unset
+	init();
+}
 	
-	return false;
+LLVolumeFace::VertexData::VertexData(const VertexData& rhs)
+{
+	mData = NULL;	// operator= calls init(), which allocates only while mData is NULL
+	*this = rhs;	// deep copy: this object gets its own buffer
 }
 
-bool LLVolumeFace::VertexData::operator==(const LLVolumeFace::VertexData& rhs)const
+const LLVolumeFace::VertexData& LLVolumeFace::VertexData::operator=(const LLVolumeFace::VertexData& rhs)
+{
+	if (this != &rhs)	// self-assignment is a no-op
+	{
+		init();	// make sure the destination buffer exists
+		LLVector4a::memcpyNonAliased16((F32*) mData, (F32*) rhs.mData, 8*sizeof(F32));	// copies 8 floats: the two LLVector4a slots
+		mTexCoord = rhs.mTexCoord;
+	}
+	return *this;
+}
+
+LLVolumeFace::VertexData::~VertexData()
 {
-	const U8* l = (const U8*) this;
-	const U8* r = (const U8*) &rhs;
+	ll_aligned_free_16(mData);	// releases the buffer obtained in init()
+}
+
+LLVector4a& LLVolumeFace::VertexData::getPosition()
+{
+	return mData[POSITION];	// mutable reference into this object's buffer
+}
+
+LLVector4a& LLVolumeFace::VertexData::getNormal()
+{
+	return mData[NORMAL];	// mutable reference into this object's buffer
+}
+
+const LLVector4a& LLVolumeFace::VertexData::getPosition() const
+{
+	return mData[POSITION];	// read-only view of the position slot
+}
 
-	for (U32 i = 0; i < sizeof(VertexData); ++i)
+const LLVector4a& LLVolumeFace::VertexData::getNormal() const
+{
+	return mData[NORMAL];	// read-only view of the normal slot
+}
+
+
+void LLVolumeFace::VertexData::setPosition(const LLVector4a& pos)
+{
+	mData[POSITION] = pos;	// copies the whole LLVector4a into the position slot
+}
+
+void LLVolumeFace::VertexData::setNormal(const LLVector4a& norm)
+{
+	mData[NORMAL] = norm;	// copies the whole LLVector4a into the normal slot
+}
+
+bool LLVolumeFace::VertexData::operator<(const LLVolumeFace::VertexData& rhs)const	// lexicographic order: position x,y,z, then normal x,y,z, then texcoord
+{
+	const F32* lp = this->getPosition().getF32ptr();
+	const F32* rp = rhs.getPosition().getF32ptr();
+
+	if (lp[0] != rp[0])	// the first component that differs decides the result
 	{
-		if (l[i] != r[i])
-		{
-			return false;
-		}
+		return lp[0] < rp[0];
 	}
-	
-	return true;
+
+	if (rp[1] != lp[1])
+	{
+		return lp[1] < rp[1];
+	}
+
+	if (rp[2] != lp[2])
+	{
+		return lp[2] < rp[2];
+	}
+
+	lp = getNormal().getF32ptr();	// positions match in x/y/z: fall back to the normals
+	rp = rhs.getNormal().getF32ptr();
+
+	if (lp[0] != rp[0])
+	{
+		return lp[0] < rp[0];
+	}
+
+	if (rp[1] != lp[1])
+	{
+		return lp[1] < rp[1];
+	}
+
+	if (rp[2] != lp[2])
+	{
+		return lp[2] < rp[2];
+	}
+
+	if (mTexCoord.mV[0] != rhs.mTexCoord.mV[0])	// normals match too: texture coordinates break the tie
+	{
+		return mTexCoord.mV[0] < rhs.mTexCoord.mV[0];
+	}
+
+	return mTexCoord.mV[1] < rhs.mTexCoord.mV[1];
+}
+
+bool LLVolumeFace::VertexData::operator==(const LLVolumeFace::VertexData& rhs)const	// equal when position, normal and texcoord all match
+{
+	return mData[POSITION].equals3(rhs.getPosition()) &&
+			mData[NORMAL].equals3(rhs.getNormal()) &&	// equals3 is used for both vectors (presumably x/y/z only -- confirm in llvector4a.h)
+			mTexCoord == rhs.mTexCoord;
 }
 
 bool LLVolumeFace::VertexData::compareNormal(const LLVolumeFace::VertexData& rhs, F32 angle_cutoff) const
 {
 	bool retval = false;
-	if (rhs.mPosition == mPosition && rhs.mTexCoord == mTexCoord)
+	if (rhs.mData[POSITION].equals3(mData[POSITION]) && rhs.mTexCoord == mTexCoord)
 	{
 		if (angle_cutoff > 1.f)
 		{
-			retval = (mNormal == rhs.mNormal);
+			retval = (mData[NORMAL].equals3(rhs.mData[NORMAL]));
 		}
 		else
 		{
-			F32 cur_angle = rhs.mNormal*mNormal;
+			F32 cur_angle = rhs.mData[NORMAL].dot3(mData[NORMAL]).getF32();
 			retval = cur_angle > angle_cutoff;
 		}
 	}
@@ -1990,11 +2222,10 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 
 			LLVolumeFace& face = mVolumeFaces[i];
 
-			face.mHasBinormals = false;
-
 			//copy out indices
-			face.mIndices.resize(idx.size()/2);
-			if (idx.empty() || face.mIndices.size() < 3)
+			face.resizeIndices(idx.size()/2);
+			
+			if (idx.empty() || face.mNumIndices < 3)
 			{ //why is there an empty index list?
 				llerrs <<"WTF?" << llendl;
 				continue;
@@ -2008,30 +2239,32 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 
 			//copy out vertices
 			U32 num_verts = pos.size()/(3*2);
-			face.mVertices.resize(num_verts);
+			face.resizeVertices(num_verts);
 
 			if (mdl[i].has("Weights"))
 			{
-				face.mWeights.resize(num_verts);
+				face.allocateWeights(num_verts);
+
 				LLSD::Binary weights = mdl[i]["Weights"];
 
-				LLSD::Binary::iterator iter = weights.begin();
+				U32 idx = 0;
 
 				U32 cur_vertex = 0;
-				while (iter != weights.end())
+				while (idx < weights.size() && cur_vertex < num_verts)
 				{
-					const S32 END_INFLUENCES = 0xFF;
-					U8 joint = *(iter++);
+					const U8 END_INFLUENCES = 0xFF;
+					U8 joint = weights[idx++];
 
 					U32 cur_influence = 0;
+					LLVector4 wght(0,0,0,0);
+
 					while (joint != END_INFLUENCES)
 					{
-						U16 influence = *(iter++);
-						influence = influence << 8;
-						influence |= *(iter++);
+						U16 influence = weights[idx++];
+						influence |= ((U16) weights[idx++] << 8);
 
 						F32 w = llmin((F32) influence / 65535.f, 0.99999f);
-						face.mWeights[cur_vertex].mV[cur_influence++] = (F32) joint + w;
+						wght.mV[cur_influence++] = (F32) joint + w;
 
 						if (cur_influence >= 4)
 						{
@@ -2039,64 +2272,86 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 						}
 						else
 						{
-							joint = *(iter++);
+							joint = weights[idx++];
 						}
 					}
 
+					face.mWeights[cur_vertex].loadua(wght.mV);
+
 					cur_vertex++;
-					iter++;
 				}
+
+				if (cur_vertex != num_verts || idx != weights.size())
+				{
+					llwarns << "Vertex weight count does not match vertex count!" << llendl;
+				}
+					
 			}
 
-			LLVector3 min_pos;
-			LLVector3 max_pos;
+			LLVector3 minp;
+			LLVector3 maxp;
 			LLVector2 min_tc; 
 			LLVector2 max_tc; 
-
 		
-			min_pos.setValue(mdl[i]["PositionDomain"]["Min"]);
-			max_pos.setValue(mdl[i]["PositionDomain"]["Max"]);
+			minp.setValue(mdl[i]["PositionDomain"]["Min"]);
+			maxp.setValue(mdl[i]["PositionDomain"]["Max"]);
+			LLVector4a min_pos, max_pos;
+			min_pos.load3(minp.mV);
+			max_pos.load3(maxp.mV);
+
 			min_tc.setValue(mdl[i]["TexCoord0Domain"]["Min"]);
 			max_tc.setValue(mdl[i]["TexCoord0Domain"]["Max"]);
 
-			LLVector3 pos_range = max_pos - min_pos;
+			LLVector4a pos_range;
+			pos_range.setSub(max_pos, min_pos);
 			LLVector2 tc_range = max_tc - min_tc;
 
-			LLVector3& min = face.mExtents[0];
-			LLVector3& max = face.mExtents[1];
+			LLVector4a& min = face.mExtents[0];
+			LLVector4a& max = face.mExtents[1];
 
-			min = max = LLVector3(0,0,0);
+			min.clear();
+			max.clear();
+			
+			LLVector4a* pos_out = face.mPositions;
+			LLVector4a* norm_out = face.mNormals;
+			LLVector2* tc_out = face.mTexCoords;
 
 			for (U32 j = 0; j < num_verts; ++j)
 			{
 				U16* v = (U16*) &(pos[j*3*2]);
 
-				face.mVertices[j].mPosition.setVec(
-					(F32) v[0] / 65535.f * pos_range.mV[0] + min_pos.mV[0],
-					(F32) v[1] / 65535.f * pos_range.mV[1] + min_pos.mV[1],
-					(F32) v[2] / 65535.f * pos_range.mV[2] + min_pos.mV[2]);
+				pos_out->set((F32) v[0], (F32) v[1], (F32) v[2]);
+				pos_out->div(65535.f);
+				pos_out->mul(pos_range);
+				pos_out->add(min_pos);
 
 				if (j == 0)
 				{
-					min = max = face.mVertices[j].mPosition;
+					min = *pos_out;
+					max = min;
 				}
 				else
 				{
-					update_min_max(min,max,face.mVertices[j].mPosition);
+					min.setMin(min, *pos_out);
+					max.setMax(max, *pos_out);
 				}
 
+				pos_out++;
+
 				U16* n = (U16*) &(norm[j*3*2]);
 
-				face.mVertices[j].mNormal.setVec(
-					(F32) n[0] / 65535.f * 2.f - 1.f,
-					(F32) n[1] / 65535.f * 2.f - 1.f,
-					(F32) n[2] / 65535.f * 2.f - 1.f);
+				norm_out->set((F32) n[0], (F32) n[1], (F32) n[2]);
+				norm_out->div(65535.f);
+				norm_out->mul(2.f);
+				norm_out->sub(1.f);
+				norm_out++;
 
 				U16* t = (U16*) &(tc[j*2*2]);
 
-				face.mVertices[j].mTexCoord.setVec(
-					(F32) t[0] / 65535.f * tc_range.mV[0] + min_tc.mV[0],
-					(F32) t[1] / 65535.f * tc_range.mV[1] + min_tc.mV[1]);
+				tc_out->mV[0] = (F32) t[0] / 65535.f * tc_range.mV[0] + min_tc.mV[0];
+				tc_out->mV[1] =	(F32) t[1] / 65535.f * tc_range.mV[1] + min_tc.mV[1];
+
+				tc_out++;
 			}
 
 			
@@ -2126,24 +2381,29 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 
 			if (do_reflect_x)
 			{
-				for (S32 i = 0; i < face.mVertices.size(); i++)
+				LLVector4a* p = (LLVector4a*) face.mPositions;
+				LLVector4a* n = (LLVector4a*) face.mNormals;
+				const LLVector4a flip_x(-1.0f, 1.0f, 1.0f, 1.0f); // reflect across X only: Y and Z must keep their sign, as in the per-component code this replaces
+				for (S32 i = 0; i < face.mNumVertices; i++)
 				{
-					face.mVertices[i].mPosition.mV[VX] *= -1.0f;
-					face.mVertices[i].mNormal.mV[VX] *= -1.0f;
+					p[i].mul(flip_x);
+					n[i].mul(flip_x);
 				}
 			}
 
 			if (do_invert_normals)
 			{
-				for (S32 i = 0; i < face.mVertices.size(); i++)
+				LLVector4a* n = (LLVector4a*) face.mNormals;
+				
+				for (S32 i = 0; i < face.mNumVertices; i++)
 				{
-					face.mVertices[i].mNormal *= -1.0f;
+					n[i].mul(-1.0f);
 				}
 			}
 
 			if (do_reverse_triangles)
 			{
-				for (U32 j = 0; j < face.mIndices.size(); j += 3)
+				for (U32 j = 0; j < face.mNumIndices; j += 3)
 				{
 					// swap the 2nd and 3rd index
 					S32 swap = face.mIndices[j+1];
@@ -2161,13 +2421,15 @@ bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 
 void tetrahedron_set_normal(LLVolumeFace::VertexData* cv)
 {
-	LLVector3 nrm = (cv[1].mPosition-cv[0].mPosition)%(cv[2].mPosition-cv[0].mPosition);
-
-	nrm.normVec();
-
-	cv[0].mNormal = nrm;
-	cv[1].mNormal = nrm;
-	cv[2].mNormal = nrm;
+	LLVector4a v0;
+	v0.setSub(cv[1].getPosition(), cv[0].getPosition());	// edge cv[0] -> cv[1], built from positions only
+	LLVector4a v1;
+	v1.setSub(cv[2].getPosition(), cv[0].getPosition());	// edge cv[0] -> cv[2], built from positions only
+	// face normal = normalized (v0 x v1); the same normal is stored on all three corners
+	cv[0].getNormal().setCross3(v0,v1);
+	cv[0].getNormal().normalize3fast();
+	cv[1].setNormal(cv[0].getNormal());
+	cv[2].setNormal(cv[1].getNormal());
 }
 
 BOOL LLVolume::isTetrahedron()
@@ -2182,16 +2444,16 @@ void LLVolume::makeTetrahedron()
 	LLVolumeFace face;
 
 	F32 x = 0.25f;
-	LLVector3 p[] = 
+	LLVector4a p[] = 
 	{ //unit tetrahedron corners
-		LLVector3(x,x,x),
-		LLVector3(-x,-x,x),
-		LLVector3(-x,x,-x),
-		LLVector3(x,-x,-x)
+		LLVector4a(x,x,x),
+		LLVector4a(-x,-x,x),
+		LLVector4a(-x,x,-x),
+		LLVector4a(x,-x,-x)
 	};
 
-	face.mExtents[0].setVec(-x,-x,-x);
-	face.mExtents[1].setVec(x,x,x);
+	face.mExtents[0].splat(-x);
+	face.mExtents[1].splat(x);
 	
 	LLVolumeFace::VertexData cv[3];
 
@@ -2202,53 +2464,105 @@ void LLVolume::makeTetrahedron()
 
 
 	//side 1
-	cv[0].mPosition = p[1];
-	cv[1].mPosition = p[0];
-	cv[2].mPosition = p[2];
+	cv[0].setPosition(p[1]);
+	cv[1].setPosition(p[0]);
+	cv[2].setPosition(p[2]);
 
 	tetrahedron_set_normal(cv);
 
-	face.mVertices.push_back(cv[0]);
-	face.mVertices.push_back(cv[1]);
-	face.mVertices.push_back(cv[2]);
+	face.resizeVertices(12);
+	face.resizeIndices(12);
+
+	LLVector4a* v = (LLVector4a*) face.mPositions;
+	LLVector4a* n = (LLVector4a*) face.mNormals;
+	LLVector2* tc = (LLVector2*) face.mTexCoords;
+
+	v[0] = cv[0].getPosition();
+	v[1] = cv[1].getPosition();
+	v[2] = cv[2].getPosition();
+	v += 3;
+
+	n[0] = cv[0].getNormal();
+	n[1] = cv[1].getNormal();
+	n[2] = cv[2].getNormal();
+	n += 3;
+
+	tc[0] = cv[0].mTexCoord;
+	tc[1] = cv[1].mTexCoord;
+	tc[2] = cv[2].mTexCoord;
+	tc += 3;
+
 	
 	//side 2
-	cv[0].mPosition = p[3];
-	cv[1].mPosition = p[0];
-	cv[2].mPosition = p[1];
+	cv[0].setPosition(p[3]);
+	cv[1].setPosition(p[0]);
+	cv[2].setPosition(p[1]);
 
 	tetrahedron_set_normal(cv);
 
-	face.mVertices.push_back(cv[0]);
-	face.mVertices.push_back(cv[1]);
-	face.mVertices.push_back(cv[2]);
+	v[0] = cv[0].getPosition();
+	v[1] = cv[1].getPosition();
+	v[2] = cv[2].getPosition();
+	v += 3;
+
+	n[0] = cv[0].getNormal();
+	n[1] = cv[1].getNormal();
+	n[2] = cv[2].getNormal();
+	n += 3;
+
+	tc[0] = cv[0].mTexCoord;
+	tc[1] = cv[1].mTexCoord;
+	tc[2] = cv[2].mTexCoord;
+	tc += 3;
 	
 	//side 3
-	cv[0].mPosition = p[3];
-	cv[1].mPosition = p[1];
-	cv[2].mPosition = p[2];
+	cv[0].setPosition(p[3]);
+	cv[1].setPosition(p[1]);
+	cv[2].setPosition(p[2]);
 
 	tetrahedron_set_normal(cv);
 
-	face.mVertices.push_back(cv[0]);
-	face.mVertices.push_back(cv[1]);
-	face.mVertices.push_back(cv[2]);
+	v[0] = cv[0].getPosition();
+	v[1] = cv[1].getPosition();
+	v[2] = cv[2].getPosition();
+	v += 3;
+
+	n[0] = cv[0].getNormal();
+	n[1] = cv[1].getNormal();
+	n[2] = cv[2].getNormal();
+	n += 3;
+
+	tc[0] = cv[0].mTexCoord;
+	tc[1] = cv[1].mTexCoord;
+	tc[2] = cv[2].mTexCoord;
+	tc += 3;
 	
 	//side 4
-	cv[0].mPosition = p[2];
-	cv[1].mPosition = p[0];
-	cv[2].mPosition = p[3];
+	cv[0].setPosition(p[2]);
+	cv[1].setPosition(p[0]);
+	cv[2].setPosition(p[3]);
 
 	tetrahedron_set_normal(cv);
 
-	face.mVertices.push_back(cv[0]);
-	face.mVertices.push_back(cv[1]);
-	face.mVertices.push_back(cv[2]);
+	v[0] = cv[0].getPosition();
+	v[1] = cv[1].getPosition();
+	v[2] = cv[2].getPosition();
+	v += 3;
+
+	n[0] = cv[0].getNormal();
+	n[1] = cv[1].getNormal();
+	n[2] = cv[2].getNormal();
+	n += 3;
+
+	tc[0] = cv[0].mTexCoord;
+	tc[1] = cv[1].mTexCoord;
+	tc[2] = cv[2].mTexCoord;
+	tc += 3;
 	
 	//set index buffer
-	for (U32 i = 0; i < 12; i++)
+	for (U16 i = 0; i < 12; i++)
 	{
-		face.mIndices.push_back(i);
+		face.mIndices[i] = i;
 	}
 	
 	mVolumeFaces.push_back(face);
@@ -2266,12 +2580,14 @@ void LLVolume::copyVolumeFaces(LLVolume* volume)
 
 S32	LLVolume::getNumFaces() const
 {
+#if LL_MESH_ENABLED
 	U8 sculpt_type = (mParams.getSculptType() & LL_SCULPT_TYPE_MASK);
 
 	if (sculpt_type == LL_SCULPT_TYPE_MESH)
 	{
 		return LL_SCULPT_MESH_MAX_FACES;
 	}
+#endif // LL_MESH_ENABLED
 
 	return (S32)mProfilep->mFaces.size();
 }
@@ -2629,7 +2945,7 @@ void sculpt_calc_mesh_resolution(U16 width, U16 height, U8 type, F32 detail, S32
 		ratio = (F32) width / (F32) height;
 
 	
-	s = (S32)fsqrtf(((F32)vertices / ratio));
+	s = (S32)(F32) sqrt(((F32)vertices / ratio));
 
 	s = llmax(s, 4);              // no degenerate sizes, please
 	t = vertices / s;
@@ -2644,11 +2960,6 @@ void LLVolume::sculpt(U16 sculpt_width, U16 sculpt_height, S8 sculpt_components,
 	LLMemType m1(LLMemType::MTYPE_VOLUME);
     U8 sculpt_type = mParams.getSculptType();
 
-	if (sculpt_type & LL_SCULPT_TYPE_MASK == LL_SCULPT_TYPE_MESH)
-	{
-		llerrs << "WTF?" << llendl;
-	}
-
 	BOOL data_is_empty = FALSE;
 
 	if (sculpt_width == 0 || sculpt_height == 0 || sculpt_components < 3 || sculpt_data == NULL)
@@ -3824,7 +4135,7 @@ S32 LLVolume::getNumTriangles() const
 
 	for (S32 i = 0; i < getNumVolumeFaces(); ++i)
 	{
-		triangle_count += getVolumeFace(i).mIndices.size()/3;
+		triangle_count += getVolumeFace(i).mNumIndices/3;
 	}
 
 	return triangle_count;
@@ -3837,21 +4148,32 @@ S32 LLVolume::getNumTriangles() const
 void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 										  std::vector<LLVector3> &normals,
 										  std::vector<S32> &segments,
-										  const LLVector3& obj_cam_vec,
-										  const LLMatrix4& mat,
-										  const LLMatrix3& norm_mat,
+										  const LLVector3& obj_cam_vec_in,
+										  const LLMatrix4& mat_in,
+										  const LLMatrix3& norm_mat_in,
 										  S32 face_mask)
 {
 	LLMemType m1(LLMemType::MTYPE_VOLUME);
 
+	LLMatrix4a mat;
+	mat.loadu(mat_in);
+
+	LLMatrix4a norm_mat;
+	norm_mat.loadu(norm_mat_in);
+		
+	LLVector4a obj_cam_vec;
+	obj_cam_vec.load3(obj_cam_vec_in.mV);
+
 	vertices.clear();
 	normals.clear();
 	segments.clear();
 
+#if LL_MESH_ENABLED
 	if ((mParams.getSculptType() & LL_SCULPT_TYPE_MASK) == LL_SCULPT_TYPE_MESH)
 	{
 		return;
 	}
+#endif
 	
 	S32 cur_index = 0;
 	//for each face
@@ -3861,7 +4183,7 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 		LLVolumeFace& face = *iter;
 	
 		if (!(face_mask & (0x1 << cur_index++)) ||
-		     face.mIndices.empty() || face.mEdge.empty())
+		     face.mNumIndices == 0 || face.mEdge.empty())
 		{
 			continue;
 		}
@@ -3878,7 +4200,7 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 #if DEBUG_SILHOUETTE_EDGE_MAP
 
 			//for each triangle
-			U32 count = face.mIndices.size();
+			U32 count = face.mNumIndices;
 			for (U32 j = 0; j < count/3; j++) {
 				//get vertices
 				S32 v1 = face.mIndices[j*3+0];
@@ -3886,9 +4208,9 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 				S32 v3 = face.mIndices[j*3+2];
 
 				//get current face center
-				LLVector3 cCenter = (face.mVertices[v1].mPosition + 
-									face.mVertices[v2].mPosition + 
-									face.mVertices[v3].mPosition) / 3.0f;
+				LLVector3 cCenter = (face.mVertices[v1].getPosition() + 
+									face.mVertices[v2].getPosition() + 
+									face.mVertices[v3].getPosition()) / 3.0f;
 
 				//for each edge
 				for (S32 k = 0; k < 3; k++) {
@@ -3906,9 +4228,9 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 					v3 = face.mIndices[nIndex*3+2];
 
 					//get neighbor face center
-					LLVector3 nCenter = (face.mVertices[v1].mPosition + 
-									face.mVertices[v2].mPosition + 
-									face.mVertices[v3].mPosition) / 3.0f;
+					LLVector3 nCenter = (face.mVertices[v1].getPosition() + 
+									face.mVertices[v2].getPosition() + 
+									face.mVertices[v3].getPosition()) / 3.0f;
 
 					//draw line
 					vertices.push_back(cCenter);
@@ -3931,15 +4253,15 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 #elif DEBUG_SILHOUETTE_NORMALS
 
 			//for each vertex
-			for (U32 j = 0; j < face.mVertices.size(); j++) {
-				vertices.push_back(face.mVertices[j].mPosition);
-				vertices.push_back(face.mVertices[j].mPosition + face.mVertices[j].mNormal*0.1f);
+			for (U32 j = 0; j < face.mNumVertices; j++) {
+				vertices.push_back(face.mVertices[j].getPosition());
+				vertices.push_back(face.mVertices[j].getPosition() + face.mVertices[j].getNormal()*0.1f);
 				normals.push_back(LLVector3(0,0,1));
 				normals.push_back(LLVector3(0,0,1));
 				segments.push_back(vertices.size());
 #if DEBUG_SILHOUETTE_BINORMALS
-				vertices.push_back(face.mVertices[j].mPosition);
-				vertices.push_back(face.mVertices[j].mPosition + face.mVertices[j].mBinormal*0.1f);
+				vertices.push_back(face.mVertices[j].getPosition());
+				vertices.push_back(face.mVertices[j].getPosition() + face.mVertices[j].mBinormal*0.1f);
 				normals.push_back(LLVector3(0,0,1));
 				normals.push_back(LLVector3(0,0,1));
 				segments.push_back(vertices.size());
@@ -3957,26 +4279,36 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 
 			//for each triangle
 			std::vector<U8> fFacing;
-			vector_append(fFacing, face.mIndices.size()/3);
-			for (U32 j = 0; j < face.mIndices.size()/3; j++) 
+			vector_append(fFacing, face.mNumIndices/3);
+
+			LLVector4a* v = (LLVector4a*) face.mPositions;
+			LLVector4a* n = (LLVector4a*) face.mNormals;
+
+			for (U32 j = 0; j < face.mNumIndices/3; j++) 
 			{
 				//approximate normal
 				S32 v1 = face.mIndices[j*3+0];
 				S32 v2 = face.mIndices[j*3+1];
 				S32 v3 = face.mIndices[j*3+2];
 
-				LLVector3 norm = (face.mVertices[v1].mPosition - face.mVertices[v2].mPosition) % 
-					(face.mVertices[v2].mPosition - face.mVertices[v3].mPosition);
-				
-				if (norm.magVecSquared() < 0.00000001f) 
+				LLVector4a c1,c2;
+				c1.setSub(v[v1], v[v2]);
+				c2.setSub(v[v2], v[v3]);
+
+				LLVector4a norm;
+
+				norm.setCross3(c1, c2);
+
+				if (norm.dot3(norm) < 0.00000001f) 
 				{
 					fFacing[j] = AWAY | TOWARDS;
 				}
 				else 
 				{
 					//get view vector
-					LLVector3 view = (obj_cam_vec-face.mVertices[v1].mPosition);
-					bool away = view * norm > 0.0f; 
+					LLVector4a view;
+					view.setSub(obj_cam_vec, v[v1]);
+					bool away = view.dot3(norm) > 0.0f; 
 					if (away) 
 					{
 						fFacing[j] = AWAY;
@@ -3989,7 +4321,7 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 			}
 			
 			//for each triangle
-			for (U32 j = 0; j < face.mIndices.size()/3; j++) 
+			for (U32 j = 0; j < face.mNumIndices/3; j++) 
 			{
 				if (fFacing[j] == (AWAY | TOWARDS)) 
 				{ //this is a degenerate triangle
@@ -4022,15 +4354,21 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 						S32 v1 = face.mIndices[j*3+k];
 						S32 v2 = face.mIndices[j*3+((k+1)%3)];
 						
-						vertices.push_back(face.mVertices[v1].mPosition*mat);
-						LLVector3 norm1 = face.mVertices[v1].mNormal * norm_mat;
-						norm1.normVec();
-						normals.push_back(norm1);
+						LLVector4a t;
+						mat.affineTransform(v[v1], t);
+						vertices.push_back(LLVector3(t[0], t[1], t[2]));
 
-						vertices.push_back(face.mVertices[v2].mPosition*mat);
-						LLVector3 norm2 = face.mVertices[v2].mNormal * norm_mat;
-						norm2.normVec();
-						normals.push_back(norm2);
+						norm_mat.rotate(n[v1], t);
+
+						t.normalize3fast();
+						normals.push_back(LLVector3(t[0], t[1], t[2]));
+
+						mat.affineTransform(v[v2], t);
+						vertices.push_back(LLVector3(t[0], t[1], t[2]));
+						
+						norm_mat.rotate(n[v2], t);
+						t.normalize3fast();
+						normals.push_back(LLVector3(t[0], t[1], t[2]));
 
 						segments.push_back(vertices.size());
 					}
@@ -4045,6 +4383,19 @@ S32 LLVolume::lineSegmentIntersect(const LLVector3& start, const LLVector3& end,
 								   S32 face,
 								   LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal)
 {
+	LLVector4a starta, enda;
+	starta.load3(start.mV);
+	enda.load3(end.mV);
+
+	return lineSegmentIntersect(starta, enda, face, intersection, tex_coord, normal, bi_normal);
+
+}
+
+
+S32 LLVolume::lineSegmentIntersect(const LLVector4a& start, const LLVector4a& end, 
+								   S32 face,
+								   LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal)
+{
 	S32 hit_face = -1;
 	
 	S32 start_face;
@@ -4061,7 +4412,8 @@ S32 LLVolume::lineSegmentIntersect(const LLVector3& start, const LLVector3& end,
 		end_face = face;
 	}
 
-	LLVector3 dir = end - start;
+	LLVector4a dir;
+	dir.setSub(end, start);
 
 	F32 closest_t = 2.f; // must be larger than 1
 	
@@ -4069,10 +4421,14 @@ S32 LLVolume::lineSegmentIntersect(const LLVector3& start, const LLVector3& end,
 
 	for (S32 i = start_face; i <= end_face; i++)
 	{
-		const LLVolumeFace &face = getVolumeFace((U32)i);
+		LLVolumeFace &face = mVolumeFaces[i];
 
-		LLVector3 box_center = (face.mExtents[0] + face.mExtents[1]) / 2.f;
-		LLVector3 box_size   = face.mExtents[1] - face.mExtents[0];
+		LLVector4a box_center;
+		box_center.setAdd(face.mExtents[0], face.mExtents[1]);
+		box_center.mul(0.5f);
+
+		LLVector4a box_size;
+		box_size.setSub(face.mExtents[1], face.mExtents[0]);
 
         if (LLLineSegmentBoxIntersect(start, end, box_center, box_size))
 		{
@@ -4080,56 +4436,19 @@ S32 LLVolume::lineSegmentIntersect(const LLVector3& start, const LLVector3& end,
 			{
 				genBinormals(i);
 			}
-			
-			for (U32 tri = 0; tri < face.mIndices.size()/3; tri++) 
-			{
-				S32 index1 = face.mIndices[tri*3+0];
-				S32 index2 = face.mIndices[tri*3+1];
-				S32 index3 = face.mIndices[tri*3+2];
 
-				F32 a, b, t;
+			if (!face.mOctree)
+			{
+				face.createOctree();
+			}
 			
-				if (LLTriangleRayIntersect(face.mVertices[index1].mPosition,
-										   face.mVertices[index2].mPosition,
-										   face.mVertices[index3].mPosition,
-										   start, dir, &a, &b, &t, FALSE))
-				{
-					if ((t >= 0.f) &&      // if hit is after start
-						(t <= 1.f) &&      // and before end
-						(t < closest_t))   // and this hit is closer
-		{
-						closest_t = t;
-						hit_face = i;
+			//LLVector4a* p = (LLVector4a*) face.mPositions;
 
-						if (intersection != NULL)
-						{
-							*intersection = start + dir * closest_t;
-						}
-			
-						if (tex_coord != NULL)
+			LLOctreeTriangleRayIntersect intersect(start, dir, &face, &closest_t, intersection, tex_coord, normal, bi_normal);
+			intersect.traverse(face.mOctree);
+			if (intersect.mHitFace)
 			{
-							*tex_coord = ((1.f - a - b)  * face.mVertices[index1].mTexCoord +
-										  a              * face.mVertices[index2].mTexCoord +
-										  b              * face.mVertices[index3].mTexCoord);
-
-						}
-
-						if (normal != NULL)
-				{
-							*normal    = ((1.f - a - b)  * face.mVertices[index1].mNormal + 
-										  a              * face.mVertices[index2].mNormal +
-										  b              * face.mVertices[index3].mNormal);
-						}
-
-						if (bi_normal != NULL)
-					{
-							*bi_normal = ((1.f - a - b)  * face.mVertices[index1].mBinormal + 
-										  a              * face.mVertices[index2].mBinormal +
-										  b              * face.mVertices[index3].mBinormal);
-						}
-
-					}
-				}
+				hit_face = i;
 			}
 		}		
 	}
@@ -4896,9 +5215,154 @@ std::ostream& operator<<(std::ostream &s, const LLVolume *volumep)
 	return s;
 }
 
+LLVolumeFace::LLVolumeFace() : 
+	mID(0),
+	mTypeMask(0),
+	mBeginS(0),
+	mBeginT(0),
+	mNumS(0),
+	mNumT(0),
+	mNumVertices(0),
+	mNumIndices(0),
+	mPositions(NULL),
+	mNormals(NULL),
+	mBinormals(NULL),
+	mTexCoords(NULL),
+	mIndices(NULL),
+	mWeights(NULL),
+	mOctree(NULL)
+{ //starts as an empty face: zero counts, no vertex/index buffers, no octree
+	mExtents = (LLVector4a*) ll_aligned_malloc_16(48); //48 bytes = three 16-byte slots: mExtents[0], mExtents[1] and the center
+	mCenter = mExtents+2; //center lives in the third slot of the extents block, so it is never freed separately
+}
+
+LLVolumeFace::LLVolumeFace(const LLVolumeFace& src)
+:	mID(0),
+	mTypeMask(0),
+	mBeginS(0),
+	mBeginT(0),
+	mNumS(0),
+	mNumT(0),
+	mNumVertices(0),
+	mNumIndices(0),
+	mPositions(NULL),
+	mNormals(NULL),
+	mBinormals(NULL),
+	mTexCoords(NULL),
+	mIndices(NULL),
+	mWeights(NULL),
+	mOctree(NULL)
+{ //copy constructor: set up an empty face first, then let operator= do the actual copy
+	mExtents = (LLVector4a*) ll_aligned_malloc_16(48); //must exist before operator= writes src's extents into it
+	mCenter = mExtents+2; //third 16-byte slot of the extents block
+	*this = src; //pointers were initialized to NULL above, so the frees inside operator= see no stale buffers
+}
+
+LLVolumeFace& LLVolumeFace::operator=(const LLVolumeFace& src)
+{ //deep copy of src's ids/masks, extents and vertex/index buffers into this face
+	if (&src == this)
+	{ //self assignment, do nothing
+		return *this;
+	}
+
+	mID = src.mID;
+	mTypeMask = src.mTypeMask;
+	mBeginS = src.mBeginS;
+	mBeginT = src.mBeginT;
+	mNumS = src.mNumS;
+	mNumT = src.mNumT;
+
+	mExtents[0] = src.mExtents[0];
+	mExtents[1] = src.mExtents[1];
+	*mCenter = *src.mCenter;
+
+	mNumVertices = 0;
+	mNumIndices = 0;
+
+	freeData(); //drop every buffer (and the octree) this face currently owns
+	
+	LLVector4a::memcpyNonAliased16((F32*) mExtents, (F32*) src.mExtents, 12*sizeof(F32)); //12 floats = both extents plus center; repeats the three assignments above
+
+	resizeVertices(src.mNumVertices); //also sets mNumVertices
+	resizeIndices(src.mNumIndices); //also sets mNumIndices
+
+	if (mNumVertices)
+	{
+		S32 vert_size = mNumVertices*4*sizeof(F32); //16 bytes per position/normal/binormal/weight entry
+		S32 tc_size = (mNumVertices*8+0xF) & ~0xF; //8 bytes per tex coord, rounded up to a multiple of 16
+			
+		LLVector4a::memcpyNonAliased16((F32*) mPositions, (F32*) src.mPositions, vert_size);
+		LLVector4a::memcpyNonAliased16((F32*) mNormals, (F32*) src.mNormals, vert_size);
+		LLVector4a::memcpyNonAliased16((F32*) mTexCoords, (F32*) src.mTexCoords, tc_size);
+
+
+		if (src.mBinormals)
+		{
+			allocateBinormals(src.mNumVertices);
+			LLVector4a::memcpyNonAliased16((F32*) mBinormals, (F32*) src.mBinormals, vert_size);
+		}
+		else
+		{
+			ll_aligned_free_16(mBinormals); //mBinormals is already NULL here (freeData/resizeVertices cleared it)
+			mBinormals = NULL;
+		}
+
+		if (src.mWeights)
+		{
+			allocateWeights(src.mNumVertices);
+			LLVector4a::memcpyNonAliased16((F32*) mWeights, (F32*) src.mWeights, vert_size);
+		}
+		else
+		{
+			ll_aligned_free_16(mWeights); //mWeights is already NULL here (freeData cleared it)
+			mWeights = NULL;
+		}
+	}
+
+	if (mNumIndices)
+	{
+		S32 idx_size = (mNumIndices*2+0xF) & ~0xF; //2 bytes per index, rounded up to a multiple of 16
+		
+		LLVector4a::memcpyNonAliased16((F32*) mIndices, (F32*) src.mIndices, idx_size);
+	}
+	
+	//src's octree is not copied; mOctree stays NULL after the freeData() call above
+	return *this;
+}
+
+LLVolumeFace::~LLVolumeFace()
+{ //releases the extents block and then every buffer freeData() covers
+	ll_aligned_free_16(mExtents); //mCenter points into this same block, so it needs no separate free
+	mExtents = NULL;
+
+	freeData();
+}
+
+void LLVolumeFace::freeData()
+{ //frees all vertex/index buffers and the octree and NULLs the pointers; mExtents/mCenter and the counts are not touched
+	ll_aligned_free_16(mPositions);
+	mPositions = NULL;
+	ll_aligned_free_16(mNormals);
+	mNormals = NULL;
+	ll_aligned_free_16(mTexCoords);
+	mTexCoords = NULL;
+	ll_aligned_free_16(mIndices);
+	mIndices = NULL;
+	ll_aligned_free_16(mBinormals);
+	mBinormals = NULL;
+	ll_aligned_free_16(mWeights);
+	mWeights = NULL;
+
+	delete mOctree; //the octree is heap-allocated with new (see createOctree), unlike the aligned buffers above
+	mOctree = NULL;
+}
 
 BOOL LLVolumeFace::create(LLVolume* volume, BOOL partial_build)
 {
+	//tree for this face is no longer valid
+	delete mOctree;
+	mOctree = NULL;
+
 	if (mTypeMask & CAP_MASK)
 	{
 		return createCap(volume, partial_build);
@@ -4914,6 +5378,35 @@ BOOL LLVolumeFace::create(LLVolume* volume, BOOL partial_build)
 	}
 }
 
+void LLVolumeFace::getVertexData(U16 index, LLVolumeFace::VertexData& cv)
+{ //copies position, normal and tex coord of vertex 'index' into cv; no bounds check is done here
+	cv.setPosition(mPositions[index]);
+	cv.setNormal(mNormals[index]);
+	cv.mTexCoord = mTexCoords[index]; //binormals and weights are not part of the gathered data
+}
+
+bool LLVolumeFace::VertexMapData::operator==(const LLVolumeFace::VertexData& rhs) const
+{ //true when position, tex coord and normal all match rhs; mIndex plays no part in the comparison
+	return getPosition().equals3(rhs.getPosition()) && //presumably compares only x/y/z, ignoring the 4th lane -- confirm in LLVector4a
+		mTexCoord == rhs.mTexCoord &&
+		getNormal().equals3(rhs.getNormal());
+}
+
+bool LLVolumeFace::VertexMapData::ComparePosition::operator()(const LLVector3& a, const LLVector3& b) const
+{
+	//lexicographic "less than" on (x, y, z): the first differing component decides
+	for (S32 axis = 0; axis < 2; ++axis)
+	{
+		if (a.mV[axis] != b.mV[axis])
+		{
+			return a.mV[axis] < b.mV[axis];
+		}
+	}
+	
+	//x and y are tied, so z decides
+	return a.mV[2] < b.mV[2];			
+}
+
 void LLVolumeFace::optimize(F32 angle_cutoff)
 {
 	LLVolumeFace new_face;
@@ -4921,14 +5414,15 @@ void LLVolumeFace::optimize(F32 angle_cutoff)
 	VertexMapData::PointMap point_map;
 
 	//remove redundant vertices
-	for (U32 i = 0; i < mIndices.size(); ++i)
+	for (U32 i = 0; i < mNumIndices; ++i)
 	{
 		U16 index = mIndices[i];
 
-		LLVolumeFace::VertexData cv = mVertices[index];
-
+		LLVolumeFace::VertexData cv;
+		getVertexData(index, cv);
+		
 		BOOL found = FALSE;
-		VertexMapData::PointMap::iterator point_iter = point_map.find(cv.mPosition);
+		VertexMapData::PointMap::iterator point_iter = point_map.find(LLVector3(cv.getPosition().getF32ptr()));
 		if (point_iter != point_map.end())
 		{ //duplicate point might exist
 			for (U32 j = 0; j < point_iter->second.size(); ++j)
@@ -4937,7 +5431,7 @@ void LLVolumeFace::optimize(F32 angle_cutoff)
 				if (tv.compareNormal(cv, angle_cutoff))
 				{
 					found = TRUE;
-					new_face.mIndices.push_back((point_iter->second)[j].mIndex);
+					new_face.pushIndex((point_iter->second)[j].mIndex);
 					break;
 				}
 			}
@@ -4945,14 +5439,14 @@ void LLVolumeFace::optimize(F32 angle_cutoff)
 
 		if (!found)
 		{
-			new_face.mVertices.push_back(cv);
-			U16 index = (U16) new_face.mVertices.size()-1;
-			new_face.mIndices.push_back(index);
+			new_face.pushVertex(cv);
+			U16 index = (U16) new_face.mNumVertices-1;
+			new_face.pushIndex(index);
 
 			VertexMapData d;
-			d.mPosition = cv.mPosition;
+			d.setPosition(cv.getPosition());
 			d.mTexCoord = cv.mTexCoord;
-			d.mNormal = cv.mNormal;
+			d.setNormal(cv.getNormal());
 			d.mIndex = index;
 			if (point_iter != point_map.end())
 			{
@@ -4960,13 +5454,77 @@ void LLVolumeFace::optimize(F32 angle_cutoff)
 			}
 			else
 			{
-				point_map[d.mPosition].push_back(d);
+				point_map[LLVector3(d.getPosition().getF32ptr())].push_back(d);
 			}
 		}
 	}
 
-	mVertices = new_face.mVertices;
-	mIndices = new_face.mIndices;
+	swapData(new_face);
+}
+
+
+void LLVolumeFace::createOctree()
+{ //builds mOctree with one LLVolumeTriangle per index triple of this face
+	LLVector4a center;
+	LLVector4a size;
+	center.splat(0.f); //initial root bounds: centered at the origin ...
+	size.splat(1.f); //... with unit size; presumably corrected by the rebound traversal at the end -- confirm
+
+	mOctree = new LLOctreeRoot<LLVolumeTriangle>(center, size, NULL); //any previous mOctree is not deleted here
+	new LLVolumeOctreeListener(mOctree); //NOTE(review): pointer is not kept; presumably the listener registers itself with mOctree -- confirm ownership
+
+	for (U32 i = 0; i < mNumIndices; i+= 3) //reads i, i+1 and i+2, so mNumIndices is expected to be a multiple of 3
+	{
+		LLPointer<LLVolumeTriangle> tri = new LLVolumeTriangle();
+				
+		const LLVector4a& v0 = mPositions[mIndices[i]];
+		const LLVector4a& v1 = mPositions[mIndices[i+1]];
+		const LLVector4a& v2 = mPositions[mIndices[i+2]];
+
+		tri->mV[0] = &v0; //the triangle stores pointers into mPositions, not copies of the vertices
+		tri->mV[1] = &v1;
+		tri->mV[2] = &v2;
+
+		tri->mIndex[0] = mIndices[i];
+		tri->mIndex[1] = mIndices[i+1];
+		tri->mIndex[2] = mIndices[i+2];
+
+		LLVector4a min = v0; //component-wise minimum of the three corners
+		min.setMin(min, v1);
+		min.setMin(min, v2);
+
+		LLVector4a max = v0; //component-wise maximum of the three corners
+		max.setMax(max, v1);
+		max.setMax(max, v2);
+
+		LLVector4a center; //shadows the root 'center' above: this one is the triangle's bounding box center
+		center.setAdd(min, max);
+		center.mul(0.5f);
+
+		*tri->mPositionGroup = center;
+
+		LLVector4a size; //shadows the root 'size' above: this one is the triangle's bounding box extent
+		size.setSub(max,min);
+		
+		tri->mRadius = size.getLength3().getF32() * 0.5f; //half the length of the bounding box diagonal
+		
+		mOctree->insert(tri);
+	}
+
+	LLVolumeOctreeRebound rebound(this);
+	rebound.traverse(mOctree);
+}
+
+
+void LLVolumeFace::swapData(LLVolumeFace& rhs)
+{ //exchanges vertex/index buffers and their counts with rhs; ids, masks, extents, mWeights and mOctree stay where they are
+	llswap(rhs.mPositions, mPositions); //NOTE(review): createOctree() stores pointers into mPositions -- confirm no face with an octree is ever swapped
+	llswap(rhs.mNormals, mNormals);
+	llswap(rhs.mBinormals, mBinormals);
+	llswap(rhs.mTexCoords, mTexCoords);
+	llswap(rhs.mIndices,mIndices);
+	llswap(rhs.mNumVertices, mNumVertices);
+	llswap(rhs.mNumIndices, mNumIndices);
+}
 
 void	LerpPlanarVertex(LLVolumeFace::VertexData& v0,
@@ -4976,10 +5534,21 @@ void	LerpPlanarVertex(LLVolumeFace::VertexData& v0,
 				   F32	coef01,
 				   F32	coef02)
 {
-	vout.mPosition = v0.mPosition + ((v1.mPosition-v0.mPosition)*coef01)+((v2.mPosition-v0.mPosition)*coef02);
+
+	LLVector4a lhs;
+	lhs.setSub(v1.getPosition(), v0.getPosition());
+	lhs.mul(coef01);
+	LLVector4a rhs;
+	rhs.setSub(v2.getPosition(), v0.getPosition());
+	rhs.mul(coef02);
+
+	rhs.add(lhs);
+	rhs.add(v0.getPosition());
+
+	vout.setPosition(rhs);
+		
 	vout.mTexCoord = v0.mTexCoord + ((v1.mTexCoord-v0.mTexCoord)*coef01)+((v2.mTexCoord-v0.mTexCoord)*coef02);
-	vout.mNormal = v0.mNormal;
-	vout.mBinormal = v0.mBinormal;
+	vout.setNormal(v0.getNormal());
 }
 
 BOOL LLVolumeFace::createUnCutCubeCap(LLVolume* volume, BOOL partial_build)
@@ -4999,82 +5568,113 @@ BOOL LLVolumeFace::createUnCutCubeCap(LLVolume* volume, BOOL partial_build)
 	num_vertices = (grid_size+1)*(grid_size+1);
 	num_indices = quad_count * 4;
 
-	LLVector3& min = mExtents[0];
-	LLVector3& max = mExtents[1];
+	LLVector4a& min = mExtents[0];
+	LLVector4a& max = mExtents[1];
 
 	S32 offset = 0;
 	if (mTypeMask & TOP_MASK)
+	{
 		offset = (max_t-1) * max_s;
+	}
 	else
+	{
 		offset = mBeginS;
+	}
 
-	VertexData	corners[4];
-	VertexData baseVert;
-	for(int t = 0; t < 4; t++){
-		corners[t].mPosition = mesh[offset + (grid_size*t)].mPos;
-		corners[t].mTexCoord.mV[0] = profile[grid_size*t].mV[0]+0.5f;
-		corners[t].mTexCoord.mV[1] = 0.5f - profile[grid_size*t].mV[1];
-	}
-	baseVert.mNormal = 
-		((corners[1].mPosition-corners[0].mPosition) % 
-		(corners[2].mPosition-corners[1].mPosition));
-	baseVert.mNormal.normVec();
-	if(!(mTypeMask & TOP_MASK)){
-		baseVert.mNormal *= -1.0f;
-	}else{
-		//Swap the UVs on the U(X) axis for top face
-		LLVector2 swap;
-		swap = corners[0].mTexCoord;
-		corners[0].mTexCoord=corners[3].mTexCoord;
-		corners[3].mTexCoord=swap;
-		swap = corners[1].mTexCoord;
-		corners[1].mTexCoord=corners[2].mTexCoord;
-		corners[2].mTexCoord=swap;
-	}
-	baseVert.mBinormal = calc_binormal_from_triangle( 
-		corners[0].mPosition, corners[0].mTexCoord,
-		corners[1].mPosition, corners[1].mTexCoord,
-		corners[2].mPosition, corners[2].mTexCoord);
-	for(int t = 0; t < 4; t++){
-		corners[t].mBinormal = baseVert.mBinormal;
-		corners[t].mNormal = baseVert.mNormal;
-	}
-	mHasBinormals = TRUE;
-
-	if (partial_build)
 	{
-		mVertices.clear();
-	}
+		VertexData	corners[4];
+		VertexData baseVert;
+		for(S32 t = 0; t < 4; t++)
+		{
+			corners[t].getPosition().load3( mesh[offset + (grid_size*t)].mPos.mV);
+			corners[t].mTexCoord.mV[0] = profile[grid_size*t].mV[0]+0.5f;
+			corners[t].mTexCoord.mV[1] = 0.5f - profile[grid_size*t].mV[1];
+		}
 
-	S32	vtop = mVertices.size();
-	for(int gx = 0;gx<grid_size+1;gx++){
-		for(int gy = 0;gy<grid_size+1;gy++){
-			VertexData newVert;
-			LerpPlanarVertex(
-				corners[0],
-				corners[1],
-				corners[3],
-				newVert,
-				(F32)gx/(F32)grid_size,
-				(F32)gy/(F32)grid_size);
-			mVertices.push_back(newVert);
+		{
+			LLVector4a lhs;
+			lhs.setSub(corners[1].getPosition(), corners[0].getPosition());
+			LLVector4a rhs;
+			rhs.setSub(corners[2].getPosition(), corners[1].getPosition());
+			baseVert.getNormal().setCross3(lhs, rhs); 
+			baseVert.getNormal().normalize3fast();
+		}
 
-			if (gx == 0 && gy == 0)
-			{
-				min = max = newVert.mPosition;
-			}
-			else
+		if(!(mTypeMask & TOP_MASK))
+		{
+			baseVert.getNormal().mul(-1.0f);
+		}
+		else
+		{
+			//Swap the UVs on the U(X) axis for top face
+			LLVector2 swap;
+			swap = corners[0].mTexCoord;
+			corners[0].mTexCoord=corners[3].mTexCoord;
+			corners[3].mTexCoord=swap;
+			swap = corners[1].mTexCoord;
+			corners[1].mTexCoord=corners[2].mTexCoord;
+			corners[2].mTexCoord=swap;
+		}
+
+		LLVector4a binormal;
+		
+		calc_binormal_from_triangle( binormal,
+			corners[0].getPosition(), corners[0].mTexCoord,
+			corners[1].getPosition(), corners[1].mTexCoord,
+			corners[2].getPosition(), corners[2].mTexCoord);
+		
+		binormal.normalize3fast();
+
+		S32 size = (grid_size+1)*(grid_size+1);
+		resizeVertices(size);
+		allocateBinormals(size);
+
+		LLVector4a* pos = (LLVector4a*) mPositions;
+		LLVector4a* norm = (LLVector4a*) mNormals;
+		LLVector4a* binorm = (LLVector4a*) mBinormals;
+		LLVector2* tc = (LLVector2*) mTexCoords;
+
+		for(int gx = 0;gx<grid_size+1;gx++)
+		{
+			for(int gy = 0;gy<grid_size+1;gy++)
 			{
-				update_min_max(min,max,newVert.mPosition);
+				VertexData newVert;
+				LerpPlanarVertex(
+					corners[0],
+					corners[1],
+					corners[3],
+					newVert,
+					(F32)gx/(F32)grid_size,
+					(F32)gy/(F32)grid_size);
+
+				*pos++ = newVert.getPosition();
+				*norm++ = baseVert.getNormal();
+				*tc++ = newVert.mTexCoord;
+				*binorm++ = binormal;
+
+				if (gx == 0 && gy == 0)
+				{
+					min = newVert.getPosition();
+					max = min;
+				}
+				else
+				{
+					min.setMin(min, newVert.getPosition());
+					max.setMax(max, newVert.getPosition());
+				}
 			}
 		}
-	}
 	
-	mCenter = (min + max) * 0.5f;
+		mCenter->setAdd(min, max);
+		mCenter->mul(0.5f); 
+	}
 
 	if (!partial_build)
 	{
-		mTriStrip.clear();
+		resizeIndices(grid_size*grid_size*6);
+
+		U16* out = mIndices;
+
 		S32 idxs[] = {0,1,(grid_size+1)+1,(grid_size+1)+1,(grid_size+1),0};
 		for(S32 gx = 0;gx<grid_size;gx++)
 		{
@@ -5085,54 +5685,17 @@ BOOL LLVolumeFace::createUnCutCubeCap(LLVolume* volume, BOOL partial_build)
 				{
 					for(S32 i=5;i>=0;i--)
 					{
-						mIndices.push_back(vtop+(gy*(grid_size+1))+gx+idxs[i]);
-					}
-					
-					if (gy == 0)
-					{
-						mTriStrip.push_back((gx+1)*(grid_size+1));
-						mTriStrip.push_back((gx+1)*(grid_size+1));
-						mTriStrip.push_back(gx*(grid_size+1));
-					}
-
-					mTriStrip.push_back(gy+1+(gx+1)*(grid_size+1));
-					mTriStrip.push_back(gy+1+gx*(grid_size+1));
-					
-					
-					if (gy == grid_size-1)
-					{
-						mTriStrip.push_back(gy+1+gx*(grid_size+1));
-					}
+						*out++ = ((gy*(grid_size+1))+gx+idxs[i]);
+					}		
 				}
 				else
 				{
 					for(S32 i=0;i<6;i++)
 					{
-						mIndices.push_back(vtop+(gy*(grid_size+1))+gx+idxs[i]);
-					}
-
-					if (gy == 0)
-					{
-						mTriStrip.push_back(gx*(grid_size+1));
-						mTriStrip.push_back(gx*(grid_size+1));
-						mTriStrip.push_back((gx+1)*(grid_size+1));
-					}
-
-					mTriStrip.push_back(gy+1+gx*(grid_size+1));
-					mTriStrip.push_back(gy+1+(gx+1)*(grid_size+1));
-					
-					if (gy == grid_size-1)
-					{
-						mTriStrip.push_back(gy+1+(gx+1)*(grid_size+1));
+						*out++ = ((gy*(grid_size+1))+gx+idxs[i]);
 					}
 				}
-			}
-			
-		}
-
-		if (mTriStrip.size()%2 == 1)
-		{
-			mTriStrip.push_back(mTriStrip[mTriStrip.size()-1]);
+			}	
 		}
 	}
 		
@@ -5163,17 +5726,31 @@ BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 	num_vertices = profile.size();
 	num_indices = (profile.size() - 2)*3;
 
-	mVertices.resize(num_vertices);
+	if (!(mTypeMask & HOLLOW_MASK) && !(mTypeMask & OPEN_MASK))
+	{
+		resizeVertices(num_vertices+1);
+		allocateBinormals(num_vertices+1);	
 
-	if (!partial_build)
+		if (!partial_build)
+		{
+			resizeIndices(num_indices+3);
+		}
+	}
+	else
 	{
-		mIndices.resize(num_indices);
+		resizeVertices(num_vertices);
+		allocateBinormals(num_vertices);
+
+		if (!partial_build)
+		{
+			resizeIndices(num_indices);
+		}
 	}
 
 	S32 max_s = volume->getProfile().getTotal();
 	S32 max_t = volume->getPath().mPath.size();
 
-	mCenter.clearVec();
+	mCenter->clear();
 
 	S32 offset = 0;
 	if (mTypeMask & TOP_MASK)
@@ -5191,82 +5768,91 @@ BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 	LLVector2 cuv;
 	LLVector2 min_uv, max_uv;
 
-	LLVector3& min = mExtents[0];
-	LLVector3& max = mExtents[1];
+	LLVector4a& min = mExtents[0];
+	LLVector4a& max = mExtents[1];
+
+	LLVector2* tc = (LLVector2*) mTexCoords;
+	LLVector4a* pos = (LLVector4a*) mPositions;
+	LLVector4a* norm = (LLVector4a*) mNormals;
+	LLVector4a* binorm = (LLVector4a*) mBinormals;
 
 	// Copy the vertices into the array
 	for (S32 i = 0; i < num_vertices; i++)
 	{
 		if (mTypeMask & TOP_MASK)
 		{
-			mVertices[i].mTexCoord.mV[0] = profile[i].mV[0]+0.5f;
-			mVertices[i].mTexCoord.mV[1] = profile[i].mV[1]+0.5f;
+			tc[i].mV[0] = profile[i].mV[0]+0.5f;
+			tc[i].mV[1] = profile[i].mV[1]+0.5f;
 		}
 		else
 		{
 			// Mirror for underside.
-			mVertices[i].mTexCoord.mV[0] = profile[i].mV[0]+0.5f;
-			mVertices[i].mTexCoord.mV[1] = 0.5f - profile[i].mV[1];
+			tc[i].mV[0] = profile[i].mV[0]+0.5f;
+			tc[i].mV[1] = 0.5f - profile[i].mV[1];
 		}
 
-		mVertices[i].mPosition = mesh[i + offset].mPos;
+		pos[i].load3(mesh[i + offset].mPos.mV);
 		
 		if (i == 0)
 		{
-			min = max = mVertices[i].mPosition;
-			min_uv = max_uv = mVertices[i].mTexCoord;
+			max = pos[i];
+			min = max;
+			min_uv = max_uv = tc[i];
 		}
 		else
 		{
-			update_min_max(min,max, mVertices[i].mPosition);
-			update_min_max(min_uv, max_uv, mVertices[i].mTexCoord);
+			update_min_max(min,max,pos[i]);
+			update_min_max(min_uv, max_uv, tc[i]);
 		}
 	}
 
-	mCenter = (min+max)*0.5f;
+	mCenter->setAdd(min, max);
+	mCenter->mul(0.5f); 
+
 	cuv = (min_uv + max_uv)*0.5f;
 
-	LLVector3 binormal = calc_binormal_from_triangle( 
-		mCenter, cuv,
-		mVertices[0].mPosition, mVertices[0].mTexCoord,
-		mVertices[1].mPosition, mVertices[1].mTexCoord);
-	binormal.normVec();
+	LLVector4a binormal;
+	calc_binormal_from_triangle(binormal,
+		*mCenter, cuv,
+		pos[0], tc[0],
+		pos[1], tc[1]);
+	binormal.normalize3fast();
 
-	LLVector3 d0;
-	LLVector3 d1;
-	LLVector3 normal;
+	LLVector4a normal;
+	LLVector4a d0, d1;
+	
 
-	d0 = mCenter-mVertices[0].mPosition;
-	d1 = mCenter-mVertices[1].mPosition;
+	d0.setSub(*mCenter, pos[0]);
+	d1.setSub(*mCenter, pos[1]);
 
-	normal = (mTypeMask & TOP_MASK) ? (d0%d1) : (d1%d0);
-	normal.normVec();
+	if (mTypeMask & TOP_MASK)
+	{
+		normal.setCross3(d0, d1);
+	}
+	else
+	{
+		normal.setCross3(d1, d0);
+	}
+
+	normal.normalize3fast();
 
 	VertexData vd;
-	vd.mPosition = mCenter;
-	vd.mNormal = normal;
-	vd.mBinormal = binormal;
+	vd.setPosition(*mCenter);
 	vd.mTexCoord = cuv;
 	
 	if (!(mTypeMask & HOLLOW_MASK) && !(mTypeMask & OPEN_MASK))
 	{
-		mVertices.push_back(vd);
+		pos[num_vertices] = *mCenter;
+		tc[num_vertices] = cuv;
 		num_vertices++;
-		if (!partial_build)
-		{
-			vector_append(mIndices, 3);
-		}
 	}
 		
-	
 	for (S32 i = 0; i < num_vertices; i++)
 	{
-		mVertices[i].mBinormal = binormal;
-		mVertices[i].mNormal = normal;
+		binorm[i].load4a(binormal.getF32ptr());
+		norm[i].load4a(normal.getF32ptr());
 	}
 
-	mHasBinormals = TRUE;
-
 	if (partial_build)
 	{
 		return TRUE;
@@ -5374,8 +5960,6 @@ BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 					pt2--;
 				}
 			}
-
-			makeTriStrip();
 		}
 		else
 		{
@@ -5480,8 +6064,6 @@ BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 					pt2--;
 				}
 			}
-
-			makeTriStrip();
 		}
 	}
 	else
@@ -5503,163 +6085,320 @@ BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 			mIndices[3*i+v2] = i + 1;
 		}
 
-		//make tri strip
-		if (mTypeMask & OPEN_MASK)
-		{
-			makeTriStrip();
-		}
-		else
-		{
-			S32 j = num_vertices-2;
-			if (mTypeMask & TOP_MASK)
-			{
-				mTriStrip.push_back(0);
-				for (S32 i = 0; i <= j; ++i)
-				{
-					mTriStrip.push_back(i);
-					if (i != j)
-					{
-						mTriStrip.push_back(j);
-					}
-					--j;
-				}
-			}
-			else
-			{
-				mTriStrip.push_back(j);
-				for (S32 i = 0; i <= j; ++i)
-				{
-					if (i != j)
-					{
-						mTriStrip.push_back(j);
-					}
-					mTriStrip.push_back(i);
-					--j;
-				}
-			}
-			
-			mTriStrip.push_back(mTriStrip[mTriStrip.size()-1]);
 
-			if (mTriStrip.size()%2 == 1)
-			{
-				mTriStrip.push_back(mTriStrip[mTriStrip.size()-1]);
-			}
-		}
 	}
 		
 	return TRUE;
 }
 
-void LLVolumeFace::makeTriStrip()
-{
-	for (U32 i = 0; i < mIndices.size(); i+=3)
-	{
-		U16 i0 = mIndices[i];
-		U16 i1 = mIndices[i+1];
-		U16 i2 = mIndices[i+2];
-
-		if ((i/3)%2 == 1)
-		{
-			mTriStrip.push_back(i0);
-			mTriStrip.push_back(i0);
-			mTriStrip.push_back(i1);
-			mTriStrip.push_back(i2);
-			mTriStrip.push_back(i2);
-		}
-		else
-		{
-			mTriStrip.push_back(i2);
-			mTriStrip.push_back(i2);
-			mTriStrip.push_back(i1);
-			mTriStrip.push_back(i0);
-			mTriStrip.push_back(i0);
-		}
-	}
-
-	if (mTriStrip.size()%2 == 1)
-	{
-		mTriStrip.push_back(mTriStrip[mTriStrip.size()-1]);
-	}
-}
-
 void LLVolumeFace::createBinormals()
 {
 	LLMemType m1(LLMemType::MTYPE_VOLUME);
 	
-	if (!mHasBinormals)
+	if (!mBinormals) //binormals are generated only once; an existing buffer is taken as up to date
 	{
+		allocateBinormals(mNumVertices);
+		for (U32 v = 0; v < mNumVertices; ++v) { mBinormals[v].clear(); } //zero the sums first: allocateBinormals() hands back a buffer it never initializes
 		//generate binormals
-		for (U32 i = 0; i < mIndices.size()/3; i++) 
+		LLVector4a* pos = mPositions;
+		LLVector2* tc = (LLVector2*) mTexCoords;
+		LLVector4a* binorm = (LLVector4a*) mBinormals;
+
+		for (U32 i = 0; i < mNumIndices/3; i++) 
 		{	//for each triangle
-			const VertexData& v0 = mVertices[mIndices[i*3+0]];
-			const VertexData& v1 = mVertices[mIndices[i*3+1]];
-			const VertexData& v2 = mVertices[mIndices[i*3+2]];
+			const U16& i0 = mIndices[i*3+0];
+			const U16& i1 = mIndices[i*3+1];
+			const U16& i2 = mIndices[i*3+2];
 						
 			//calculate binormal
-			LLVector3 binorm = calc_binormal_from_triangle(v0.mPosition, v0.mTexCoord,
-															v1.mPosition, v1.mTexCoord,
-															v2.mPosition, v2.mTexCoord);
+			LLVector4a binormal;
+			calc_binormal_from_triangle(binormal,
+										pos[i0], tc[i0],
+										pos[i1], tc[i1],
+										pos[i2], tc[i2]);
 
-			for (U32 j = 0; j < 3; j++) 
-			{ //add triangle normal to vertices
-				mVertices[mIndices[i*3+j]].mBinormal += binorm; // * (weight_sum - d[j])/weight_sum;
-			}
+
+			//add triangle normal to vertices
+			binorm[i0].add(binormal); //accumulates onto the per-vertex sums zeroed above
+			binorm[i1].add(binormal);
+			binorm[i2].add(binormal);
 
 			//even out quad contributions
 			if (i % 2 == 0) 
 			{
+				binorm[i2].add(binormal);
 			}
 			else 
 			{
+				binorm[i1].add(binormal);
 			}
 		}
 
 		//normalize binormals
+		for (U32 i = 0; i < mNumVertices; i++) 
+		{
+			binorm[i].normalize3fast();
+			//bump map/planar projection code requires normals to be normalized
+			mNormals[i].normalize3fast();
+		}
+	}
+}
+
+void LLVolumeFace::resizeVertices(S32 num_verts)
+{ //throws away the current vertex data and allocates buffers for num_verts vertices; old contents are not carried over
+	ll_aligned_free_16(mPositions);
+	ll_aligned_free_16(mNormals);
+	ll_aligned_free_16(mBinormals);
+	ll_aligned_free_16(mTexCoords);
+
+	mBinormals = NULL; //binormals are not reallocated here; see allocateBinormals()
+
+	if (num_verts)
+	{
+		mPositions = (LLVector4a*) ll_aligned_malloc_16(num_verts*16); //16 bytes per position; contents are not initialized here
+		mNormals = (LLVector4a*) ll_aligned_malloc_16(num_verts*16);
+
+		//pad texture coordinate block end to allow for QWORD reads
+		S32 size = ((num_verts*8) + 0xF) & ~0xF; //8 bytes per tex coord, rounded up to a multiple of 16
+		mTexCoords = (LLVector2*) ll_aligned_malloc_16(size);
+	}
+	else
+	{
+		mPositions = NULL;
+		mNormals = NULL;
+		mTexCoords = NULL;
+	}
+
+	mNumVertices = num_verts; //NOTE(review): mWeights and mOctree are left untouched by this function -- confirm callers handle them
+}
+
+void LLVolumeFace::pushVertex(const LLVolumeFace::VertexData& cv)
+{ //convenience overload: unpacks cv and forwards to the (position, normal, tex coord) overload
+	pushVertex(cv.getPosition(), cv.getNormal(), cv.mTexCoord);
+}
+
+void LLVolumeFace::pushVertex(const LLVector4a& pos, const LLVector4a& norm, const LLVector2& tc)
+{ //appends one vertex; every call reallocates the position/normal/tex coord buffers one element larger
+	S32 new_verts = mNumVertices+1;
+	S32 new_size = new_verts*16;
+	S32 old_size = mNumVertices*16;
+
+	//positions
+	LLVector4a* dst = (LLVector4a*) ll_aligned_malloc_16(new_size);
+	if (mPositions)
+	{
+		LLVector4a::memcpyNonAliased16((F32*) dst, (F32*) mPositions, old_size);
+		ll_aligned_free_16(mPositions);
+	}
+	mPositions = dst;
+
+	//normals
+	dst = (LLVector4a*) ll_aligned_malloc_16(new_size);
+	if (mNormals)
+	{
+		LLVector4a::memcpyNonAliased16((F32*) dst, (F32*) mNormals, old_size);
+		ll_aligned_free_16(mNormals);
+	}
+	mNormals = dst;
+
+	//tex coords (8 bytes each, block size rounded up to a multiple of 16)
+	new_size = ((new_verts*8)+0xF) & ~0xF;
+	old_size = ((mNumVertices*8)+0xF) & ~0xF;
+
+	//allocate exactly one new block and carry the old tex coords over into it;
+	//a second, shadowing allocation here used to receive the copy and was then
+	//leaked, while mTexCoords ended up on a block that never got the old data
+	LLVector2* tc_dst = (LLVector2*) ll_aligned_malloc_16(new_size);
+	if (mTexCoords)
+	{
+		LLVector4a::memcpyNonAliased16((F32*) tc_dst, (F32*) mTexCoords, old_size);
+		ll_aligned_free_16(mTexCoords);
+	}
+	mTexCoords = tc_dst;
+
+	//just clear binormals
+	ll_aligned_free_16(mBinormals);
+	mBinormals = NULL;
+
+	mPositions[mNumVertices] = pos; //the new vertex goes into the last slot of each grown buffer
+	mNormals[mNumVertices] = norm;
+	mTexCoords[mNumVertices] = tc;
+
+	mNumVertices++;	
+}
+
+void LLVolumeFace::allocateBinormals(S32 num_verts)
+{ //replaces mBinormals with a new buffer of num_verts 16-byte entries; the old contents are dropped
+	ll_aligned_free_16(mBinormals);
+	mBinormals = (LLVector4a*) ll_aligned_malloc_16(num_verts*16); //nothing is written into the new buffer here
+}
+
+void LLVolumeFace::allocateWeights(S32 num_verts)
+{ //replaces mWeights with a new buffer of num_verts 16-byte entries; the old contents are dropped
+	ll_aligned_free_16(mWeights);
+	mWeights = (LLVector4a*) ll_aligned_malloc_16(num_verts*16); //nothing is written into the new buffer here
+}
+
+void LLVolumeFace::resizeIndices(S32 num_indices)
+{ //throws away the current index data and allocates room for num_indices 16-bit indices
+	ll_aligned_free_16(mIndices);
+
+	if (num_indices)
+	{
+		//pad index block end to allow for QWORD reads
+		S32 size = ((num_indices*2) + 0xF) & ~0xF; //2 bytes per index, rounded up to a multiple of 16
+		
+		mIndices = (U16*) ll_aligned_malloc_16(size);	
+	}
+	else
+	{
+		mIndices = NULL;
+	}
+
+	mNumIndices = num_indices; //old index values are not preserved; the new buffer is not initialized here
+}
+
+void LLVolumeFace::pushIndex(const U16& idx)
+{ //appends one index, growing the padded index buffer only when the current 16-byte block is used up
+	S32 new_count = mNumIndices + 1;
+	S32 new_size = ((new_count*2)+0xF) & ~0xF; //padded byte size needed after the append
+
+	S32 old_size = ((mNumIndices*2)+0xF) & ~0xF; //padded byte size of the current buffer (0 when there are no indices)
+	if (new_size != old_size) //sizes differ only every 8th index, so most appends reuse the padding slack
+	{
+		U16* dst = (U16*) ll_aligned_malloc_16(new_size);
+		if (mIndices)
 		{
-			mVertices[i].mBinormal.normVec();
-			mVertices[i].mNormal.normVec();
+			LLVector4a::memcpyNonAliased16((F32*) dst, (F32*) mIndices, old_size);
+			ll_aligned_free_16(mIndices);
 		}
+		mIndices = dst;
+	}
+	
+	mIndices[mNumIndices++] = idx; //relies on the buffer having been allocated with the same 16-byte padding
+}
+
+void LLVolumeFace::fillFromLegacyData(std::vector<LLVolumeFace::VertexData>& v, std::vector<U16>& idx)
+{ //replaces this face's buffers with the contents of the vertex-struct/index vectors v and idx
+	resizeVertices(v.size()); //drops existing vertex data (and binormals) before refilling
+	resizeIndices(idx.size());
 
-		mHasBinormals = TRUE;
+	for (U32 i = 0; i < v.size(); ++i)
+	{
+		mPositions[i] = v[i].getPosition();
+		mNormals[i] = v[i].getNormal();
+		mTexCoords[i] = v[i].mTexCoord;
+	}
+
+	for (U32 i = 0; i < idx.size(); ++i)
+	{
+		mIndices[i] = idx[i]; //indices are copied unchanged
 	}
 }
 
-void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat, LLMatrix4& norm_mat)
+void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMatrix4& norm_mat_in)
 {
-	U16 offset = mVertices.size();
+	U16 offset = mNumVertices; //appended vertices start at this slot; appended indices are shifted by it
+
+	S32 new_count = face.mNumVertices + mNumVertices;
 
-	if (face.mVertices.size() + mVertices.size() > 65536)
+	if (new_count > 65536)
 	{
 		llerrs << "Cannot append face -- 16-bit overflow will occur." << llendl;
 	}
 	
-	for (U32 i = 0; i < face.mVertices.size(); ++i)
+	if (face.mNumVertices == 0)
 	{
-		VertexData v = face.mVertices[i];
-		v.mPosition = v.mPosition*mat;
-		v.mNormal = v.mNormal * norm_mat;
+		llerrs << "Cannot append empty face." << llendl;
+	}
+
+	//allocate new buffer space
+	LLVector4a* new_pos = (LLVector4a*) ll_aligned_malloc_16(new_count*16);
+	LLVector4a* new_norm = (LLVector4a*) ll_aligned_malloc_16(new_count*16);
+	LLVector2* new_tc = (LLVector2*) ll_aligned_malloc_16((new_count*8+0xF) & ~0xF);
+	
+
+	if (mNumVertices > 0)
+	{ //copy old buffers
+		LLVector4a::memcpyNonAliased16((F32*) new_pos, (F32*) mPositions, mNumVertices*4*sizeof(F32));
+		LLVector4a::memcpyNonAliased16((F32*) new_norm, (F32*) mNormals, mNumVertices*4*sizeof(F32));
+		LLVector4a::memcpyNonAliased16((F32*) new_tc, (F32*) mTexCoords, (mNumVertices*8+0xF) & ~0xF); //padded size, as at every other tex coord copy; both blocks are allocated with this padding
+	}
+
+	//free old buffer space
+	ll_aligned_free_16(mPositions);
+	ll_aligned_free_16(mNormals);
+	ll_aligned_free_16(mTexCoords);
+
+	//point to new buffers
+	mPositions = new_pos;
+	mNormals = new_norm;
+	mTexCoords = new_tc;
+
+	mNumVertices = new_count; //NOTE(review): mBinormals/mWeights are not resized here -- confirm callers regenerate them
+
+	//get destination address of appended face
+	LLVector4a* dst_pos = mPositions+offset;
+	LLVector2* dst_tc = mTexCoords+offset;
+	LLVector4a* dst_norm = mNormals+offset;
 
-		v.mNormal.normalize();
+	//get source addresses of appended face
+	const LLVector4a* src_pos = face.mPositions;
+	const LLVector2* src_tc = face.mTexCoords;
+	const LLVector4a* src_norm = face.mNormals;
 
-		mVertices.push_back(v);
+	//load aligned matrices
+	LLMatrix4a mat, norm_mat;
+	mat.loadu(mat_in);
+	norm_mat.loadu(norm_mat_in);
+
+	for (U32 i = 0; i < face.mNumVertices; ++i)
+	{
+		//transform appended face position and store
+		mat.affineTransform(src_pos[i], dst_pos[i]);
+
+		//transform appended face normal and store
+		norm_mat.rotate(src_norm[i], dst_norm[i]);
+		dst_norm[i].normalize3fast();
+
+		//copy appended face texture coordinate
+		dst_tc[i] = src_tc[i];
 
 		if (offset == 0 && i == 0)
-		{
-			mExtents[0] = mExtents[1] = v.mPosition;
+		{ //initialize bounding box
+			mExtents[0] = mExtents[1] = dst_pos[i];
 		}
 		else
 		{
-			update_min_max(mExtents[0], mExtents[1], v.mPosition);
+			//stretch bounding box
+			update_min_max(mExtents[0], mExtents[1], dst_pos[i]);
 		}
 	}
 
-	
-	for (U32 i = 0; i < face.mIndices.size(); ++i)
-	{
-		mIndices.push_back(face.mIndices[i]+offset);
+
+	new_count = mNumIndices + face.mNumIndices;
+
+	//allocate new index buffer
+	U16* new_indices = (U16*) ll_aligned_malloc_16((new_count*2+0xF) & ~0xF);
+	if (mNumIndices > 0)
+	{ //copy old index buffer
+		S32 old_size = (mNumIndices*2+0xF) & ~0xF;
+		LLVector4a::memcpyNonAliased16((F32*) new_indices, (F32*) mIndices, old_size);
+	}
+
+	//free old index buffer
+	ll_aligned_free_16(mIndices);
+
+	//point to new index buffer
+	mIndices = new_indices;
+
+	//get destination address into new index buffer
+	U16* dst_idx = mIndices+mNumIndices;
+	mNumIndices = new_count;
+
+	for (U32 i = 0; i < face.mNumIndices; ++i)
+	{ //copy indices, offsetting by old vertex count
+		dst_idx[i] = face.mIndices[i]+offset;
 	}
 }
 
@@ -5689,28 +6428,24 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 	num_vertices = mNumS*mNumT;
 	num_indices = (mNumS-1)*(mNumT-1)*6;
 
-	mVertices.resize(num_vertices);
-
 	if (!partial_build)
 	{
-		mIndices.resize(num_indices);
+		resizeVertices(num_vertices);
+		resizeIndices(num_indices);
 
+#if LL_MESH_ENABLED
 		if ((volume->getParams().getSculptType() & LL_SCULPT_TYPE_MASK) != LL_SCULPT_TYPE_MESH)
 		{
 			mEdge.resize(num_indices);
 		}
-	}
-	else
-	{
-		mHasBinormals = FALSE;
+#else
+		mEdge.resize(num_indices);
+#endif
 	}
 
-
-	LLVector3& face_min = mExtents[0];
-	LLVector3& face_max = mExtents[1];
-
-	mCenter.clearVec();
-
+	LLVector4a* pos = (LLVector4a*) mPositions;
+	LLVector4a* norm = (LLVector4a*) mNormals;
+	LLVector2* tc = (LLVector2*) mTexCoords;
 	S32 begin_stex = llfloor( profile[mBeginS].mV[2] );
 	S32 num_s = ((mTypeMask & INNER_MASK) && (mTypeMask & FLAT_MASK) && mNumS > 2) ? mNumS/2 : mNumS;
 
@@ -5761,30 +6496,20 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 				i = mBeginS + s + max_s*t;
 			}
 
-			mVertices[cur_vertex].mPosition = mesh[i].mPos;
-			mVertices[cur_vertex].mTexCoord = LLVector2(ss,tt);
+			pos[cur_vertex].load3(mesh[i].mPos.mV);
+			tc[cur_vertex] = LLVector2(ss,tt);
 		
-			mVertices[cur_vertex].mNormal = LLVector3(0,0,0);
-			mVertices[cur_vertex].mBinormal = LLVector3(0,0,0);
-			
-			if (cur_vertex == 0)
-			{
-				face_min = face_max = mesh[i].mPos;
-			}
-			else
-			{
-				update_min_max(face_min, face_max, mesh[i].mPos);
-			}
-
+			norm[cur_vertex].clear();
 			cur_vertex++;
 
 			if ((mTypeMask & INNER_MASK) && (mTypeMask & FLAT_MASK) && mNumS > 2 && s > 0)
 			{
-				mVertices[cur_vertex].mPosition = mesh[i].mPos;
-				mVertices[cur_vertex].mTexCoord = LLVector2(ss,tt);
+
+				pos[cur_vertex].load3(mesh[i].mPos.mV);
+				tc[cur_vertex] = LLVector2(ss,tt);
 			
-				mVertices[cur_vertex].mNormal = LLVector3(0,0,0);
-				mVertices[cur_vertex].mBinormal = LLVector3(0,0,0);
+				norm[cur_vertex].clear();
+				
 				cur_vertex++;
 			}
 		}
@@ -5802,19 +6527,29 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 
 			i = mBeginS + s + max_s*t;
 			ss = profile[mBeginS + s].mV[2] - begin_stex;
-			mVertices[cur_vertex].mPosition = mesh[i].mPos;
-			mVertices[cur_vertex].mTexCoord = LLVector2(ss,tt);
-		
-			mVertices[cur_vertex].mNormal = LLVector3(0,0,0);
-			mVertices[cur_vertex].mBinormal = LLVector3(0,0,0);
-
-			update_min_max(face_min,face_max,mesh[i].mPos);
-
+			pos[cur_vertex].load3(mesh[i].mPos.mV);
+			tc[cur_vertex] = LLVector2(ss,tt);
+			norm[cur_vertex].clear(); 
+			
 			cur_vertex++;
 		}
 	}
 	
-	mCenter = (face_min + face_max) * 0.5f;
+
+	//get bounding box for this side
+	LLVector4a& face_min = mExtents[0];
+	LLVector4a& face_max = mExtents[1];
+	mCenter->clear();
+
+	face_min = face_max = pos[0];
+
+	for (U32 i = 1; i < mNumVertices; ++i)
+	{
+		update_min_max(face_min, face_max, pos[i]);
+	}
+
+	mCenter->setAdd(face_min, face_max);
+	mCenter->mul(0.5f);
 
 	S32 cur_index = 0;
 	S32 cur_edge = 0;
@@ -5822,14 +6557,9 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 
 	if (!partial_build)
 	{
-		mTriStrip.clear();
-
 		// Now we generate the indices.
 		for (t = 0; t < (mNumT-1); t++)
 		{
-			//prepend terminating index to strip
-			mTriStrip.push_back(mNumS*t);
-
 			for (s = 0; s < (mNumS-1); s++)
 			{	
 				mIndices[cur_index++] = s   + mNumS*t;			//bottom left
@@ -5839,14 +6569,6 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 				mIndices[cur_index++] = s+1 + mNumS*t;			//bottom right
 				mIndices[cur_index++] = s+1 + mNumS*(t+1);		//top right
 
-				if (s == 0)
-				{
-					mTriStrip.push_back(s+mNumS*t);
-					mTriStrip.push_back(s+mNumS*(t+1));
-				}
-				mTriStrip.push_back(s+1+mNumS*t);
-				mTriStrip.push_back(s+1+mNumS*(t+1));
-				
 				mEdge[cur_edge++] = (mNumS-1)*2*t+s*2+1;						//bottom left/top right neighbor face 
 				if (t < mNumT-2) {												//top right/top left neighbor face 
 					mEdge[cur_edge++] = (mNumS-1)*2*(t+1)+s*2+1;
@@ -5887,59 +6609,55 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 				}
 				mEdge[cur_edge++] = (mNumS-1)*2*t+s*2;							//top right/bottom left neighbor face	
 			}
-			//append terminating vertex to strip
-			mTriStrip.push_back(mNumS-1+mNumS*(t+1));
-		}
-
-		if (mTriStrip.size()%2 == 1)
-		{
-			mTriStrip.push_back(mTriStrip[mTriStrip.size()-1]);
 		}
 	}
 
 	//generate normals 
-	for (U32 i = 0; i < mIndices.size()/3; i++) //for each triangle
-	{
-		const S32 i0 = mIndices[i*3+0];
-		const S32 i1 = mIndices[i*3+1];
-		const S32 i2 = mIndices[i*3+2];
-		const VertexData& v0 = mVertices[i0];
-		const VertexData& v1 = mVertices[i1];
-		const VertexData& v2 = mVertices[i2];
-					
-		//calculate triangle normal
-		LLVector3 norm = (v0.mPosition-v1.mPosition) % (v0.mPosition-v2.mPosition);
+	for (U32 i = 0; i < mNumIndices/3; i++) //for each triangle
+	{
+		const U16* idx = &(mIndices[i*3]);
+		
 
-		for (U32 j = 0; j < 3; j++) 
-		{ //add triangle normal to vertices
-			const S32 idx = mIndices[i*3+j];
-			mVertices[idx].mNormal += norm; // * (weight_sum - d[j])/weight_sum;
-		}
+		LLVector4a* v[] = 
+		{	pos+idx[0], pos+idx[1], pos+idx[2] };
+		
+		LLVector4a* n[] = 
+		{	norm+idx[0], norm+idx[1], norm+idx[2] };
+		
+		//calculate triangle normal
+		LLVector4a a, b, c;
+		
+		a.setSub(*v[0], *v[1]);
+		b.setSub(*v[0], *v[2]);
+		c.setCross3(a,b);
 
+		n[0]->add(c);
+		n[1]->add(c);
+		n[2]->add(c);
+		
 		//even out quad contributions
-		if ((i & 1) == 0) 
-		{
-			mVertices[i2].mNormal += norm;
-		}
-		else 
-		{
-			mVertices[i1].mNormal += norm;
-		}
+		n[i%2+1]->add(c);
 	}
 	
 	// adjust normals based on wrapping and stitching
 	
-	BOOL s_bottom_converges = ((mVertices[0].mPosition - mVertices[mNumS*(mNumT-2)].mPosition).magVecSquared() < 0.000001f);
-	BOOL s_top_converges = ((mVertices[mNumS-1].mPosition - mVertices[mNumS*(mNumT-2)+mNumS-1].mPosition).magVecSquared() < 0.000001f);
+	LLVector4a top;
+	top.setSub(pos[0], pos[mNumS*(mNumT-2)]);
+	BOOL s_bottom_converges = (top.dot3(top) < 0.000001f);
+
+	top.setSub(pos[mNumS-1], pos[mNumS*(mNumT-2)+mNumS-1]);
+	BOOL s_top_converges = (top.dot3(top) < 0.000001f);
+
 	if (sculpt_stitching == LL_SCULPT_TYPE_NONE)  // logic for non-sculpt volumes
 	{
 		if (volume->getPath().isOpen() == FALSE)
 		{ //wrap normals on T
 			for (S32 i = 0; i < mNumS; i++)
 			{
-				LLVector3 norm = mVertices[i].mNormal + mVertices[mNumS*(mNumT-1)+i].mNormal;
-				mVertices[i].mNormal = norm;
-				mVertices[mNumS*(mNumT-1)+i].mNormal = norm;
+				LLVector4a n;
+				n.setAdd(norm[i], norm[mNumS*(mNumT-1)+i]);
+				norm[i] = n;
+				norm[mNumS*(mNumT-1)+i] = n;
 			}
 		}
 
@@ -5947,9 +6665,10 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 		{ //wrap normals on S
 			for (S32 i = 0; i < mNumT; i++)
 			{
-				LLVector3 norm = mVertices[mNumS*i].mNormal + mVertices[mNumS*i+mNumS-1].mNormal;
-				mVertices[mNumS * i].mNormal = norm;
-				mVertices[mNumS * i+mNumS-1].mNormal = norm;
+				LLVector4a n;
+				n.setAdd(norm[mNumS*i], norm[mNumS*i+mNumS-1]);
+				norm[mNumS * i] = n;
+				norm[mNumS * i+mNumS-1] = n;
 			}
 		}
 	
@@ -5960,7 +6679,7 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 			{ //all lower S have same normal
 				for (S32 i = 0; i < mNumT; i++)
 				{
-					mVertices[mNumS*i].mNormal = LLVector3(1,0,0);
+					norm[mNumS*i].set(1,0,0);
 				}
 			}
 
@@ -5968,12 +6687,11 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 			{ //all upper S have same normal
 				for (S32 i = 0; i < mNumT; i++)
 				{
-					mVertices[mNumS*i+mNumS-1].mNormal = LLVector3(-1,0,0);
+					norm[mNumS*i+mNumS-1].set(-1,0,0);
 				}
 			}
 		}
 	}
-	
 	else  // logic for sculpt volumes
 	{
 		BOOL average_poles = FALSE;
@@ -5996,30 +6714,33 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 		{
 			// average normals for north pole
 		
-			LLVector3 average(0.0, 0.0, 0.0);
+			LLVector4a average;
+			average.clear();
+
 			for (S32 i = 0; i < mNumS; i++)
 			{
-				average += mVertices[i].mNormal;
+				average.add(norm[i]);
 			}
 
 			// set average
 			for (S32 i = 0; i < mNumS; i++)
 			{
-				mVertices[i].mNormal = average;
+				norm[i] = average;
 			}
 
 			// average normals for south pole
 		
-			average = LLVector3(0.0, 0.0, 0.0);
+			average.clear();
+
 			for (S32 i = 0; i < mNumS; i++)
 			{
-				average += mVertices[i + mNumS * (mNumT - 1)].mNormal;
+				average.add(norm[i + mNumS * (mNumT - 1)]);
 			}
 
 			// set average
 			for (S32 i = 0; i < mNumS; i++)
 			{
-				mVertices[i + mNumS * (mNumT - 1)].mNormal = average;
+				norm[i + mNumS * (mNumT - 1)] = average;
 			}
 
 		}
@@ -6029,23 +6750,22 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 		{
 			for (S32 i = 0; i < mNumT; i++)
 			{
-				LLVector3 norm = mVertices[mNumS*i].mNormal + mVertices[mNumS*i+mNumS-1].mNormal;
-				mVertices[mNumS * i].mNormal = norm;
-				mVertices[mNumS * i+mNumS-1].mNormal = norm;
+				LLVector4a n;
+				n.setAdd(norm[mNumS*i], norm[mNumS*i+mNumS-1]);
+				norm[mNumS * i] = n;
+				norm[mNumS * i+mNumS-1] = n;
 			}
 		}
 
-
-		
 		if (wrap_t)
 		{
 			for (S32 i = 0; i < mNumS; i++)
 			{
-				LLVector3 norm = mVertices[i].mNormal + mVertices[mNumS*(mNumT-1)+i].mNormal;
-				mVertices[i].mNormal = norm;
-				mVertices[mNumS*(mNumT-1)+i].mNormal = norm;
+				LLVector4a n;
+				n.setAdd(norm[i], norm[mNumS*(mNumT-1)+i]);
+				norm[i] = n;
+				norm[mNumS*(mNumT-1)+i] = n;
 			}
-			
 		}
 
 	}
@@ -6055,41 +6775,51 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 
 // Finds binormal based on three vertices with texture coordinates.
 // Fills in dummy values if the triangle has degenerate texture coordinates.
-LLVector3 calc_binormal_from_triangle( 
-	const LLVector3& pos0,
+void calc_binormal_from_triangle(LLVector4a& binormal, // result is written here instead of being returned
+	// only the VX/VY/VZ components of the three positions are read below
+	const LLVector4a& pos0,
 	const LLVector2& tex0,
-	const LLVector3& pos1,
+	const LLVector4a& pos1,
 	const LLVector2& tex1,
-	const LLVector3& pos2,
+	const LLVector4a& pos2,
 	const LLVector2& tex2)
 {
-	LLVector3 rx0( pos0.mV[VX], tex0.mV[VX], tex0.mV[VY] );
-	LLVector3 rx1( pos1.mV[VX], tex1.mV[VX], tex1.mV[VY] );
-	LLVector3 rx2( pos2.mV[VX], tex2.mV[VX], tex2.mV[VY] );
+	LLVector4a rx0( pos0[VX], tex0.mV[VX], tex0.mV[VY] ); // (x, u, v) of each corner
+	LLVector4a rx1( pos1[VX], tex1.mV[VX], tex1.mV[VY] );
+	LLVector4a rx2( pos2[VX], tex2.mV[VX], tex2.mV[VY] );
 	
-	LLVector3 ry0( pos0.mV[VY], tex0.mV[VX], tex0.mV[VY] );
-	LLVector3 ry1( pos1.mV[VY], tex1.mV[VX], tex1.mV[VY] );
-	LLVector3 ry2( pos2.mV[VY], tex2.mV[VX], tex2.mV[VY] );
+	LLVector4a ry0( pos0[VY], tex0.mV[VX], tex0.mV[VY] ); // (y, u, v) of each corner
+	LLVector4a ry1( pos1[VY], tex1.mV[VX], tex1.mV[VY] );
+	LLVector4a ry2( pos2[VY], tex2.mV[VX], tex2.mV[VY] );
 
-	LLVector3 rz0( pos0.mV[VZ], tex0.mV[VX], tex0.mV[VY] );
-	LLVector3 rz1( pos1.mV[VZ], tex1.mV[VX], tex1.mV[VY] );
-	LLVector3 rz2( pos2.mV[VZ], tex2.mV[VX], tex2.mV[VY] );
+	LLVector4a rz0( pos0[VZ], tex0.mV[VX], tex0.mV[VY] ); // (z, u, v) of each corner
+	LLVector4a rz1( pos1[VZ], tex1.mV[VX], tex1.mV[VY] );
+	LLVector4a rz2( pos2[VZ], tex2.mV[VX], tex2.mV[VY] );
 	
-	LLVector3 r0 = (rx0 - rx1) % (rx0 - rx2);
-	LLVector3 r1 = (ry0 - ry1) % (ry0 - ry2);
-	LLVector3 r2 = (rz0 - rz1) % (rz0 - rz2);
+	LLVector4a lhs, rhs; // scratch edge vectors, reused for each of the three axes
+
+	LLVector4a r0; 
+	lhs.setSub(rx0, rx1); rhs.setSub(rx0, rx2);
+	r0.setCross3(lhs, rhs); // cross product of the two (x, u, v) edges
+		
+	LLVector4a r1;
+	lhs.setSub(ry0, ry1); rhs.setSub(ry0, ry2);
+	r1.setCross3(lhs, rhs); // same for the (y, u, v) edges
+
+	LLVector4a r2;
+	lhs.setSub(rz0, rz1); rhs.setSub(rz0, rz2);
+	r2.setCross3(lhs, rhs); // same for the (z, u, v) edges
 
-	if( r0.mV[VX] && r1.mV[VX] && r2.mV[VX] )
+	if( r0[VX] && r1[VX] && r2[VX] ) // every divisor used below must be non-zero
 	{
-		LLVector3 binormal(
-				-r0.mV[VZ] / r0.mV[VX],
-				-r1.mV[VZ] / r1.mV[VX],
-				-r2.mV[VZ] / r2.mV[VX]);
+		binormal.set( // one component per axis: -cross[VZ] / cross[VX]
+				-r0[VZ] / r0[VX],
+				-r1[VZ] / r1[VX],
+				-r2[VZ] / r2[VX]);
 		// binormal.normVec();
-		return binormal;
 	}
 	else
 	{
-		return LLVector3( 0, 1 , 0 );
+		binormal.set( 0, 1 , 0 ); // dummy value for the degenerate case (a zero divisor above)
 	}
 }
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index c6a156ae37..af28337f57 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -40,8 +40,15 @@ class LLPathParams;
 class LLVolumeParams;
 class LLProfile;
 class LLPath;
+
+#define LL_MESH_ENABLED 1
+
+template <class T> class LLOctreeNode;
+
+class LLVector4a;
 class LLVolumeFace;
 class LLVolume;
+class LLVolumeTriangle;
 
 #include "lldarray.h"
 #include "lluuid.h"
@@ -49,6 +56,7 @@ class LLVolume;
 //#include "vmath.h"
 #include "v2math.h"
 #include "v3math.h"
+#include "v3dmath.h"
 #include "v4math.h"
 #include "llquaternion.h"
 #include "llstrider.h"
@@ -184,10 +192,15 @@ const U8 LL_SCULPT_TYPE_SPHERE    = 1;
 const U8 LL_SCULPT_TYPE_TORUS     = 2;
 const U8 LL_SCULPT_TYPE_PLANE     = 3;
 const U8 LL_SCULPT_TYPE_CYLINDER  = 4;
+#if LL_MESH_ENABLED
 const U8 LL_SCULPT_TYPE_MESH      = 5;
 
 const U8 LL_SCULPT_TYPE_MASK      = LL_SCULPT_TYPE_SPHERE | LL_SCULPT_TYPE_TORUS | LL_SCULPT_TYPE_PLANE |
 	LL_SCULPT_TYPE_CYLINDER | LL_SCULPT_TYPE_MESH;
+#else
+const U8 LL_SCULPT_TYPE_MASK      = LL_SCULPT_TYPE_SPHERE | LL_SCULPT_TYPE_TORUS | LL_SCULPT_TYPE_PLANE |
+	LL_SCULPT_TYPE_CYLINDER;
+#endif
 
 const U8 LL_SCULPT_FLAG_INVERT    = 64;
 const U8 LL_SCULPT_FLAG_MIRROR    = 128;
@@ -791,69 +804,86 @@ public:
 class LLVolumeFace
 {
 public:
-	LLVolumeFace() : 
-		mID(0),
-		mTypeMask(0),
-		mHasBinormals(FALSE),
-		mBeginS(0),
-		mBeginT(0),
-		mNumS(0),
-		mNumT(0)
-	{
-	}
-
-	BOOL create(LLVolume* volume, BOOL partial_build = FALSE);
-	void createBinormals();
-	void makeTriStrip();
-	
-	void appendFace(const LLVolumeFace& face, LLMatrix4& transform, LLMatrix4& normal_tranform);
-
 	class VertexData
 	{
+		enum 
+		{
+			POSITION = 0,
+			NORMAL = 1
+		};
+
+	private:
+		void init();
 	public:
-		LLVector3 mPosition;
-		LLVector3 mNormal;
-		LLVector3 mBinormal;
+		VertexData();
+		VertexData(const VertexData& rhs);
+		const VertexData& operator=(const VertexData& rhs);
+
+		~VertexData();
+		LLVector4a& getPosition();
+		LLVector4a& getNormal();
+		const LLVector4a& getPosition() const;
+		const LLVector4a& getNormal() const;
+		void setPosition(const LLVector4a& pos);
+		void setNormal(const LLVector4a& norm);
+		
+
 		LLVector2 mTexCoord;
 
 		bool operator<(const VertexData& rhs) const;
 		bool operator==(const VertexData& rhs) const;
 		bool compareNormal(const VertexData& rhs, F32 angle_cutoff) const;
+
+	private:
+		LLVector4a* mData;
 	};
 
+	LLVolumeFace();
+	LLVolumeFace(const LLVolumeFace& src);
+	LLVolumeFace& operator=(const LLVolumeFace& rhs);
+
+	~LLVolumeFace();
+private:
+	void freeData();
+public:
+
+	BOOL create(LLVolume* volume, BOOL partial_build = FALSE);
+	void createBinormals();
+	
+	void appendFace(const LLVolumeFace& face, LLMatrix4& transform, LLMatrix4& normal_tranform);
+
+	void resizeVertices(S32 num_verts);
+	void allocateBinormals(S32 num_verts);
+	void allocateWeights(S32 num_verts);
+	void resizeIndices(S32 num_indices);
+	void fillFromLegacyData(std::vector<LLVolumeFace::VertexData>& v, std::vector<U16>& idx);
+
+	void pushVertex(const VertexData& cv);
+	void pushVertex(const LLVector4a& pos, const LLVector4a& norm, const LLVector2& tc);
+	void pushIndex(const U16& idx);
+
+	void swapData(LLVolumeFace& rhs);
+
+	void getVertexData(U16 indx, LLVolumeFace::VertexData& cv);
+
 	class VertexMapData : public LLVolumeFace::VertexData
 	{
 	public:
 		U16 mIndex;
 
-		bool operator==(const LLVolumeFace::VertexData& rhs) const
-		{
-			return mPosition == rhs.mPosition &&
-				mTexCoord == rhs.mTexCoord &&
-				mNormal == rhs.mNormal;
-		}
+		bool operator==(const LLVolumeFace::VertexData& rhs) const;
 
 		struct ComparePosition
 		{
-			bool operator()(const LLVector3& a, const LLVector3& b) const
-			{
-				if (a.mV[0] != b.mV[0])
-				{
-					return a.mV[0] < b.mV[0];
-				}
-				if (a.mV[1] != b.mV[1])
-				{
-					return a.mV[1] < b.mV[1];
-				}
-				return a.mV[2] < b.mV[2];
-			}
+			bool operator()(const LLVector3& a, const LLVector3& b) const;
 		};
 
 		typedef std::map<LLVector3, std::vector<VertexMapData>, VertexMapData::ComparePosition > PointMap;
 	};
 
 	void optimize(F32 angle_cutoff = 2.f);
-	
+	void createOctree();
+
 	enum
 	{
 		SINGLE_MASK =	0x0001,
@@ -872,26 +902,33 @@ public:
 public:
 	S32 mID;
 	U32 mTypeMask;
-	LLVector3 mCenter;
-	BOOL mHasBinormals;
-
+	
 	// Only used for INNER/OUTER faces
 	S32 mBeginS;
 	S32 mBeginT;
 	S32 mNumS;
 	S32 mNumT;
 
-	LLVector3 mExtents[2]; //minimum and maximum point of face
+	LLVector4a* mExtents; //minimum and maximum point of face
+	LLVector4a* mCenter;
+
+	S32 mNumVertices;
+	S32 mNumIndices;
+
+	LLVector4a* mPositions;
+	LLVector4a* mNormals;
+	LLVector4a* mBinormals;
+	LLVector2* mTexCoords;
+	U16* mIndices;
 
-	std::vector<VertexData> mVertices;
-	std::vector<U16>	mIndices;
-	std::vector<U16>	mTriStrip;
 	std::vector<S32>	mEdge;
 
 	//list of skin weights for rigged volumes
 	// format is mWeights[vertex_index].mV[influence] = <joint_index>.<weight>
 	// mWeights.size() should be empty or match mVertices.size()  
-	std::vector<LLVector4> mWeights;
+	LLVector4a* mWeights;
+
+	LLOctreeNode<LLVolumeTriangle>* mOctree;
 
 private:
 	BOOL createUnCutCubeCap(LLVolume* volume, BOOL partial_build = FALSE);
@@ -974,6 +1011,13 @@ public:
 							 LLVector3* normal = NULL,               // return the surface normal at the intersection point
 							 LLVector3* bi_normal = NULL             // return the surface bi-normal at the intersection point
 		);
+
+	S32 lineSegmentIntersect(const LLVector4a& start, const LLVector4a& end, 
+								   S32 face = 1,
+								   LLVector3* intersection = NULL,
+								   LLVector2* tex_coord = NULL,
+								   LLVector3* normal = NULL,
+								   LLVector3* bi_normal = NULL);
 	
 	// The following cleans up vertices and triangles,
 	// getting rid of degenerate triangles and duplicate vertices,
@@ -1038,17 +1082,26 @@ public:
 
 std::ostream& operator<<(std::ostream &s, const LLVolumeParams &volume_params);
 
-LLVector3 calc_binormal_from_triangle(
-		const LLVector3& pos0,
+void calc_binormal_from_triangle(
+		LLVector4a& binormal,
+		const LLVector4a& pos0,
 		const LLVector2& tex0,
-		const LLVector3& pos1,
+		const LLVector4a& pos1,
 		const LLVector2& tex1,
-		const LLVector3& pos2,
+		const LLVector4a& pos2,
 		const LLVector2& tex2);
 
+BOOL LLLineSegmentBoxIntersect(const F32* start, const F32* end, const F32* center, const F32* size);
 BOOL LLLineSegmentBoxIntersect(const LLVector3& start, const LLVector3& end, const LLVector3& center, const LLVector3& size);
+BOOL LLLineSegmentBoxIntersect(const LLVector4a& start, const LLVector4a& end, const LLVector4a& center, const LLVector4a& size);
+
 BOOL LLTriangleRayIntersect(const LLVector3& vert0, const LLVector3& vert1, const LLVector3& vert2, const LLVector3& orig, const LLVector3& dir,
-							F32* intersection_a, F32* intersection_b, F32* intersection_t, BOOL two_sided);
+							F32& intersection_a, F32& intersection_b, F32& intersection_t, BOOL two_sided);
+
+BOOL LLTriangleRayIntersect(const LLVector4a& vert0, const LLVector4a& vert1, const LLVector4a& vert2, const LLVector4a& orig, const LLVector4a& dir,
+							F32& intersection_a, F32& intersection_b, F32& intersection_t);
+BOOL LLTriangleRayIntersectTwoSided(const LLVector4a& vert0, const LLVector4a& vert1, const LLVector4a& vert2, const LLVector4a& orig, const LLVector4a& dir,
+							F32& intersection_a, F32& intersection_b, F32& intersection_t);
 	
 	
 
diff --git a/indra/llmath/llvolumeoctree.cpp b/indra/llmath/llvolumeoctree.cpp
new file mode 100644
index 0000000000..194b1faf81
--- /dev/null
+++ b/indra/llmath/llvolumeoctree.cpp
@@ -0,0 +1,208 @@
+/** 
+
+ * @file llvolumeoctree.cpp
+ *
+ * $LicenseInfo:firstyear=2002&license=viewergpl$
+ * 
+ * Copyright (c) 2002-2009, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include "llvolumeoctree.h"
+#include "llvector4a.h"
+// Segment/box overlap test. Returns true unless one of the six per-axis checks below finds the segment start..end separated from the box (center, size).
+BOOL LLLineSegmentBoxIntersect(const LLVector4a& start, const LLVector4a& end, const LLVector4a& center, const LLVector4a& size)
+{
+	LLVector4a fAWdU;	// |dir| per component
+	LLVector4a dir;		// half of (end - start)
+	LLVector4a diff;	// segment midpoint minus box center
+
+	dir.setSub(end, start);
+	dir.mul(0.5f);
+
+	diff.setAdd(end,start);
+	diff.mul(0.5f);
+	diff.sub(center);
+	fAWdU.setAbs(dir); 
+	// checks 1-3: separated when |diff| > size + |dir| on a coordinate axis (so size is measured outward from center)
+	LLVector4a rhs;
+	rhs.setAdd(size, fAWdU);
+
+	LLVector4a lhs;
+	lhs.setAbs(diff);
+
+	U32 grt = lhs.greaterThan(rhs).getGatheredBits();
+	// only the low three mask bits are tested; presumably the x, y, z lanes -- confirm against getGatheredBits()
+	if (grt & 0x7)
+	{
+		return false;
+	}
+	// checks 4-6: compare |dir x diff| per component against the sums of products built below
+	LLVector4a f;
+	f.setCross3(dir, diff);
+	f.setAbs(f);
+
+	LLVector4a v0, v1;
+	// with lanes 0..3 read as x,y,z,w: _MM_SHUFFLE(3,0,0,1) yields (y,x,x,w) and _MM_SHUFFLE(3,1,2,2) yields (z,z,y,w)
+	v0 = _mm_shuffle_ps(size, size,_MM_SHUFFLE(3,0,0,1));
+	v1 = _mm_shuffle_ps(fAWdU, fAWdU, _MM_SHUFFLE(3,1,2,2));
+	lhs.setMul(v0, v1);	// (size.y*|dir|.z, size.x*|dir|.z, size.x*|dir|.y)
+
+	v0 = _mm_shuffle_ps(size, size, _MM_SHUFFLE(3,1,2,2));
+	v1 = _mm_shuffle_ps(fAWdU, fAWdU, _MM_SHUFFLE(3,0,0,1));
+	rhs.setMul(v0, v1);	// (size.z*|dir|.y, size.z*|dir|.x, size.y*|dir|.x)
+	rhs.add(lhs);
+	
+	grt = f.greaterThan(rhs).getGatheredBits();
+
+	return (grt & 0x7) ? false : true;
+}
+
+// Registers this listener with 'node', then allocates one block of four LLVector4a: mBounds covers the first two, mExtents the last two.
+LLVolumeOctreeListener::LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node)
+{
+	node->addListener(this);
+	void* block = ll_aligned_malloc_16(4*sizeof(LLVector4a));	// released by the destructor through mBounds
+	mBounds = (LLVector4a*) block;
+	mExtents = &mBounds[2];
+}
+// Frees the block allocated in the constructor; mExtents points into that same block (mBounds+2) and is not freed separately.
+LLVolumeOctreeListener::~LLVolumeOctreeListener()
+{
+	ll_aligned_free_16(mBounds);
+}
+// Gives each newly added child node its own listener; 'parent' is not used.
+void LLVolumeOctreeListener::handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent, 
+	LLOctreeNode<LLVolumeTriangle>* child)
+{
+	new LLVolumeOctreeListener(child); // pointer not kept: the constructor registers it via child->addListener(). NOTE(review): presumably the node then owns it -- confirm in lloctree.h
+}
+
+// Captures the ray (start, dir), the face under test and the caller's output slots; mEnd is derived as start + dir and mHitFace starts out false.
+LLOctreeTriangleRayIntersect::LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir, 
+							   const LLVolumeFace* face, F32* closest_t,
+							   LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal)
+	: mFace(face),
+	  mStart(start),
+	  mDir(dir),
+	  mIntersection(intersection),	// the four output pointers are stored as given, NULL included
+	  mTexCoord(tex_coord),
+	  mNormal(normal),
+	  mBinormal(bi_normal),
+	  mClosestT(closest_t),			// in/out: best hit parameter so far
+	  mHitFace(false)
+{ //mEnd is not part of the initializer list; it is computed from the two inputs
+	mEnd.setAdd(start, dir);
+}
+// Depth-first walk: 'node' is processed and its children recursed into only when the segment mStart..mEnd overlaps the box held by the node's first listener.
+void LLOctreeTriangleRayIntersect::traverse(const LLOctreeNode<LLVolumeTriangle>* node)
+{
+	//listener slot 0 is cast to LLVolumeOctreeListener without a check
+	LLVolumeOctreeListener* listener = (LLVolumeOctreeListener*) node->getListener(0);
+	const LLVector4a& box_center = listener->mBounds[0];
+	const LLVector4a& box_size = listener->mBounds[1];
+
+	if (!LLLineSegmentBoxIntersect(mStart, mEnd, box_center, box_size))
+	{ //segment misses this node's bounds, so the whole subtree is skipped
+		return;
+	}
+
+	//hand this node to the traveler (presumably dispatches to visit()), then descend
+	node->accept(this);
+	for (S32 child = 0; child < node->getChildCount(); ++child)
+	{
+		traverse(node->getChild(child));
+	}
+}
+// Tests every triangle stored in 'node' against the ray mStart + t*mDir; a hit with t in [0, 1] that beats *mClosestT becomes the new best and fills the optional outputs.
+void LLOctreeTriangleRayIntersect::visit(const LLOctreeNode<LLVolumeTriangle>* node)
+{
+	typedef LLOctreeNode<LLVolumeTriangle>::const_element_iter tri_iter;
+	for (tri_iter it = node->getData().begin(); it != node->getData().end(); ++it)
+	{
+		const LLVolumeTriangle* tri = *it;
+		//a and b weight corners 1 and 2 in the interpolation below, corner 0 gets the remainder
+		F32 a, b, t;
+		if (!LLTriangleRayIntersect(*tri->mV[0], *tri->mV[1], *tri->mV[2], mStart, mDir, a, b, t))
+		{
+			continue;
+		}
+
+		const bool closer_hit = (t >= 0.f) &&		// hit is after start
+								(t <= 1.f) &&		// and before end
+								(t < *mClosestT);	// and nearer than the best so far
+		if (!closer_hit)
+		{
+			continue;
+		}
+
+		*mClosestT = t;
+		mHitFace = true;
+
+		const U16* idx = tri->mIndex;
+
+		if (mIntersection != NULL)
+		{ //hit point = mStart + mDir * t
+			LLVector4a hit = mDir;
+			hit.mul(*mClosestT);
+			hit.add(mStart);
+			mIntersection->set(hit.getF32ptr());
+		}
+
+		if (mTexCoord != NULL)
+		{
+			LLVector2* tc = (LLVector2*) mFace->mTexCoords;
+			*mTexCoord = ((1.f - a - b) * tc[idx[0]] + a * tc[idx[1]] + b * tc[idx[2]]);
+		}
+
+		if (mNormal != NULL)
+		{ //the face stores LLVector4a normals; they are read here through an LLVector4 pointer
+			LLVector4* norm = (LLVector4*) mFace->mNormals;
+			*mNormal = ((1.f - a - b) * LLVector3(norm[idx[0]]) +
+				a * LLVector3(norm[idx[1]]) +
+				b * LLVector3(norm[idx[2]]));
+		}
+
+		if (mBinormal != NULL)
+		{ //mFace->mBinormals is dereferenced without a NULL check
+			LLVector4* binorm = (LLVector4*) mFace->mBinormals;
+			*mBinormal = ((1.f - a - b) * LLVector3(binorm[idx[0]]) +
+				a * LLVector3(binorm[idx[1]]) +
+				b * LLVector3(binorm[idx[2]]));
+		}
+	}
+}
+// Returns the vector stored in mPositionGroup. NOTE(review): presumably the position the octree uses to place this triangle -- confirm in lloctree.h
+const LLVector4a& LLVolumeTriangle::getPositionGroup() const
+{
+	return *mPositionGroup;
+}
+// Returns mRadius by reference. NOTE(review): presumably the radius the octree uses when binning this triangle -- confirm in lloctree.h
+const F32& LLVolumeTriangle::getBinRadius() const
+{
+	return mRadius;
+}
+
+
diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h
new file mode 100644
index 0000000000..0031626498
--- /dev/null
+++ b/indra/llmath/llvolumeoctree.h
@@ -0,0 +1,138 @@
+/** 
+ * @file llvolumeoctree.h
+ * @brief LLVolume octree classes.
+ *
+ * $LicenseInfo:firstyear=2002&license=viewergpl$
+ * 
+ * Copyright (c) 2002-2009, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_LLVOLUME_OCTREE_H
+#define LL_LLVOLUME_OCTREE_H
+
+#include "linden_common.h"
+#include "llmemory.h"
+
+#include "lloctree.h"
+#include "llvolume.h"
+#include "llvector4a.h"
+// Octree listener for LLVolumeTriangle octrees that carries per-node bounds; the constructor allocates the storage and the destructor frees it.
+class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
+{
+public:
+	// registers this listener with 'node' and allocates mBounds/mExtents
+	LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);
+	~LLVolumeOctreeListener();
+	
+	// Copying is not supported: this only forwards to operator=, which raises an llerrs error.
+	LLVolumeOctreeListener(const LLVolumeOctreeListener& rhs)
+		: mBounds(NULL), mExtents(NULL) // if llerrs ever returns, the destructor must not see an indeterminate pointer
+	{
+		*this = rhs;
+	}
+
+	const LLVolumeOctreeListener& operator=(const LLVolumeOctreeListener& rhs)
+	{
+		llerrs << "Illegal operation!" << llendl;
+		return *this;
+	}
+
+	 //LISTENER FUNCTIONS (only handleChildAddition does any work; the rest are empty)
+	virtual void handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent, 
+		LLOctreeNode<LLVolumeTriangle>* child);
+	virtual void handleStateChange(const LLTreeNode<LLVolumeTriangle>* node) { }
+	virtual void handleChildRemoval(const LLOctreeNode<LLVolumeTriangle>* parent, 
+			const LLOctreeNode<LLVolumeTriangle>* child) {	}
+	virtual void handleInsertion(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
+	virtual void handleRemoval(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
+	virtual void handleDestruction(const LLTreeNode<LLVolumeTriangle>* node) { }
+
+public:
+	LLVector4a* mBounds; // bounding box (center, size) of this node and all its children (tight fit to objects)
+	LLVector4a* mExtents; // extents (min, max) of this node and all its children
+};
+// Octree traveler that intersects one ray segment with the triangles of a single LLVolumeFace and records the nearest hit.
+class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle>
+{
+public:
+	const LLVolumeFace* mFace;	// face whose texcoord/normal/binormal arrays are sampled on a hit
+	LLVector4a mStart;			// ray origin
+	LLVector4a mDir;			// full segment direction: mEnd = mStart + mDir
+	LLVector4a mEnd;
+	LLVector3* mIntersection;	// optional outputs (this and the next three): written on a closer hit, skipped when NULL
+	LLVector2* mTexCoord;
+	LLVector3* mNormal;
+	LLVector3* mBinormal;
+	F32* mClosestT;				// in/out: best hit parameter so far; visit() reads and writes it without a NULL check
+	bool mHitFace;				// set once any triangle produced a closer hit
+
+	// Default construction leaves the three vectors unset but clears every pointer and the hit flag, so nothing indeterminate is dereferenced.
+	LLOctreeTriangleRayIntersect()
+		: mFace(NULL), mIntersection(NULL), mTexCoord(NULL), mNormal(NULL), mBinormal(NULL), mClosestT(NULL), mHitFace(false) { }
+
+	LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir, 
+								   const LLVolumeFace* face, F32* closest_t,
+								   LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal);
+	void traverse(const LLOctreeNode<LLVolumeTriangle>* node);	// walks the octree, pruning nodes whose bounds the segment misses
+	virtual void visit(const LLOctreeNode<LLVolumeTriangle>* node);	// tests the triangles stored in one node
+};
+// One triangle of an LLVolumeFace as stored in the face octree: three vertex pointers, their indices, and the position/radius the accessors report.
+class LLVolumeTriangle : public LLRefCount
+{
+public:
+	LLVolumeTriangle()
+	{ //only mPositionGroup is set up here; mV, mIndex and mRadius are left for the caller to fill in
+		mPositionGroup = (LLVector4a*) ll_aligned_malloc_16(16);
+	}
+
+	// Copying is not supported: this only forwards to operator=, which raises an llerrs error.
+	LLVolumeTriangle(const LLVolumeTriangle& rhs)
+		: mPositionGroup(NULL) // if llerrs ever returns, the destructor must not see an indeterminate pointer
+	{
+		*this = rhs;
+	}
+
+	const LLVolumeTriangle& operator=(const LLVolumeTriangle& rhs)
+	{
+		llerrs << "Illegal operation!" << llendl;
+		return *this;
+	}
+
+	~LLVolumeTriangle()
+	{
+		ll_aligned_free_16(mPositionGroup);
+	}
+
+	const LLVector4a* mV[3];		// the three corner positions (not owned; the destructor does not free them)
+	U16 mIndex[3];					// indices of the same corners, used to index the face's per-vertex arrays
+	LLVector4a* mPositionGroup;		// owned: allocated in the constructor, freed in the destructor
+	F32 mRadius;					// value returned by getBinRadius()
+
+	virtual const LLVector4a& getPositionGroup() const;
+	virtual const F32& getBinRadius() const;
+};
+
+
+#endif
diff --git a/indra/llmath/tests/mathmisc_test.cpp b/indra/llmath/tests/mathmisc_test.cpp
index ea42f6e001..68d9ddc0fe 100644
--- a/indra/llmath/tests/mathmisc_test.cpp
+++ b/indra/llmath/tests/mathmisc_test.cpp
@@ -334,6 +334,8 @@ namespace tut
 	template<> template<>
 	void sphere_object::test<2>()
 	{
+		skip("See SNOW-620.  Neither the test nor the code being tested seem good.  Also sim-only.");
+
 		// test LLSphere::getBoundingSphere()
 		S32 number_of_tests = 100;
 		S32 number_of_spheres = 10;
diff --git a/indra/llmath/tests/v2math_test.cpp b/indra/llmath/tests/v2math_test.cpp
index 4660fcb955..c745b9989e 100644
--- a/indra/llmath/tests/v2math_test.cpp
+++ b/indra/llmath/tests/v2math_test.cpp
@@ -91,7 +91,7 @@ namespace tut
 		F32 x = 2.2345f, y = 3.5678f ;
 		LLVector2 vec2(x,y);
 		ensure("magVecSquared:Fail ", is_approx_equal(vec2.magVecSquared(), (x*x + y*y)));
-		ensure("magVec:Fail ", is_approx_equal(vec2.magVec(), fsqrtf(x*x + y*y)));
+		ensure("magVec:Fail ", is_approx_equal(vec2.magVec(), (F32) sqrt(x*x + y*y)));
 	}
 
 	template<> template<>
@@ -413,7 +413,7 @@ namespace tut
 		ensure_equals("dist_vec_squared values are not equal",val2, val1);
 
 		val1 = 	dist_vec(vec2, vec3);
-		val2 = fsqrtf((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2));
+		val2 = (F32) sqrt((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2));
 		ensure_equals("dist_vec values are not equal",val2, val1);
 	}
 
@@ -437,7 +437,7 @@ namespace tut
 		LLVector2 vec2(x1, y1);
 
 		F32 vecMag = vec2.normVec();
-		F32 mag = fsqrtf(x1*x1 + y1*y1);
+		F32 mag = (F32) sqrt(x1*x1 + y1*y1);
 
 		F32 oomag = 1.f / mag;
 		val1 = x1 * oomag;
diff --git a/indra/llmath/tests/v3color_test.cpp b/indra/llmath/tests/v3color_test.cpp
index 316b6e392f..0efba8e9f3 100644
--- a/indra/llmath/tests/v3color_test.cpp
+++ b/indra/llmath/tests/v3color_test.cpp
@@ -99,7 +99,7 @@ namespace tut
 		F32 r = 2.3436212f, g = 1231.f, b = 4.7849321232f;
 		LLColor3 llcolor3(r,g,b);
 		ensure("magVecSquared:Fail ", is_approx_equal(llcolor3.magVecSquared(), (r*r + g*g + b*b)));
-		ensure("magVec:Fail ", is_approx_equal(llcolor3.magVec(), fsqrtf(r*r + g*g + b*b)));
+		ensure("magVec:Fail ", is_approx_equal(llcolor3.magVec(), (F32) sqrt(r*r + g*g + b*b)));
 	}
 
 	template<> template<>
@@ -109,7 +109,7 @@ namespace tut
 		F32 val1, val2,val3;
 		LLColor3 llcolor3(r,g,b);
 		F32 vecMag = llcolor3.normVec();
-		F32 mag = fsqrtf(r*r + g*g + b*b);
+		F32 mag = (F32) sqrt(r*r + g*g + b*b);
 		F32 oomag = 1.f / mag;
 		val1 = r * oomag;
 		val2 = g * oomag;
@@ -292,7 +292,7 @@ namespace tut
 		F32 r1 =1.f, g1 = 2.f,b1 = 1.2f, r2 = -2.3f, g2 = 1.11f, b2 = 1234.234f;
 		LLColor3 llcolor3(r1,g1,b1),llcolor3a(r2,g2,b2);
 		F32 val = distVec(llcolor3,llcolor3a);
-		ensure("distVec failed ", is_approx_equal(fsqrtf((r1-r2)*(r1-r2) + (g1-g2)*(g1-g2) + (b1-b2)*(b1-b2)) ,val));
+		ensure("distVec failed ", is_approx_equal((F32) sqrt((r1-r2)*(r1-r2) + (g1-g2)*(g1-g2) + (b1-b2)*(b1-b2)) ,val));
 		
 		F32 val1 = distVec_squared(llcolor3,llcolor3a);
 		ensure("distVec_squared failed ", is_approx_equal(((r1-r2)*(r1-r2) + (g1-g2)*(g1-g2) + (b1-b2)*(b1-b2)) ,val1));
diff --git a/indra/llmath/tests/v3dmath_test.cpp b/indra/llmath/tests/v3dmath_test.cpp
index e7c949186c..894b6200f5 100644
--- a/indra/llmath/tests/v3dmath_test.cpp
+++ b/indra/llmath/tests/v3dmath_test.cpp
@@ -409,7 +409,7 @@ namespace tut
 		LLVector3d vec3D(x,y,z);
 		F64 res = (x*x + y*y + z*z) - vec3D.magVecSquared();
 		ensure("1:magVecSquared:Fail ", ((-F_APPROXIMATELY_ZERO <= res)&& (res <=F_APPROXIMATELY_ZERO)));
-		res = fsqrtf(x*x + y*y + z*z) - vec3D.magVec();
+		res = sqrt(x*x + y*y + z*z) - vec3D.magVec();	// compare in full F64 precision; res and magVec() are both F64
 		ensure("2:magVec: Fail ", ((-F_APPROXIMATELY_ZERO <= res)&& (res <=F_APPROXIMATELY_ZERO)));	
 	}
 
diff --git a/indra/llmath/tests/v3math_test.cpp b/indra/llmath/tests/v3math_test.cpp
index 7faf076243..d5c8dd2f9c 100644
--- a/indra/llmath/tests/v3math_test.cpp
+++ b/indra/llmath/tests/v3math_test.cpp
@@ -155,7 +155,7 @@ namespace tut
 		F32 x = 2.32f, y = 1.212f, z = -.12f;
 		LLVector3 vec3(x,y,z);		
 		ensure("1:magVecSquared:Fail ", is_approx_equal(vec3.magVecSquared(), (x*x + y*y + z*z)));
-		ensure("2:magVec:Fail ", is_approx_equal(vec3.magVec(), fsqrtf(x*x + y*y + z*z)));
+		ensure("2:magVec:Fail ", is_approx_equal(vec3.magVec(), (F32) sqrt(x*x + y*y + z*z)));
 	}
 
 	template<> template<>
@@ -515,7 +515,7 @@ namespace tut
 		F32 val1,val2;
 		LLVector3 vec3(x1,y1,z1),vec3a(x2,y2,z2);
 		val1 = dist_vec(vec3,vec3a);
-		val2 = fsqrtf((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2) + (z1 - z2)* (z1 -z2));
+		val2 = (F32) sqrt((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2) + (z1 - z2)* (z1 -z2));
 		ensure_equals("1:dist_vec: Fail ",val2, val1);
 		val1 = dist_vec_squared(vec3,vec3a);
 		val2 =((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2) + (z1 - z2)* (z1 -z2));
diff --git a/indra/llmath/tests/v4color_test.cpp b/indra/llmath/tests/v4color_test.cpp
index 33921e0f0f..636446027a 100644
--- a/indra/llmath/tests/v4color_test.cpp
+++ b/indra/llmath/tests/v4color_test.cpp
@@ -161,7 +161,7 @@ namespace tut
 		F32 r = 0x20, g = 0xFFFF, b = 0xFF;
 		LLColor4 llcolor4(r,g,b);
 		ensure("magVecSquared:Fail ", is_approx_equal(llcolor4.magVecSquared(), (r*r + g*g + b*b)));
-		ensure("magVec:Fail ", is_approx_equal(llcolor4.magVec(), fsqrtf(r*r + g*g + b*b)));
+		ensure("magVec:Fail ", is_approx_equal(llcolor4.magVec(), (F32) sqrt(r*r + g*g + b*b)));
 	}
 
 	template<> template<>
@@ -170,7 +170,7 @@ namespace tut
 		F32 r = 0x20, g = 0xFFFF, b = 0xFF;
 		LLColor4 llcolor4(r,g,b);
 		F32 vecMag = llcolor4.normVec();
-		F32 mag = fsqrtf(r*r + g*g + b*b);
+		F32 mag = (F32) sqrt(r*r + g*g + b*b);
 		F32 oomag = 1.f / mag;
 		F32 val1 = r * oomag, val2 = g * oomag,	val3 = b * oomag;
 		ensure("1:normVec failed ", (is_approx_equal(val1, llcolor4.mV[0]) && is_approx_equal(val2, llcolor4.mV[1]) && is_approx_equal(val3, llcolor4.mV[2]) && is_approx_equal(vecMag, mag)));
diff --git a/indra/llmath/tests/v4coloru_test.cpp b/indra/llmath/tests/v4coloru_test.cpp
index 9f71cfc8cc..b3dbfece34 100644
--- a/indra/llmath/tests/v4coloru_test.cpp
+++ b/indra/llmath/tests/v4coloru_test.cpp
@@ -141,7 +141,7 @@ namespace tut
 		U8 r = 0x12, g = 0xFF, b = 0xAF;
 		LLColor4U llcolor4u(r,g,b);
 		ensure("magVecSquared:Fail ", is_approx_equal(llcolor4u.magVecSquared(), (F32)(r*r + g*g + b*b)));
-		ensure("magVec:Fail ", is_approx_equal(llcolor4u.magVec(), fsqrtf(r*r + g*g + b*b)));
+		ensure("magVec:Fail ", is_approx_equal(llcolor4u.magVec(), (F32) sqrt((F32) (r*r + g*g + b*b))));
 	}
 
 	template<> template<>
diff --git a/indra/llmath/tests/v4math_test.cpp b/indra/llmath/tests/v4math_test.cpp
index fe051c27e9..e919c90efa 100644
--- a/indra/llmath/tests/v4math_test.cpp
+++ b/indra/llmath/tests/v4math_test.cpp
@@ -102,7 +102,7 @@ namespace tut
 	{
 		F32 x = 10.f, y = -2.3f, z = -.023f;
 		LLVector4 vec4(x,y,z);
-		ensure("magVec:Fail ", is_approx_equal(vec4.magVec(), fsqrtf(x*x + y*y + z*z)));
+		ensure("magVec:Fail ", is_approx_equal(vec4.magVec(), (F32) sqrt(x*x + y*y + z*z)));
 		ensure("magVecSquared:Fail ", is_approx_equal(vec4.magVecSquared(), (x*x + y*y + z*z)));
 	}
 
@@ -343,7 +343,7 @@ namespace tut
 		F32 val1,val2;
 		LLVector4 vec4(x1,y1,z1),vec4a(x2,y2,z2);
 		val1 = dist_vec(vec4,vec4a);
-		val2 = fsqrtf((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2) + (z1 - z2)* (z1 -z2));
+		val2 = (F32) sqrt((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2) + (z1 - z2)* (z1 -z2));
 		ensure_equals("dist_vec: Fail ",val2, val1);
 		val1 = dist_vec_squared(vec4,vec4a);
 		val2 =((x1 - x2)*(x1 - x2) + (y1 - y2)* (y1 - y2) + (z1 - z2)* (z1 -z2));
diff --git a/indra/llmath/v2math.cpp b/indra/llmath/v2math.cpp
index 220336e0c2..2603127f75 100644
--- a/indra/llmath/v2math.cpp
+++ b/indra/llmath/v2math.cpp
@@ -92,7 +92,7 @@ F32	dist_vec(const LLVector2 &a, const LLVector2 &b)
 {
 	F32 x = a.mV[0] - b.mV[0];
 	F32 y = a.mV[1] - b.mV[1];
-	return fsqrtf( x*x + y*y );
+	return (F32) sqrt( x*x + y*y );
 }
 
 F32	dist_vec_squared(const LLVector2 &a, const LLVector2 &b)
diff --git a/indra/llmath/v2math.h b/indra/llmath/v2math.h
index f9f1c024f2..35fd1b6048 100644
--- a/indra/llmath/v2math.h
+++ b/indra/llmath/v2math.h
@@ -73,6 +73,8 @@ class LLVector2
 		void	setVec(const LLVector2 &vec);	// deprecated
 		void	setVec(const F32 *vec);			// deprecated
 
+		inline bool isFinite() const; // checks to see if all values of LLVector2 are finite
+
 		F32		length() const;				// Returns magnitude of LLVector2
 		F32		lengthSquared() const;		// Returns magnitude squared of LLVector2
 		F32		normalize();					// Normalizes and returns the magnitude of LLVector2
@@ -218,11 +220,12 @@ inline void	LLVector2::setVec(const F32 *vec)
 	mV[VY] = vec[VY];
 }
 
+
 // LLVector2 Magnitude and Normalization Functions
 
 inline F32 LLVector2::length(void) const
 {
-	return fsqrtf(mV[0]*mV[0] + mV[1]*mV[1]);
+	return (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1]);
 }
 
 inline F32 LLVector2::lengthSquared(void) const
@@ -232,7 +235,7 @@ inline F32 LLVector2::lengthSquared(void) const
 
 inline F32		LLVector2::normalize(void)
 {
-	F32 mag = fsqrtf(mV[0]*mV[0] + mV[1]*mV[1]);
+	F32 mag = (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1]);
 	F32 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)
@@ -250,10 +253,16 @@ inline F32		LLVector2::normalize(void)
 	return (mag);
 }
 
+// Returns true only when both the VX and VY components pass llfinite().
+inline bool LLVector2::isFinite() const
+{
+	return (llfinite(mV[VX]) && llfinite(mV[VY]));
+}
+
 // deprecated
 inline F32		LLVector2::magVec(void) const
 {
-	return fsqrtf(mV[0]*mV[0] + mV[1]*mV[1]);
+	return (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1]);
 }
 
 // deprecated
@@ -265,7 +274,7 @@ inline F32		LLVector2::magVecSquared(void) const
 // deprecated
 inline F32		LLVector2::normVec(void)
 {
-	F32 mag = fsqrtf(mV[0]*mV[0] + mV[1]*mV[1]);
+	F32 mag = (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1]);
 	F32 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)
diff --git a/indra/llmath/v3color.h b/indra/llmath/v3color.h
index 1915d80502..95a3de8b62 100644
--- a/indra/llmath/v3color.h
+++ b/indra/llmath/v3color.h
@@ -284,7 +284,7 @@ inline F32		LLColor3::brightness(void) const
 
 inline F32		LLColor3::length(void) const
 {
-	return fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	return (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 }
 
 inline F32		LLColor3::lengthSquared(void) const
@@ -294,7 +294,7 @@ inline F32		LLColor3::lengthSquared(void) const
 
 inline F32		LLColor3::normalize(void)
 {
-	F32 mag = fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	F32 mag = (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 	F32 oomag;
 
 	if (mag)
@@ -310,7 +310,7 @@ inline F32		LLColor3::normalize(void)
 // deprecated
 inline F32		LLColor3::magVec(void) const
 {
-	return fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	return (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 }
 
 // deprecated
@@ -322,7 +322,7 @@ inline F32		LLColor3::magVecSquared(void) const
 // deprecated
 inline F32		LLColor3::normVec(void)
 {
-	F32 mag = fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	F32 mag = (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 	F32 oomag;
 
 	if (mag)
@@ -444,7 +444,7 @@ inline F32		distVec(const LLColor3 &a, const LLColor3 &b)
 	F32 x = a.mV[0] - b.mV[0];
 	F32 y = a.mV[1] - b.mV[1];
 	F32 z = a.mV[2] - b.mV[2];
-	return fsqrtf( x*x + y*y + z*z );
+	return (F32) sqrt( x*x + y*y + z*z );
 }
 
 inline F32		distVec_squared(const LLColor3 &a, const LLColor3 &b)
diff --git a/indra/llmath/v3dmath.h b/indra/llmath/v3dmath.h
index 6ab31e8a41..ab253de064 100644
--- a/indra/llmath/v3dmath.h
+++ b/indra/llmath/v3dmath.h
@@ -240,7 +240,7 @@ inline const LLVector3d&	LLVector3d::setVec(const F64 *vec)
 
 inline F64 LLVector3d::normVec(void)
 {
-	F64 mag = fsqrtf(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);
+	F64 mag = sqrt(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);	// magnitude kept in F64; an F32 cast would round it to single precision
 	F64 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)
@@ -262,7 +262,7 @@ inline F64 LLVector3d::normVec(void)
 
 inline F64 LLVector3d::normalize(void)
 {
-	F64 mag = fsqrtf(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);
+	F64 mag = sqrt(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);	// magnitude kept in F64; an F32 cast would round it to single precision
 	F64 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)
@@ -286,7 +286,7 @@ inline F64 LLVector3d::normalize(void)
 
 inline F64	LLVector3d::magVec(void) const
 {
-	return fsqrtf(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);
+	return sqrt(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);	// F64 result; no narrowing to F32 before returning
 }
 
 inline F64	LLVector3d::magVecSquared(void) const
@@ -296,7 +296,7 @@ inline F64	LLVector3d::magVecSquared(void) const
 
 inline F64	LLVector3d::length(void) const
 {
-	return fsqrtf(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);
+	return sqrt(mdV[0]*mdV[0] + mdV[1]*mdV[1] + mdV[2]*mdV[2]);	// F64 result; no narrowing to F32 before returning
 }
 
 inline F64	LLVector3d::lengthSquared(void) const
@@ -406,7 +406,7 @@ inline F64	dist_vec(const LLVector3d &a, const LLVector3d &b)
 	F64 x = a.mdV[0] - b.mdV[0];
 	F64 y = a.mdV[1] - b.mdV[1];
 	F64 z = a.mdV[2] - b.mdV[2];
-	return fsqrtf( x*x + y*y + z*z );
+	return sqrt( x*x + y*y + z*z );	// x, y, z are F64 differences; return the distance without narrowing to F32
 }
 
 inline F64	dist_vec_squared(const LLVector3d &a, const LLVector3d &b)
diff --git a/indra/llmath/v3math.h b/indra/llmath/v3math.h
index 76dd938887..5d483a8753 100644
--- a/indra/llmath/v3math.h
+++ b/indra/llmath/v3math.h
@@ -36,7 +36,6 @@
 #include "llerror.h"
 #include "llmath.h"
 
-
 #include "llsd.h"
 class LLVector2;
 class LLVector4;
@@ -283,7 +282,7 @@ inline void	LLVector3::setVec(const F32 *vec)
 
 inline F32 LLVector3::normalize(void)
 {
-	F32 mag = fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	F32 mag = (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 	F32 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)
@@ -306,7 +305,7 @@ inline F32 LLVector3::normalize(void)
 // deprecated
 inline F32 LLVector3::normVec(void)
 {
-	F32 mag = fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	F32 mag = (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 	F32 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)
@@ -330,7 +329,7 @@ inline F32 LLVector3::normVec(void)
 
 inline F32	LLVector3::length(void) const
 {
-	return fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	return (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 }
 
 inline F32	LLVector3::lengthSquared(void) const
@@ -340,7 +339,7 @@ inline F32	LLVector3::lengthSquared(void) const
 
 inline F32	LLVector3::magVec(void) const
 {
-	return fsqrtf(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
+	return (F32) sqrt(mV[0]*mV[0] + mV[1]*mV[1] + mV[2]*mV[2]);
 }
 
 inline F32	LLVector3::magVecSquared(void) const
@@ -474,7 +473,7 @@ inline F32	dist_vec(const LLVector3 &a, const LLVector3 &b)
 	F32 x = a.mV[0] - b.mV[0];
 	F32 y = a.mV[1] - b.mV[1];
 	F32 z = a.mV[2] - b.mV[2];
-	return fsqrtf( x*x + y*y + z*z );
+	return (F32) sqrt( x*x + y*y + z*z );
 }
 
 inline F32	dist_vec_squared(const LLVector3 &a, const LLVector3 &b)
@@ -532,6 +531,21 @@ inline void update_min_max(LLVector3& min, LLVector3& max, const LLVector3& pos)
 	}
 }
 
+inline void update_min_max(LLVector3& min, LLVector3& max, const F32* pos)	// expand the min/max corners so they enclose pos; reads pos[0], pos[1], pos[2]
+{
+	for (U32 i = 0; i < 3; i++)	// handle each of the three components independently
+	{
+		if (min.mV[i] > pos[i])	// pos is below the current minimum for this component
+		{
+			min.mV[i] = pos[i];
+		}
+		if (max.mV[i] < pos[i])	// pos is above the current maximum for this component
+		{
+			max.mV[i] = pos[i];
+		}
+	}
+}
+
 inline F32 angle_between(const LLVector3& a, const LLVector3& b)
 {
 	LLVector3 an = a;
diff --git a/indra/llmath/v4color.h b/indra/llmath/v4color.h
index 6b63b976b0..dd92e1cc63 100644
--- a/indra/llmath/v4color.h
+++ b/indra/llmath/v4color.h
@@ -392,7 +392,7 @@ inline const LLColor4&	LLColor4::setAlpha(F32 a)
 
 inline F32		LLColor4::length(void) const
 {
-	return fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	return (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 }
 
 inline F32		LLColor4::lengthSquared(void) const
@@ -402,7 +402,7 @@ inline F32		LLColor4::lengthSquared(void) const
 
 inline F32		LLColor4::normalize(void)
 {
-	F32 mag = fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	F32 mag = (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 	F32 oomag;
 
 	if (mag)
@@ -418,7 +418,7 @@ inline F32		LLColor4::normalize(void)
 // deprecated
 inline F32		LLColor4::magVec(void) const
 {
-	return fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	return (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 }
 
 // deprecated
@@ -430,7 +430,7 @@ inline F32		LLColor4::magVecSquared(void) const
 // deprecated
 inline F32		LLColor4::normVec(void)
 {
-	F32 mag = fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	F32 mag = (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 	F32 oomag;
 
 	if (mag)
diff --git a/indra/llmath/v4coloru.h b/indra/llmath/v4coloru.h
index 4ec5a345eb..08245403a1 100644
--- a/indra/llmath/v4coloru.h
+++ b/indra/llmath/v4coloru.h
@@ -300,7 +300,7 @@ inline const LLColor4U&	LLColor4U::setAlpha(U8 a)
 
 inline F32		LLColor4U::length(void) const
 {
-	return fsqrtf( ((F32)mV[VX]) * mV[VX] + ((F32)mV[VY]) * mV[VY] + ((F32)mV[VZ]) * mV[VZ] );
+	return (F32) sqrt( ((F32)mV[VX]) * mV[VX] + ((F32)mV[VY]) * mV[VY] + ((F32)mV[VZ]) * mV[VZ] );
 }
 
 inline F32		LLColor4U::lengthSquared(void) const
@@ -311,7 +311,7 @@ inline F32		LLColor4U::lengthSquared(void) const
 // deprecated
 inline F32		LLColor4U::magVec(void) const
 {
-	return fsqrtf( ((F32)mV[VX]) * mV[VX] + ((F32)mV[VY]) * mV[VY] + ((F32)mV[VZ]) * mV[VZ] );
+	return (F32) sqrt( ((F32)mV[VX]) * mV[VX] + ((F32)mV[VY]) * mV[VY] + ((F32)mV[VZ]) * mV[VZ] );
 }
 
 // deprecated
diff --git a/indra/llmath/v4math.h b/indra/llmath/v4math.h
index 4c82e6b629..72a477ed20 100644
--- a/indra/llmath/v4math.h
+++ b/indra/llmath/v4math.h
@@ -321,7 +321,7 @@ inline void	LLVector4::setVec(const F32 *vec)
 
 inline F32		LLVector4::length(void) const
 {
-	return fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	return (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 }
 
 inline F32		LLVector4::lengthSquared(void) const
@@ -331,7 +331,7 @@ inline F32		LLVector4::lengthSquared(void) const
 
 inline F32		LLVector4::magVec(void) const
 {
-	return fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	return (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 }
 
 inline F32		LLVector4::magVecSquared(void) const
@@ -463,7 +463,7 @@ inline LLVector4 lerp(const LLVector4 &a, const LLVector4 &b, F32 u)
 
 inline F32		LLVector4::normalize(void)
 {
-	F32 mag = fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	F32 mag = (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 	F32 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)
@@ -486,7 +486,7 @@ inline F32		LLVector4::normalize(void)
 // deprecated
 inline F32		LLVector4::normVec(void)
 {
-	F32 mag = fsqrtf(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
+	F32 mag = (F32) sqrt(mV[VX]*mV[VX] + mV[VY]*mV[VY] + mV[VZ]*mV[VZ]);
 	F32 oomag;
 
 	if (mag > FP_MAG_THRESHOLD)