23 files changed, 515 insertions, 490 deletions
diff --git a/indra/llmath/CMakeLists.txt b/indra/llmath/CMakeLists.txt
index b5e59c1ca3..5865ae030c 100644
--- a/indra/llmath/CMakeLists.txt
+++ b/indra/llmath/CMakeLists.txt
@@ -117,6 +117,7 @@ if (LL_TESTS)
   # INTEGRATION TESTS
   set(test_libs llmath llcommon ${LLCOMMON_LIBRARIES} ${WINDOWS_LIBRARIES})
   # TODO: Some of these need refactoring to be proper Unit tests rather than Integration tests.
+  LL_ADD_INTEGRATION_TEST(alignment "" "${test_libs}")
   LL_ADD_INTEGRATION_TEST(llbbox llbbox.cpp "${test_libs}")
   LL_ADD_INTEGRATION_TEST(llquaternion llquaternion.cpp "${test_libs}")
   LL_ADD_INTEGRATION_TEST(mathmisc "" "${test_libs}")
diff --git a/indra/llmath/llcalcparser.h b/indra/llmath/llcalcparser.h
index bd9c8c2519..e0ad270266 100644
--- a/indra/llmath/llcalcparser.h
+++ b/indra/llmath/llcalcparser.h
@@ -174,7 +174,7 @@ private:
 	F32 _log(const F32& a) const { return log(a); }
 	F32 _exp(const F32& a) const { return exp(a); }
 	F32 _fabs(const F32& a) const { return fabs(a); }
-	F32 _floor(const F32& a) const { return llfloor(a); }
+	F32 _floor(const F32& a) const { return (F32)llfloor(a); }
 	F32 _ceil(const F32& a) const { return llceil(a); }
 
 	F32 _atan2(const F32& a,const F32& b) const { return atan2(a,b); }
diff --git a/indra/llmath/llcamera.h b/indra/llmath/llcamera.h
index ec67b91d05..0b591be622 100644
--- a/indra/llmath/llcamera.h
+++ b/indra/llmath/llcamera.h
@@ -60,7 +60,7 @@ static const F32 MAX_FIELD_OF_VIEW = 175.f * DEG_TO_RAD;
 // roll(), pitch(), yaw()
 // etc...
 
-
+LL_ALIGN_PREFIX(16)
 class LLCamera
 : 	public LLCoordFrame
 {
@@ -108,7 +108,7 @@ public:
 	};
 
 private:
-	LLPlane mAgentPlanes[7];  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
+	LL_ALIGN_16(LLPlane mAgentPlanes[7]);  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
 	U8 mPlaneMask[8];         // 8 for alignment	
 	
 	F32 mView;					// angle between top and bottom frustum planes in radians.
@@ -116,13 +116,13 @@ private:
 	S32 mViewHeightInPixels;	// for ViewHeightInPixels() only
 	F32 mNearPlane;
 	F32 mFarPlane;
-	LLPlane mLocalPlanes[4];
+	LL_ALIGN_16(LLPlane mLocalPlanes[4]);
 	F32 mFixedDistance;			// Always return this distance, unless < 0
 	LLVector3 mFrustCenter;		// center of frustum and radius squared for ultra-quick exclusion test
 	F32 mFrustRadiusSquared;
 	
-	LLPlane mWorldPlanes[PLANE_NUM];
-	LLPlane mHorizPlanes[HORIZ_PLANE_NUM];
+	LL_ALIGN_16(LLPlane mWorldPlanes[PLANE_NUM]);
+	LL_ALIGN_16(LLPlane mHorizPlanes[HORIZ_PLANE_NUM]);
 
 	U32 mPlaneCount;  //defaults to 6, if setUserClipPlane is called, uses user supplied clip plane in
 
@@ -208,7 +208,7 @@ protected:
 	void calculateFrustumPlanes(F32 left, F32 right, F32 top, F32 bottom);
 	void calculateFrustumPlanesFromWindow(F32 x1, F32 y1, F32 x2, F32 y2);
 	void calculateWorldFrustumPlanes();
-};
+} LL_ALIGN_POSTFIX(16);
 
 
 #endif
diff --git a/indra/llmath/llcoord.h b/indra/llmath/llcoord.h
index 706ad92787..9b76268afd 100644
--- a/indra/llmath/llcoord.h
+++ b/indra/llmath/llcoord.h
@@ -26,80 +26,87 @@
 #ifndef LL_LLCOORD_H
 #define LL_LLCOORD_H
 
+template<typename> class LLCoord;
+struct LL_COORD_TYPE_GL;
+struct LL_COORD_TYPE_WINDOW;
+struct LL_COORD_TYPE_SCREEN;
+
+typedef LLCoord<LL_COORD_TYPE_GL> LLCoordGL;
+typedef LLCoord<LL_COORD_TYPE_WINDOW> LLCoordWindow;
+typedef LLCoord<LL_COORD_TYPE_SCREEN> LLCoordScreen;
+
+struct LLCoordCommon
+{
+	LLCoordCommon(S32 x, S32 y) : mX(x), mY(y) {}
+	LLCoordCommon() : mX(0), mY(0) {}
+	S32 mX;
+	S32 mY;
+};
+
 // A two-dimensional pixel value
-class LLCoord
+template<typename COORD_FRAME>
+class LLCoord : protected COORD_FRAME
 {
 public:
-	S32		mX;
-	S32		mY;
+	typedef LLCoord<COORD_FRAME> self_t;
+	typename COORD_FRAME::value_t	mX;
+	typename COORD_FRAME::value_t	mY;
 
 	LLCoord():	mX(0), mY(0)
 	{}
-	LLCoord(S32 x, S32 y): mX(x), mY(y)
-	{}
-	virtual ~LLCoord()
+	LLCoord(typename COORD_FRAME::value_t x, typename COORD_FRAME::value_t y): mX(x), mY(y)
 	{}
 
-	virtual void set(S32 x, S32 y)		{ mX = x; mY = y; }
-};
+	LLCoord(const LLCoordCommon& other)
+	{
+		COORD_FRAME::convertFromCommon(other);
+	}
 
+	LLCoordCommon convert() const
+	{
+		return COORD_FRAME::convertToCommon();
+	}
 
-// GL coordinates start in the client region of a window,
-// with left, bottom = 0, 0
-class LLCoordGL : public LLCoord
-{
-public:
-	LLCoordGL() : LLCoord()
-	{}
-	LLCoordGL(S32 x, S32 y) : LLCoord(x, y)
-	{}
-	bool operator==(const LLCoordGL& other) const { return mX == other.mX && mY == other.mY; }
-	bool operator!=(const LLCoordGL& other) const { return !(*this == other); }
-};
+	void set(typename COORD_FRAME::value_t x, typename COORD_FRAME::value_t y) { mX = x; mY = y;}
+	bool operator==(const self_t& other) const { return mX == other.mX && mY == other.mY; }
+	bool operator!=(const self_t& other) const { return !(*this == other); }
 
-//bool operator ==(const LLCoordGL& a, const LLCoordGL& b);
+	static const self_t& getTypedCoords(const COORD_FRAME& self) { return static_cast<const self_t&>(self); }
+	static self_t& getTypedCoords(COORD_FRAME& self) { return static_cast<self_t&>(self); }
+};
 
-// Window coords include things like window borders,
-// menu regions, etc.
-class LLCoordWindow : public LLCoord
+struct LL_COORD_TYPE_GL 
 {
-public:
-	LLCoordWindow() : LLCoord()
-	{}
-	LLCoordWindow(S32 x, S32 y) : LLCoord(x, y)
-	{}
-	bool operator==(const LLCoordWindow& other) const { return mX == other.mX && mY == other.mY; }
-	bool operator!=(const LLCoordWindow& other) const { return !(*this == other); }
-};
+	typedef S32 value_t;
 
+	LLCoordCommon convertToCommon() const
+	{
+		const LLCoordGL& self = LLCoordGL::getTypedCoords(*this);
+		return LLCoordCommon(self.mX, self.mY);
+	}
 
-// Screen coords start at left, top = 0, 0
-class LLCoordScreen : public LLCoord
+	void convertFromCommon(const LLCoordCommon& from)
+	{
+		LLCoordGL& self = LLCoordGL::getTypedCoords(*this);
+		self.mX = from.mX;
+		self.mY = from.mY;
+	}
+};
+
+struct LL_COORD_TYPE_WINDOW 
 {
-public:
-	LLCoordScreen() : LLCoord()
-	{}
-	LLCoordScreen(S32 x, S32 y) : LLCoord(x, y)
-	{}
-	bool operator==(const LLCoordScreen& other) const { return mX == other.mX && mY == other.mY; }
-	bool operator!=(const LLCoordScreen& other) const { return !(*this == other); }
+	typedef S32 value_t;
+
+	LLCoordCommon convertToCommon() const;
+	void convertFromCommon(const LLCoordCommon& from);
 };
 
-class LLCoordFont : public LLCoord
+struct LL_COORD_TYPE_SCREEN 
 {
-public:
-	F32 mZ;
-	
-	LLCoordFont() : LLCoord(), mZ(0.f)
-	{}
-	LLCoordFont(S32 x, S32 y, F32 z = 0) : LLCoord(x,y), mZ(z)
-	{}
-	
-	void set(S32 x, S32 y) { LLCoord::set(x,y); mZ = 0.f; }
-	void set(S32 x, S32 y, F32 z) { mX = x; mY = y; mZ = z; }
-	bool operator==(const LLCoordFont& other) const { return mX == other.mX && mY == other.mY; }
-	bool operator!=(const LLCoordFont& other) const { return !(*this == other); }
+	typedef S32 value_t;
+
+	LLCoordCommon convertToCommon() const;
+	void convertFromCommon(const LLCoordCommon& from);
 };
-	
 
 #endif
diff --git a/indra/llmath/llmath.h b/indra/llmath/llmath.h
index 9297bcbac2..b93f89d674 100644
--- a/indra/llmath/llmath.h
+++ b/indra/llmath/llmath.h
@@ -85,7 +85,7 @@ const F32	F_ALMOST_ONE	= 1.0f - F_ALMOST_ZERO;
 const F32 FP_MAG_THRESHOLD = 0.0000001f;
 
 // TODO: Replace with logic like is_approx_equal
-inline BOOL is_approx_zero( F32 f ) { return (-F_APPROXIMATELY_ZERO < f) && (f < F_APPROXIMATELY_ZERO); }
+inline bool is_approx_zero( F32 f ) { return (-F_APPROXIMATELY_ZERO < f) && (f < F_APPROXIMATELY_ZERO); }
 
 // These functions work by interpreting sign+exp+mantissa as an unsigned
 // integer.
@@ -111,13 +111,13 @@ inline BOOL is_approx_zero( F32 f ) { return (-F_APPROXIMATELY_ZERO < f) && (f <
 // WARNING: Infinity is comparable with F32_MAX and negative 
 // infinity is comparable with F32_MIN
 
-inline BOOL is_approx_equal(F32 x, F32 y)
+inline bool is_approx_equal(F32 x, F32 y)
 {
 	const S32 COMPARE_MANTISSA_UP_TO_BIT = 0x02;
 	return (std::abs((S32) ((U32&)x - (U32&)y) ) < COMPARE_MANTISSA_UP_TO_BIT);
 }
 
-inline BOOL is_approx_equal(F64 x, F64 y)
+inline bool is_approx_equal(F64 x, F64 y)
 {
 	const S64 COMPARE_MANTISSA_UP_TO_BIT = 0x02;
 	return (std::abs((S32) ((U64&)x - (U64&)y) ) < COMPARE_MANTISSA_UP_TO_BIT);
diff --git a/indra/llmath/llmatrix3a.h b/indra/llmath/llmatrix3a.h
index adb7e3389d..9916cfd2da 100644
--- a/indra/llmath/llmatrix3a.h
+++ b/indra/llmath/llmatrix3a.h
@@ -111,7 +111,7 @@ public:
 
 protected:
 
-	LLVector4a mColumns[3];
+	LL_ALIGN_16(LLVector4a mColumns[3]);
 
 };
 
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
index 27cf5b79f6..c4cefdb4fa 100644
--- a/indra/llmath/llmatrix4a.h
+++ b/indra/llmath/llmatrix4a.h
@@ -34,7 +34,7 @@
 class LLMatrix4a
 {
 public:
-	LLVector4a mMatrix[4];
+	LL_ALIGN_16(LLVector4a mMatrix[4]);
 
 	inline void clear()
 	{
diff --git a/indra/llmath/lloctree.h b/indra/llmath/lloctree.h
index 3c1ae45d68..7348904c61 100644
--- a/indra/llmath/lloctree.h
+++ b/indra/llmath/lloctree.h
@@ -31,7 +31,6 @@
 #include "v3math.h"
 #include "llvector4a.h"
 #include <vector>
-#include <set>
 
 #define OCT_ERRS LL_WARNS("OctreeErrors")
 
@@ -79,16 +78,18 @@ public:
 
 	typedef LLOctreeTraveler<T>									oct_traveler;
 	typedef LLTreeTraveler<T>									tree_traveler;
-	typedef typename std::set<LLPointer<T> >					element_list;
-	typedef typename std::set<LLPointer<T> >::iterator			element_iter;
-	typedef typename std::set<LLPointer<T> >::const_iterator	const_element_iter;
+	typedef std::vector< LLPointer<T> >							element_list;		// note:  don't remove the whitespace between "> >"
+	typedef LLPointer<T>*										element_iter;
+	typedef const LLPointer<T>*									const_element_iter;
 	typedef typename std::vector<LLTreeListener<T>*>::iterator	tree_listener_iter;
-	typedef typename std::vector<LLOctreeNode<T>* >				child_list;
+	typedef LLOctreeNode<T>**									child_list;
+	typedef LLOctreeNode<T>**									child_iter;
+
 	typedef LLTreeNode<T>		BaseType;
 	typedef LLOctreeNode<T>		oct_node;
 	typedef LLOctreeListener<T>	oct_listener;
 
-	/*void* operator new(size_t size)
+	void* operator new(size_t size)
 	{
 		return ll_aligned_malloc_16(size);
 	}
@@ -96,7 +97,7 @@ public:
 	void operator delete(void* ptr)
 	{
 		ll_aligned_free_16(ptr);
-	}*/
+	}
 
 	LLOctreeNode(	const LLVector4a& center, 
 					const LLVector4a& size, 
@@ -105,6 +106,10 @@ public:
 	:	mParent((oct_node*)parent), 
 		mOctant(octant) 
 	{ 
+		//always keep a NULL terminated list to avoid out of bounds exceptions in debug builds
+		mData.push_back(NULL);
+		mDataEnd = &mData[0];
+
 		mCenter = center;
 		mSize = size;
 
@@ -114,6 +119,8 @@ public:
 			mOctant = ((oct_node*) mParent)->getOctant(mCenter);
 		}
 
+		mElementCount = 0;
+
 		clearChildren();
 	}
 
@@ -121,6 +128,16 @@ public:
 	{ 
 		BaseType::destroyListeners(); 
 		
+		for (U32 i = 0; i < mElementCount; ++i)
+		{
+			mData[i]->setBinIndex(-1);
+			mData[i] = NULL;
+		}
+
+		mData.clear();
+		mData.push_back(NULL);
+		mDataEnd = &mData[0];
+
 		for (U32 i = 0; i < getChildCount(); i++)
 		{
 			delete getChild(i);
@@ -217,13 +234,18 @@ public:
 	}
 
 	void accept(oct_traveler* visitor)				{ visitor->visit(this); }
-	virtual bool isLeaf() const						{ return mChild.empty(); }
+	virtual bool isLeaf() const						{ return mChildCount == 0; }
 	
-	U32 getElementCount() const						{ return mData.size(); }
+	U32 getElementCount() const						{ return mElementCount; }
+	bool isEmpty() const							{ return mElementCount == 0; }
 	element_list& getData()							{ return mData; }
 	const element_list& getData() const				{ return mData; }
-	
-	U32 getChildCount()	const						{ return mChild.size(); }
+	element_iter getDataBegin()						{ return &mData[0]; }
+	element_iter getDataEnd()						{ return mDataEnd; }
+	const_element_iter getDataBegin() const			{ return &mData[0]; }
+	const_element_iter getDataEnd() const			{ return mDataEnd; }
+		
+	U32 getChildCount()	const						{ return mChildCount; }
 	oct_node* getChild(U32 index)					{ return mChild[index]; }
 	const oct_node* getChild(U32 index) const		{ return mChild[index]; }
 	child_list& getChildren()						{ return mChild; }
@@ -287,7 +309,7 @@ public:
 	
 	virtual bool insert(T* data)
 	{
-		if (data == NULL)
+		if (data == NULL || data->getBinIndex() != -1)
 		{
 			OCT_ERRS << "!!! INVALID ELEMENT ADDED TO OCTREE BRANCH !!!" << llendl;
 			return false;
@@ -300,16 +322,11 @@ public:
 			if ((getElementCount() < gOctreeMaxCapacity && contains(data->getBinRadius()) ||
 				(data->getBinRadius() > getSize()[0] &&	parent && parent->getElementCount() >= gOctreeMaxCapacity))) 
 			{ //it belongs here
-#if LL_OCTREE_PARANOIA_CHECK
-				//if this is a redundant insertion, error out (should never happen)
-				if (mData.find(data) != mData.end())
-				{
-					llwarns << "Redundant octree insertion detected. " << data << llendl;
-					return false;
-				}
-#endif
-
-				mData.insert(data);
+				mData.push_back(NULL);
+				mData[mElementCount] = data;
+				mElementCount++;
+				mDataEnd = &mData[mElementCount];
+				data->setBinIndex(mElementCount-1);
 				BaseType::insert(data);
 				return true;
 			}
@@ -344,7 +361,11 @@ public:
 
 				if( lt == 0x7 )
 				{
-					mData.insert(data);
+					mData.push_back(NULL);
+					mData[mElementCount] = data;
+					mElementCount++;
+					mDataEnd = &mData[mElementCount];
+					data->setBinIndex(mElementCount-1);
 					BaseType::insert(data);
 					return true;
 				}
@@ -394,22 +415,58 @@ public:
 		return false;
 	}
 
+	void _remove(T* data, S32 i)
+	{ //precondition -- mElementCount > 0, idx is in range [0, mElementCount)
+
+		mElementCount--;
+		data->setBinIndex(-1); 
+		
+		if (mElementCount > 0)
+		{
+			if (mElementCount != i)
+			{
+				mData[i] = mData[mElementCount]; //might unref data, do not access data after this point
+				mData[i]->setBinIndex(i);
+			}
+
+			mData[mElementCount] = NULL;
+			mData.pop_back();
+			mDataEnd = &mData[mElementCount];
+		}
+		else
+		{
+			mData.clear();
+			mData.push_back(NULL);
+			mDataEnd = &mData[0];
+		}
+
+		notifyRemoval(data);
+		checkAlive();
+	}
+
 	bool remove(T* data)
 	{
-		if (mData.find(data) != mData.end())
-		{	//we have data
-			mData.erase(data);
-			notifyRemoval(data);
-			checkAlive();
-			return true;
-		}
-		else if (isInside(data))
+		S32 i = data->getBinIndex();
+
+		if (i >= 0 && i < mElementCount)
+		{
+			if (mData[i] == data)
+			{ //found it
+				_remove(data, i);
+				llassert(data->getBinIndex() == -1);
+				return true;
+			}
+		}
+		
+		if (isInside(data))
 		{
 			oct_node* dest = getNodeAt(data);
 
 			if (dest != this)
 			{
-				return dest->remove(data);
+				bool ret = dest->remove(data);
+				llassert(data->getBinIndex() == -1);
+				return ret;
 			}
 		}
 
@@ -426,20 +483,22 @@ public:
 		}
 
 		//node is now root
-		llwarns << "!!! OCTREE REMOVING FACE BY ADDRESS, SEVERE PERFORMANCE PENALTY |||" << llendl;
+		llwarns << "!!! OCTREE REMOVING ELEMENT BY ADDRESS, SEVERE PERFORMANCE PENALTY |||" << llendl;
 		node->removeByAddress(data);
+		llassert(data->getBinIndex() == -1);
 		return true;
 	}
 
 	void removeByAddress(T* data)
 	{
-        if (mData.find(data) != mData.end())
+        for (U32 i = 0; i < mElementCount; ++i)
 		{
-			mData.erase(data);
-			notifyRemoval(data);
-			llwarns << "FOUND!" << llendl;
-			checkAlive();
-			return;
+			if (mData[i] == data)
+			{ //we have data
+				_remove(data, i);
+				llwarns << "FOUND!" << llendl;
+				return;
+			}
 		}
 		
 		for (U32 i = 0; i < getChildCount(); i++)
@@ -451,7 +510,7 @@ public:
 
 	void clearChildren()
 	{
-		mChild.clear();
+		mChildCount = 0;
 
 		U32* foo = (U32*) mChildMap;
 		foo[0] = foo[1] = 0xFFFFFFFF;
@@ -512,9 +571,10 @@ public:
 		}
 #endif
 
-		mChildMap[child->getOctant()] = (U8) mChild.size();
+		mChildMap[child->getOctant()] = mChildCount;
 
-		mChild.push_back(child);
+		mChild[mChildCount] = child;
+		++mChildCount;
 		child->setParent(this);
 
 		if (!silent)
@@ -534,21 +594,23 @@ public:
 			oct_listener* listener = getOctListener(i);
 			listener->handleChildRemoval(this, getChild(index));
 		}
-
 		
-
 		if (destroy)
 		{
 			mChild[index]->destroy();
 			delete mChild[index];
 		}
-		mChild.erase(mChild.begin() + index);
+
+		--mChildCount;
+
+		mChild[index] = mChild[mChildCount];
+		
 
 		//rebuild child map
 		U32* foo = (U32*) mChildMap;
 		foo[0] = foo[1] = 0xFFFFFFFF;
 
-		for (U32 i = 0; i < mChild.size(); ++i)
+		for (U32 i = 0; i < mChildCount; ++i)
 		{
 			mChildMap[mChild[i]->getOctant()] = i;
 		}
@@ -599,10 +661,13 @@ protected:
 	oct_node* mParent;
 	U8 mOctant;
 
-	child_list mChild;
+	LLOctreeNode<T>* mChild[8];
 	U8 mChildMap[8];
+	U32 mChildCount;
 
 	element_list mData;
+	element_iter mDataEnd;
+	U32 mElementCount;
 		
 }; 
 
diff --git a/indra/llmath/llplane.h b/indra/llmath/llplane.h
index a611894721..3c32441b11 100644
--- a/indra/llmath/llplane.h
+++ b/indra/llmath/llplane.h
@@ -36,6 +36,8 @@
 // The plane normal = [A, B, C]
 // The closest approach = D / sqrt(A*A + B*B + C*C)
 
+
+LL_ALIGN_PREFIX(16)
 class LLPlane
 {
 public:
@@ -94,7 +96,7 @@ public:
 		
 private:
 	LLVector4a mV;
-};
+} LL_ALIGN_POSTFIX(16);
 
 
 
diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h
index c7cdf7b32c..01458521ec 100644
--- a/indra/llmath/llsimdmath.h
+++ b/indra/llmath/llsimdmath.h
@@ -67,11 +67,10 @@ template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
 
 #define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
 
-
-
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
+#include "llmemory.h"
 #include "llsimdtypes.h"
 #include "llsimdtypes.inl"
 
diff --git a/indra/llmath/llsimdtypes.inl b/indra/llmath/llsimdtypes.inl
index 712239e425..e905c84954 100644
--- a/indra/llmath/llsimdtypes.inl
+++ b/indra/llmath/llsimdtypes.inl
@@ -62,6 +62,7 @@ inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b)
 inline LLSimdScalar operator-(const LLSimdScalar& a)
 {
 	static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+	ll_assert_aligned(signMask,16);
 	return _mm_xor_ps(*reinterpret_cast<const LLQuad*>(signMask), a);
 }
 
@@ -146,6 +147,7 @@ inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs)
 inline LLSimdScalar LLSimdScalar::getAbs() const
 {
 	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
+	ll_assert_aligned(F_ABS_MASK_4A,16);
 	return _mm_and_ps( mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
 }
 
diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp
index b66b7a7076..6edeb0fefe 100644
--- a/indra/llmath/llvector4a.cpp
+++ b/indra/llmath/llvector4a.cpp
@@ -24,6 +24,7 @@
  * $/LicenseInfo$
  */
 
+#include "llmemory.h"
 #include "llmath.h"
 #include "llquantize.h"
 
@@ -44,7 +45,10 @@ extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F
 	assert(dst != NULL);
 	assert(bytes > 0);
 	assert((bytes % sizeof(F32))== 0); 
-	
+	ll_assert_aligned(src,16);
+	ll_assert_aligned(dst,16);
+	assert(bytes%16==0);
+
 	F32* end = dst + (bytes / sizeof(F32) );
 
 	if (bytes > 64)
@@ -189,6 +193,8 @@ void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
 		LLVector4a oneOverDelta;
 		{
 			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
+			ll_assert_aligned(F_TWO_4A,16);
+			
 			LLVector4a two; two.load4a( F_TWO_4A );
 
 			// Here we use _mm_rcp_ps plus one round of newton-raphson
diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h
index 596082509d..0526793d3a 100644
--- a/indra/llmath/llvector4a.h
+++ b/indra/llmath/llvector4a.h
@@ -32,6 +32,7 @@ class LLRotation;
 
 #include <assert.h>
 #include "llpreprocessor.h"
+#include "llmemory.h"
 
 ///////////////////////////////////
 // FIRST TIME USERS PLEASE READ
@@ -46,6 +47,7 @@ class LLRotation;
 // LLVector3/LLVector4. 
 /////////////////////////////////
 
+LL_ALIGN_PREFIX(16)
 class LLVector4a
 {
 public:
@@ -82,6 +84,7 @@ public:
 	}
 
 	// Copy words 16-byte blocks from src to dst. Source and destination must not overlap. 
+	// Source and dest must be 16-byte aligned and size must be multiple of 16.
 	static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes);
 
 	////////////////////////////////////
@@ -90,6 +93,7 @@ public:
 	
 	LLVector4a()
 	{ //DO NOT INITIALIZE -- The overhead is completely unnecessary
+		ll_assert_aligned(this,16);
 	}
 	
 	LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
@@ -313,7 +317,7 @@ public:
 
 private:
 	LLQuad mQ;
-};
+} LL_ALIGN_POSTFIX(16);
 
 inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)
 {
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 7ad22a5631..7c52ffef21 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -475,6 +475,7 @@ inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F3
 inline LLBool32 LLVector4a::isFinite3() const
 {
 	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+	ll_assert_aligned(nanOrInfMask,16);
 	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
 	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
 	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
diff --git a/indra/llmath/llvector4logical.h b/indra/llmath/llvector4logical.h
index dd66b09d43..c5698f7cea 100644
--- a/indra/llmath/llvector4logical.h
+++ b/indra/llmath/llvector4logical.h
@@ -27,6 +27,7 @@
 #ifndef	LL_VECTOR4LOGICAL_H
 #define	LL_VECTOR4LOGICAL_H
 
+#include "llmemory.h"
 
 ////////////////////////////
 // LLVector4Logical
@@ -77,6 +78,7 @@ public:
 	inline LLVector4Logical& invert()
 	{
 		static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+		ll_assert_aligned(allOnes,16);
 		mQ = _mm_andnot_ps( mQ, *(LLQuad*)(allOnes) );
 		return *this;
 	}
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index da0fa32963..02c8d2b86f 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -35,7 +35,6 @@
 #include <cmath>
 
 #include "llerror.h"
-#include "llmemtype.h"
 
 #include "llvolumemgr.h"
 #include "v2math.h"
@@ -95,17 +94,6 @@ const S32 SCULPT_MIN_AREA_DETAIL = 1;
 
 extern BOOL gDebugGL;
 
-void assert_aligned(void* ptr, uintptr_t alignment)
-{
-#if 0
-	uintptr_t t = (uintptr_t) ptr;
-	if (t%alignment != 0)
-	{
-		llerrs << "Alignment check failed." << llendl;
-	}
-#endif
-}
-
 BOOL check_same_clock_dir( const LLVector3& pt1, const LLVector3& pt2, const LLVector3& pt3, const LLVector3& norm)
 {    
 	LLVector3 test = (pt2-pt1)%(pt3-pt2);
@@ -328,16 +316,16 @@ public:
 		LLVector4a& min = node->mExtents[0];
 		LLVector4a& max = node->mExtents[1];
 
-		if (!branch->getData().empty())
+		if (!branch->isEmpty())
 		{ //node has data, find AABB that binds data set
-			const LLVolumeTriangle* tri = *(branch->getData().begin());
+			const LLVolumeTriangle* tri = *(branch->getDataBegin());
 			
 			//initialize min/max to first available vertex
 			min = *(tri->mV[0]);
 			max = *(tri->mV[0]);
 			
 			for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter = 
-				branch->getData().begin(); iter != branch->getData().end(); ++iter)
+				branch->getDataBegin(); iter != branch->getDataEnd(); ++iter)
 			{ //for each triangle in node
 
 				//stretch by triangles in node
@@ -352,7 +340,7 @@ public:
 				max.setMax(max, *tri->mV[2]);
 			}
 		}
-		else if (!branch->getChildren().empty())
+		else if (!branch->isLeaf())
 		{ //no data, but child nodes exist
 			LLVolumeOctreeListener* child = (LLVolumeOctreeListener*) branch->getChild(0)->getListener(0);
 
@@ -389,8 +377,6 @@ public:
 
 LLProfile::Face* LLProfile::addCap(S16 faceID)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	Face *face   = vector_append(mFaces, 1);
 	
 	face->mIndex = 0;
@@ -403,8 +389,6 @@ LLProfile::Face* LLProfile::addCap(S16 faceID)
 
 LLProfile::Face* LLProfile::addFace(S32 i, S32 count, F32 scaleU, S16 faceID, BOOL flat)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	Face *face   = vector_append(mFaces, 1);
 	
 	face->mIndex = i;
@@ -420,7 +404,6 @@ LLProfile::Face* LLProfile::addFace(S32 i, S32 count, F32 scaleU, S16 faceID, BO
 //static
 S32 LLProfile::getNumNGonPoints(const LLProfileParams& params, S32 sides, F32 offset, F32 bevel, F32 ang_scale, S32 split)
 { // this is basically LLProfile::genNGon stripped down to only the operations that influence the number of points
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
 	S32 np = 0;
 
 	// Generate an n-sided "circular" path.
@@ -486,8 +469,6 @@ S32 LLProfile::getNumNGonPoints(const LLProfileParams& params, S32 sides, F32 of
 // filleted and chamfered corners
 void LLProfile::genNGon(const LLProfileParams& params, S32 sides, F32 offset, F32 bevel, F32 ang_scale, S32 split)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	// Generate an n-sided "circular" path.
 	// 0 is (1,0), and we go counter-clockwise along a circular path from there.
 	const F32 tableScale[] = { 1, 1, 1, 0.5f, 0.707107f, 0.53f, 0.525f, 0.5f };
@@ -741,8 +722,6 @@ LLProfile::Face* LLProfile::addHole(const LLProfileParams& params, BOOL flat, F3
 S32 LLProfile::getNumPoints(const LLProfileParams& params, BOOL path_open,F32 detail, S32 split,
 						 BOOL is_sculpted, S32 sculpt_size)
 { // this is basically LLProfile::generate stripped down to only operations that influence the number of points
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	if (detail < MIN_LOD)
 	{
 		detail = MIN_LOD;
@@ -853,8 +832,6 @@ S32 LLProfile::getNumPoints(const LLProfileParams& params, BOOL path_open,F32 de
 BOOL LLProfile::generate(const LLProfileParams& params, BOOL path_open,F32 detail, S32 split,
 						 BOOL is_sculpted, S32 sculpt_size)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	if ((!mDirty) && (!is_sculpted))
 	{
 		return FALSE;
@@ -1127,8 +1104,6 @@ BOOL LLProfile::generate(const LLProfileParams& params, BOOL path_open,F32 detai
 
 BOOL LLProfileParams::importFile(LLFILE *fp)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	const S32 BUFSIZE = 16384;
 	char buffer[BUFSIZE];	/* Flawfinder: ignore */
 	// *NOTE: changing the size or type of these buffers will require
@@ -1204,8 +1179,6 @@ BOOL LLProfileParams::exportFile(LLFILE *fp) const
 
 BOOL LLProfileParams::importLegacyStream(std::istream& input_stream)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	const S32 BUFSIZE = 16384;
 	char buffer[BUFSIZE];	/* Flawfinder: ignore */
 	// *NOTE: changing the size or type of these buffers will require
@@ -1297,7 +1270,6 @@ bool LLProfileParams::fromLLSD(LLSD& sd)
 
 void LLProfileParams::copyParams(const LLProfileParams &params)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
 	setCurveType(params.getCurveType());
 	setBegin(params.getBegin());
 	setEnd(params.getEnd());
@@ -1514,8 +1486,6 @@ const LLVector2 LLPathParams::getEndScale() const
 
 S32 LLPath::getNumPoints(const LLPathParams& params, F32 detail)
 { // this is basically LLPath::generate stripped down to only the operations that influence the number of points
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	if (detail < MIN_LOD)
 	{
 		detail = MIN_LOD;
@@ -1565,8 +1535,6 @@ S32 LLPath::getNumPoints(const LLPathParams& params, F32 detail)
 BOOL LLPath::generate(const LLPathParams& params, F32 detail, S32 split,
 					  BOOL is_sculpted, S32 sculpt_size)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	if ((!mDirty) && (!is_sculpted))
 	{
 		return FALSE;
@@ -1694,8 +1662,6 @@ BOOL LLPath::generate(const LLPathParams& params, F32 detail, S32 split,
 BOOL LLDynamicPath::generate(const LLPathParams& params, F32 detail, S32 split,
 							 BOOL is_sculpted, S32 sculpt_size)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	mOpen = TRUE; // Draw end caps
 	if (getPathLength() == 0)
 	{
@@ -1717,8 +1683,6 @@ BOOL LLDynamicPath::generate(const LLPathParams& params, F32 detail, S32 split,
 
 BOOL LLPathParams::importFile(LLFILE *fp)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	const S32 BUFSIZE = 16384;
 	char buffer[BUFSIZE];	/* Flawfinder: ignore */
 	// *NOTE: changing the size or type of these buffers will require
@@ -1863,8 +1827,6 @@ BOOL LLPathParams::exportFile(LLFILE *fp) const
 
 BOOL LLPathParams::importLegacyStream(std::istream& input_stream)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	const S32 BUFSIZE = 16384;
 	char buffer[BUFSIZE];	/* Flawfinder: ignore */
 	// *NOTE: changing the size or type of these buffers will require
@@ -2072,12 +2034,11 @@ S32 LLVolume::sNumMeshPoints = 0;
 LLVolume::LLVolume(const LLVolumeParams &params, const F32 detail, const BOOL generate_single_face, const BOOL is_unique)
 	: mParams(params)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	mUnique = is_unique;
 	mFaceMask = 0x0;
 	mDetail = detail;
 	mSculptLevel = -2;
+	mSurfaceArea = 1.f; //only calculated for sculpts, defaults to 1 for all other prims
 	mIsMeshAssetLoaded = FALSE;
 	mLODScaleBias.setVec(1,1,1);
 	mHullPoints = NULL;
@@ -2144,7 +2105,6 @@ LLVolume::~LLVolume()
 
 BOOL LLVolume::generate()
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
 	llassert_always(mProfilep);
 	
 	//Added 10.03.05 Dave Parks
@@ -2740,8 +2700,6 @@ S32	LLVolume::getNumFaces() const
 
 void LLVolume::createVolumeFaces()
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-
 	if (mGenerateSingleFace)
 	{
 		// do nothing
@@ -2903,7 +2861,7 @@ F32 LLVolume::sculptGetSurfaceArea()
 			// compute the area of the quad by taking the length of the cross product of the two triangles
 			LLVector3 cross1 = (p1 - p2) % (p1 - p3);
 			LLVector3 cross2 = (p4 - p2) % (p4 - p3);
-			area += (cross1.magVec() + cross2.magVec()) / 2.0;
+			area += (cross1.magVec() + cross2.magVec()) / 2.f;
 		}
 	}
 
@@ -2913,8 +2871,6 @@ F32 LLVolume::sculptGetSurfaceArea()
 // create placeholder shape
 void LLVolume::sculptGeneratePlaceholder()
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	S32 sizeS = mPathp->mPath.size();
 	S32 sizeT = mProfilep->mProfile.size();
 	
@@ -2951,9 +2907,6 @@ void LLVolume::sculptGenerateMapVertices(U16 sculpt_width, U16 sculpt_height, S8
 	BOOL sculpt_mirror = sculpt_type & LL_SCULPT_FLAG_MIRROR;
 	BOOL reverse_horizontal = (sculpt_invert ? !sculpt_mirror : sculpt_mirror);  // XOR
 	
-	
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	S32 sizeS = mPathp->mPath.size();
 	S32 sizeT = mProfilep->mProfile.size();
 	
@@ -3102,7 +3055,6 @@ void sculpt_calc_mesh_resolution(U16 width, U16 height, U8 type, F32 detail, S32
 // sculpt replaces generate() for sculpted surfaces
 void LLVolume::sculpt(U16 sculpt_width, U16 sculpt_height, S8 sculpt_components, const U8* sculpt_data, S32 sculpt_level)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
     U8 sculpt_type = mParams.getSculptType();
 
 	BOOL data_is_empty = FALSE;
@@ -3144,6 +3096,8 @@ void LLVolume::sculpt(U16 sculpt_width, U16 sculpt_height, S8 sculpt_components,
 		{
 			F32 area = sculptGetSurfaceArea();
 
+			mSurfaceArea = area;
+
 			const F32 SCULPT_MAX_AREA = 384.f;
 
 			if (area < SCULPT_MIN_AREA || area > SCULPT_MAX_AREA)
@@ -3237,7 +3191,6 @@ bool LLVolumeParams::operator<(const LLVolumeParams &params) const
 
 void LLVolumeParams::copyParams(const LLVolumeParams &params)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
 	mProfileParams.copyParams(params.mProfileParams);
 	mPathParams.copyParams(params.mPathParams);
 	mSculptID = params.getSculptID();
@@ -3609,8 +3562,6 @@ bool LLVolumeParams::validate(U8 prof_curve, F32 prof_begin, F32 prof_end, F32 h
 
 S32 *LLVolume::getTriangleIndices(U32 &num_indices) const
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	S32 expected_num_triangle_indices = getNumTriangleIndices();
 	if (expected_num_triangle_indices > MAX_VOLUME_TRIANGLE_INDICES)
 	{
@@ -4338,8 +4289,6 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
 										  const LLMatrix3& norm_mat_in,
 										  S32 face_mask)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-
 	LLMatrix4a mat;
 	mat.loadu(mat_in);
 
@@ -4617,18 +4566,83 @@ S32 LLVolume::lineSegmentIntersect(const LLVector4a& start, const LLVector4a& en
 				genBinormals(i);
 			}
 
-			if (!face.mOctree)
-			{
-				face.createOctree();
-			}
-			
-			//LLVector4a* p = (LLVector4a*) face.mPositions;
+			if (isUnique())
+			{ //don't bother with an octree for flexi volumes
+				U32 tri_count = face.mNumIndices/3;
+
+				for (U32 j = 0; j < tri_count; ++j)
+				{
+					U16 idx0 = face.mIndices[j*3+0];
+					U16 idx1 = face.mIndices[j*3+1];
+					U16 idx2 = face.mIndices[j*3+2];
+
+					const LLVector4a& v0 = face.mPositions[idx0];
+					const LLVector4a& v1 = face.mPositions[idx1];
+					const LLVector4a& v2 = face.mPositions[idx2];
+				
+					F32 a,b,t;
 
-			LLOctreeTriangleRayIntersect intersect(start, dir, &face, &closest_t, intersection, tex_coord, normal, bi_normal);
-			intersect.traverse(face.mOctree);
-			if (intersect.mHitFace)
+					if (LLTriangleRayIntersect(v0, v1, v2,
+							start, dir, a, b, t))
+					{
+						if ((t >= 0.f) &&      // if hit is after start
+							(t <= 1.f) &&      // and before end
+							(t < closest_t))   // and this hit is closer
+						{
+							closest_t = t;
+							hit_face = i;
+
+							if (intersection != NULL)
+							{
+								LLVector4a intersect = dir;
+								intersect.mul(closest_t);
+								intersect.add(start);
+								intersection->set(intersect.getF32ptr());
+							}
+
+
+							if (tex_coord != NULL)
+							{
+								LLVector2* tc = (LLVector2*) face.mTexCoords;
+								*tex_coord = ((1.f - a - b)  * tc[idx0] +
+									a              * tc[idx1] +
+									b              * tc[idx2]);
+
+							}
+
+							if (normal!= NULL)
+							{
+								LLVector4* norm = (LLVector4*) face.mNormals;
+
+								*normal		= ((1.f - a - b)  * LLVector3(norm[idx0]) + 
+									a              * LLVector3(norm[idx1]) +
+									b              * LLVector3(norm[idx2]));
+							}
+
+							if (bi_normal != NULL)
+							{
+								LLVector4* binormal = (LLVector4*) face.mBinormals;
+								*bi_normal = ((1.f - a - b)  * LLVector3(binormal[idx0]) + 
+										a              * LLVector3(binormal[idx1]) +
+										b              * LLVector3(binormal[idx2]));
+							}
+						}
+					}
+				}
+			}
+			else
 			{
-				hit_face = i;
+				if (!face.mOctree)
+				{
+					face.createOctree();
+				}
+			
+				LLOctreeTriangleRayIntersect intersect(start, dir, &face, &closest_t, intersection, tex_coord, normal, bi_normal);
+				intersect.traverse(face.mOctree);
+				if (intersect.mHitFace)
+				{
+					hit_face = i;
+				}
 			}
 		}		
 	}
@@ -4736,241 +4750,8 @@ BOOL equalTriangle(const S32 *a, const S32 *b)
 	return FALSE;
 }
 
-BOOL LLVolume::cleanupTriangleData( const S32 num_input_vertices,
-									const std::vector<Point>& input_vertices,
-									const S32 num_input_triangles,
-									S32 *input_triangles,
-									S32 &num_output_vertices,
-									LLVector3 **output_vertices,
-									S32 &num_output_triangles,
-									S32 **output_triangles)
-{
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
-	/* Testing: avoid any cleanup
-	static BOOL skip_cleanup = TRUE;
-	if ( skip_cleanup )
-	{
-		num_output_vertices = num_input_vertices;
-		num_output_triangles = num_input_triangles;
-
-		*output_vertices = new LLVector3[num_input_vertices];
-		for (S32 index = 0; index < num_input_vertices; index++)
-		{
-			(*output_vertices)[index] = input_vertices[index].mPos;
-		}
-
-		*output_triangles = new S32[num_input_triangles*3];
-		memcpy(*output_triangles, input_triangles, 3*num_input_triangles*sizeof(S32));		// Flawfinder: ignore
-		return TRUE;
-	}
-	*/
-
-	// Here's how we do this:
-	// Create a structure which contains the original vertex index and the
-	// LLVector3 data.
-	// "Sort" the data by the vectors
-	// Create an array the size of the old vertex list, with a mapping of
-	// old indices to new indices.
-	// Go through triangles, shift so the lowest index is first
-	// Sort triangles by first index
-	// Remove duplicate triangles
-	// Allocate and pack new triangle data.
-
-	//LLTimer cleanupTimer;
-	//llinfos << "In vertices: " << num_input_vertices << llendl;
-	//llinfos << "In triangles: " << num_input_triangles << llendl;
-
-	S32 i;
-	typedef std::multiset<LLVertexIndexPair*, lessVertex> vertex_set_t;
-	vertex_set_t vertex_list;
-
-	LLVertexIndexPair *pairp = NULL;
-	for (i = 0; i < num_input_vertices; i++)
-	{
-		LLVertexIndexPair *new_pairp = new LLVertexIndexPair(input_vertices[i].mPos, i);
-		vertex_list.insert(new_pairp);
-	}
-
-	// Generate the vertex mapping and the list of vertices without
-	// duplicates.  This will crash if there are no vertices.
-	llassert(num_input_vertices > 0); // check for no vertices!
-	S32 *vertex_mapping = new S32[num_input_vertices];
-	LLVector3 *new_vertices = new LLVector3[num_input_vertices];
-	LLVertexIndexPair *prev_pairp = NULL;
-
-	S32 new_num_vertices;
-
-	new_num_vertices = 0;
-	for (vertex_set_t::iterator iter = vertex_list.begin(),
-			 end = vertex_list.end();
-		 iter != end; iter++)
-	{
-		pairp = *iter;
-		if (!prev_pairp || ((pairp->mVertex - prev_pairp->mVertex).magVecSquared() >= VERTEX_SLOP_SQRD))	
-		{
-			new_vertices[new_num_vertices] = pairp->mVertex;
-			//llinfos << "Added vertex " << new_num_vertices << " : " << pairp->mVertex << llendl;
-			new_num_vertices++;
-			// Update the previous
-			prev_pairp = pairp;
-		}
-		else
-		{
-			//llinfos << "Removed duplicate vertex " << pairp->mVertex << ", distance magVecSquared() is " << (pairp->mVertex - prev_pairp->mVertex).magVecSquared() << llendl;
-		}
-		vertex_mapping[pairp->mIndex] = new_num_vertices - 1;
-	}
-
-	// Iterate through triangles and remove degenerates, re-ordering vertices
-	// along the way.
-	S32 *new_triangles = new S32[num_input_triangles * 3];
-	S32 new_num_triangles = 0;
-
-	for (i = 0; i < num_input_triangles; i++)
-	{
-		S32 v1 = i*3;
-		S32 v2 = v1 + 1;
-		S32 v3 = v1 + 2;
-
-		//llinfos << "Checking triangle " << input_triangles[v1] << ":" << input_triangles[v2] << ":" << input_triangles[v3] << llendl;
-		input_triangles[v1] = vertex_mapping[input_triangles[v1]];
-		input_triangles[v2] = vertex_mapping[input_triangles[v2]];
-		input_triangles[v3] = vertex_mapping[input_triangles[v3]];
-
-		if ((input_triangles[v1] == input_triangles[v2])
-			|| (input_triangles[v1] == input_triangles[v3])
-			|| (input_triangles[v2] == input_triangles[v3]))
-		{
-			//llinfos << "Removing degenerate triangle " << input_triangles[v1] << ":" << input_triangles[v2] << ":" << input_triangles[v3] << llendl;
-			// Degenerate triangle, skip
-			continue;
-		}
-
-		if (input_triangles[v1] < input_triangles[v2])
-		{
-			if (input_triangles[v1] < input_triangles[v3])
-			{
-				// (0 < 1) && (0 < 2)
-				new_triangles[new_num_triangles*3] = input_triangles[v1];
-				new_triangles[new_num_triangles*3+1] = input_triangles[v2];
-				new_triangles[new_num_triangles*3+2] = input_triangles[v3];
-			}
-			else
-			{
-				// (0 < 1) && (2 < 0)
-				new_triangles[new_num_triangles*3] = input_triangles[v3];
-				new_triangles[new_num_triangles*3+1] = input_triangles[v1];
-				new_triangles[new_num_triangles*3+2] = input_triangles[v2];
-			}
-		}
-		else if (input_triangles[v2] < input_triangles[v3])
-		{
-			// (1 < 0) && (1 < 2)
-			new_triangles[new_num_triangles*3] = input_triangles[v2];
-			new_triangles[new_num_triangles*3+1] = input_triangles[v3];
-			new_triangles[new_num_triangles*3+2] = input_triangles[v1];
-		}
-		else
-		{
-			// (1 < 0) && (2 < 1)
-			new_triangles[new_num_triangles*3] = input_triangles[v3];
-			new_triangles[new_num_triangles*3+1] = input_triangles[v1];
-			new_triangles[new_num_triangles*3+2] = input_triangles[v2];
-		}
-		new_num_triangles++;
-	}
-
-	if (new_num_triangles == 0)
-	{
-		llwarns << "Created volume object with 0 faces." << llendl;
-		delete[] new_triangles;
-		delete[] vertex_mapping;
-		delete[] new_vertices;
-		return FALSE;
-	}
-
-	typedef std::set<S32*, lessTriangle> triangle_set_t;
-	triangle_set_t triangle_list;
-
-	for (i = 0; i < new_num_triangles; i++)
-	{
-		triangle_list.insert(&new_triangles[i*3]);
-	}
-
-	// Sort through the triangle list, and delete duplicates
-
-	S32 *prevp = NULL;
-	S32 *curp = NULL;
-
-	S32 *sorted_tris = new S32[new_num_triangles*3];
-	S32 cur_tri = 0;
-	for (triangle_set_t::iterator iter = triangle_list.begin(),
-			 end = triangle_list.end();
-		 iter != end; iter++)
-	{
-		curp = *iter;
-		if (!prevp || !equalTriangle(prevp, curp))
-		{
-			//llinfos << "Added triangle " << *curp << ":" << *(curp+1) << ":" << *(curp+2) << llendl;
-			sorted_tris[cur_tri*3] = *curp;
-			sorted_tris[cur_tri*3+1] = *(curp+1);
-			sorted_tris[cur_tri*3+2] = *(curp+2);
-			cur_tri++;
-			prevp = curp;
-		}
-		else
-		{
-			//llinfos << "Skipped triangle " << *curp << ":" << *(curp+1) << ":" << *(curp+2) << llendl;
-		}
-	}
-
-	*output_vertices = new LLVector3[new_num_vertices];
-	num_output_vertices = new_num_vertices;
-	for (i = 0; i < new_num_vertices; i++)
-	{
-		(*output_vertices)[i] = new_vertices[i];
-	}
-
-	*output_triangles = new S32[cur_tri*3];
-	num_output_triangles = cur_tri;
-	memcpy(*output_triangles, sorted_tris, 3*cur_tri*sizeof(S32));		/* Flawfinder: ignore */
-
-	/*
-	llinfos << "Out vertices: " << num_output_vertices << llendl;
-	llinfos << "Out triangles: " << num_output_triangles << llendl;
-	for (i = 0; i < num_output_vertices; i++)
-	{
-		llinfos << i << ":" << (*output_vertices)[i] << llendl;
-	}
-	for (i = 0; i < num_output_triangles; i++)
-	{
-		llinfos << i << ":" << (*output_triangles)[i*3] << ":" << (*output_triangles)[i*3+1] << ":" << (*output_triangles)[i*3+2] << llendl;
-	}
-	*/
-
-	//llinfos << "Out vertices: " << num_output_vertices << llendl;
-	//llinfos << "Out triangles: " << num_output_triangles << llendl;
-	delete[] vertex_mapping;
-	vertex_mapping = NULL;
-	delete[] new_vertices;
-	new_vertices = NULL;
-	delete[] new_triangles;
-	new_triangles = NULL;
-	delete[] sorted_tris;
-	sorted_tris = NULL;
-	triangle_list.clear();
-	std::for_each(vertex_list.begin(), vertex_list.end(), DeletePointer());
-	vertex_list.clear();
-	
-	return TRUE;
-}
-
-
 BOOL LLVolumeParams::importFile(LLFILE *fp)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	//llinfos << "importing volume" << llendl;
 	const S32 BUFSIZE = 16384;
 	char buffer[BUFSIZE];	/* Flawfinder: ignore */
@@ -5025,8 +4806,6 @@ BOOL LLVolumeParams::exportFile(LLFILE *fp) const
 
 BOOL LLVolumeParams::importLegacyStream(std::istream& input_stream)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	//llinfos << "importing volume" << llendl;
 	const S32 BUFSIZE = 16384;
 	// *NOTE: changing the size or type of this buffer will require
@@ -5066,8 +4845,6 @@ BOOL LLVolumeParams::importLegacyStream(std::istream& input_stream)
 
 BOOL LLVolumeParams::exportLegacyStream(std::ostream& output_stream) const
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	output_stream <<"\tshape 0\n";
 	output_stream <<"\t{\n";
 	mPathParams.exportLegacyStream(output_stream);
@@ -5822,7 +5599,7 @@ F32 find_vertex_score(LLVCacheVertexData& data)
 	}
 
 	//bonus points for having low valence
-	F32 valence_boost = powf(data.mActiveTriangles, -FindVertexScore_ValenceBoostPower);
+	F32 valence_boost = powf((F32)data.mActiveTriangles, -FindVertexScore_ValenceBoostPower);
 	score += FindVertexScore_ValenceBoostScale * valence_boost;
 
 	return score;
@@ -6283,8 +6060,6 @@ void	LerpPlanarVertex(LLVolumeFace::VertexData& v0,
 
 BOOL LLVolumeFace::createUnCutCubeCap(LLVolume* volume, BOOL partial_build)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	const std::vector<LLVolume::Point>& mesh = volume->getMesh();
 	const std::vector<LLVector3>& profile = volume->getProfile().mProfile;
 	S32 max_s = volume->getProfile().getTotal();
@@ -6435,8 +6210,6 @@ BOOL LLVolumeFace::createUnCutCubeCap(LLVolume* volume, BOOL partial_build)
 
 BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	if (!(mTypeMask & HOLLOW_MASK) && 
 		!(mTypeMask & OPEN_MASK) && 
 		((volume->getParams().getPathParams().getBegin()==0.0f)&&
@@ -6823,8 +6596,6 @@ BOOL LLVolumeFace::createCap(LLVolume* volume, BOOL partial_build)
 
 void LLVolumeFace::createBinormals()
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	if (!mBinormals)
 	{
 		allocateBinormals(mNumVertices);
@@ -6894,14 +6665,14 @@ void LLVolumeFace::resizeVertices(S32 num_verts)
 	if (num_verts)
 	{
 		mPositions = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		assert_aligned(mPositions, 16);
+		ll_assert_aligned(mPositions, 16);
 		mNormals = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		assert_aligned(mNormals, 16);
+		ll_assert_aligned(mNormals, 16);
 
 		//pad texture coordinate block end to allow for QWORD reads
 		S32 size = ((num_verts*sizeof(LLVector2)) + 0xF) & ~0xF;
 		mTexCoords = (LLVector2*) ll_aligned_malloc_16(size);
-		assert_aligned(mTexCoords, 16);
+		ll_assert_aligned(mTexCoords, 16);
 	}
 	else
 	{
@@ -6922,17 +6693,21 @@ void LLVolumeFace::pushVertex(const LLVector4a& pos, const LLVector4a& norm, con
 {
 	S32 new_verts = mNumVertices+1;
 	S32 new_size = new_verts*16;
-//	S32 old_size = mNumVertices*16;
+	S32 old_size = mNumVertices*16;
 
 	//positions
-	mPositions = (LLVector4a*) realloc(mPositions, new_size);
+	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_size, old_size);
+	ll_assert_aligned(mPositions,16);
 	
 	//normals
-	mNormals = (LLVector4a*) realloc(mNormals, new_size);
-	
+	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_size, old_size);
+	ll_assert_aligned(mNormals,16);
+
 	//tex coords
 	new_size = ((new_verts*8)+0xF) & ~0xF;
-	mTexCoords = (LLVector2*) realloc(mTexCoords, new_size);
+	old_size = ((mNumVertices*8)+0xF) & ~0xF;
+	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, new_size, old_size);
+	ll_assert_aligned(mTexCoords,16);
 	
 
 	//just clear binormals
@@ -6985,7 +6760,8 @@ void LLVolumeFace::pushIndex(const U16& idx)
 	S32 old_size = ((mNumIndices*2)+0xF) & ~0xF;
 	if (new_size != old_size)
 	{
-		mIndices = (U16*) realloc(mIndices, new_size);
+		mIndices = (U16*) ll_aligned_realloc_16(mIndices, new_size, old_size);
+		ll_assert_aligned(mIndices,16);
 	}
 	
 	mIndices[mNumIndices++] = idx;
@@ -7026,12 +6802,12 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat
 	}
 
 	//allocate new buffer space
-	mPositions = (LLVector4a*) realloc(mPositions, new_count*sizeof(LLVector4a));
-	assert_aligned(mPositions, 16);
-	mNormals = (LLVector4a*) realloc(mNormals, new_count*sizeof(LLVector4a));
-	assert_aligned(mNormals, 16);
-	mTexCoords = (LLVector2*) realloc(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF);
-	assert_aligned(mTexCoords, 16);
+	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_count*sizeof(LLVector4a), mNumVertices*sizeof(LLVector4a));
+	ll_assert_aligned(mPositions, 16);
+	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_count*sizeof(LLVector4a), mNumVertices*sizeof(LLVector4a));
+	ll_assert_aligned(mNormals, 16);
+	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF, (mNumVertices*sizeof(LLVector2)+0xF) & ~0xF);
+	ll_assert_aligned(mTexCoords, 16);
 	
 	mNumVertices = new_count;
 
@@ -7077,7 +6853,7 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat
 	new_count = mNumIndices + face.mNumIndices;
 
 	//allocate new index buffer
-	mIndices = (U16*) realloc(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF);
+	mIndices = (U16*) ll_aligned_realloc_16(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF, (mNumIndices*sizeof(U16)+0xF) & ~0xF);
 	
 	//get destination address into new index buffer
 	U16* dst_idx = mIndices+mNumIndices;
@@ -7091,8 +6867,6 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat
 
 BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
-	
 	BOOL flat = mTypeMask & FLAT_MASK;
 
 	U8 sculpt_type = volume->getParams().getSculptType();
diff --git a/indra/llmath/llvolume.h b/indra/llmath/llvolume.h
index afd1ec5eed..c845556557 100644
--- a/indra/llmath/llvolume.h
+++ b/indra/llmath/llvolume.h
@@ -54,6 +54,7 @@ class LLVolumeTriangle;
 #include "llstrider.h"
 #include "v4coloru.h"
 #include "llrefcount.h"
+#include "llpointer.h"
 #include "llfile.h"
 
 //============================================================================
@@ -919,6 +920,10 @@ public:
 	LLVector2*  mTexCoords;
 	U16* mIndices;
 
+	//vertex buffer filled in by LLFace to cache this volume face geometry in vram 
+	// (declared as a LLPointer to LLRefCount to avoid dependency on LLVertexBuffer)
+	mutable LLPointer<LLRefCount> mVertexBuffer; 
+
 	std::vector<S32>	mEdge;
 
 	//list of skin weights for rigged volumes
@@ -963,6 +968,7 @@ public:
 	S32	getNumFaces() const;
 	S32 getNumVolumeFaces() const							{ return mVolumeFaces.size(); }
 	F32 getDetail() const									{ return mDetail; }
+	F32 getSurfaceArea() const								{ return mSurfaceArea; }
 	const LLVolumeParams& getParams() const					{ return mParams; }
 	LLVolumeParams getCopyOfParams() const					{ return mParams; }
 	const LLProfile& getProfile() const						{ return *mProfilep; }
@@ -1017,17 +1023,6 @@ public:
 								   LLVector3* normal = NULL,
 								   LLVector3* bi_normal = NULL);
 	
-	// The following cleans up vertices and triangles,
-	// getting rid of degenerate triangles and duplicate vertices,
-	// and allocates new arrays with the clean data.
-	static BOOL cleanupTriangleData( const S32 num_input_vertices,
-								const std::vector<Point> &input_vertices,
-								const S32 num_input_triangles,
-								S32 *input_triangles,
-								S32 &num_output_vertices,
-								LLVector3 **output_vertices,
-								S32 &num_output_triangles,
-								S32 **output_triangles);
 	LLFaceID generateFaceMask();
 
 	BOOL isFaceMaskValid(LLFaceID face_mask);
@@ -1065,6 +1060,7 @@ public:
 	BOOL mUnique;
 	F32 mDetail;
 	S32 mSculptLevel;
+	F32 mSurfaceArea; //unscaled surface area
 	BOOL mIsMeshAssetLoaded;
 	
 	LLVolumeParams mParams;
diff --git a/indra/llmath/llvolumemgr.cpp b/indra/llmath/llvolumemgr.cpp
index c60b750088..9083273ee5 100644
--- a/indra/llmath/llvolumemgr.cpp
+++ b/indra/llmath/llvolumemgr.cpp
@@ -26,7 +26,6 @@
 #include "linden_common.h"
 
 #include "llvolumemgr.h"
-#include "llmemtype.h"
 #include "llvolume.h"
 
 
@@ -182,7 +181,6 @@ void LLVolumeMgr::insertGroup(LLVolumeLODGroup* volgroup)
 // protected
 LLVolumeLODGroup* LLVolumeMgr::createNewGroup(const LLVolumeParams& volume_params)
 {
-	LLMemType m1(LLMemType::MTYPE_VOLUME);
 	LLVolumeLODGroup* volgroup = new LLVolumeLODGroup(volume_params);
 	insertGroup(volgroup);
 	return volgroup;
@@ -297,7 +295,6 @@ LLVolume* LLVolumeLODGroup::refLOD(const S32 detail)
 	mRefs++;
 	if (mVolumeLODs[detail].isNull())
 	{
-		LLMemType m1(LLMemType::MTYPE_VOLUME);
 		mVolumeLODs[detail] = new LLVolume(mVolumeParams, mDetailScales[detail]);
 	}
 	mLODRefs[detail]++;
diff --git a/indra/llmath/llvolumeoctree.cpp b/indra/llmath/llvolumeoctree.cpp
index b5a935c2b5..cc83cb7235 100644
--- a/indra/llmath/llvolumeoctree.cpp
+++ b/indra/llmath/llvolumeoctree.cpp
@@ -131,7 +131,7 @@ void LLOctreeTriangleRayIntersect::traverse(const LLOctreeNode<LLVolumeTriangle>
 void LLOctreeTriangleRayIntersect::visit(const LLOctreeNode<LLVolumeTriangle>* node)
 {
 	for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter = 
-			node->getData().begin(); iter != node->getData().end(); ++iter)
+			node->getDataBegin(); iter != node->getDataEnd(); ++iter)
 	{
 		const LLVolumeTriangle* tri = *iter;
 
@@ -236,8 +236,8 @@ void LLVolumeOctreeValidate::visit(const LLOctreeNode<LLVolumeTriangle>* branch)
 	}
 
 	//children fit, check data
-	for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter = branch->getData().begin(); 
-			iter != branch->getData().end(); ++iter)
+	for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter = branch->getDataBegin(); 
+			iter != branch->getDataEnd(); ++iter)
 	{
 		const LLVolumeTriangle* tri = *iter;
 
diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h
index 688d91dc40..9ae34a0c4e 100644
--- a/indra/llmath/llvolumeoctree.h
+++ b/indra/llmath/llvolumeoctree.h
@@ -37,9 +37,19 @@
 class LLVolumeTriangle : public LLRefCount
 {
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVolumeTriangle()
 	{
-		
+		mBinIndex = -1;	
 	}
 
 	LLVolumeTriangle(const LLVolumeTriangle& rhs)
@@ -58,21 +68,38 @@ public:
 	
 	}
 
-	LLVector4a mPositionGroup;
+	LL_ALIGN_16(LLVector4a mPositionGroup);
 
 	const LLVector4a* mV[3];
 	U16 mIndex[3];
 
 	F32 mRadius;
+	mutable S32 mBinIndex;
+
 
 	virtual const LLVector4a& getPositionGroup() const;
 	virtual const F32& getBinRadius() const;
+	
+	S32 getBinIndex() const { return mBinIndex; }
+	void setBinIndex(S32 idx) const { mBinIndex = idx; }
+
+
 };
 
 class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
 {
 public:
 	
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);
 	~LLVolumeOctreeListener();
 	
@@ -99,8 +126,8 @@ public:
 	
 
 public:
-	LLVector4a mBounds[2]; // bounding box (center, size) of this node and all its children (tight fit to objects)
-	LLVector4a mExtents[2]; // extents (min, max) of this node and all its children
+	LL_ALIGN_16(LLVector4a mBounds[2]); // bounding box (center, size) of this node and all its children (tight fit to objects)
+	LL_ALIGN_16(LLVector4a mExtents[2]); // extents (min, max) of this node and all its children
 };
 
 class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle>
diff --git a/indra/llmath/m4math.cpp b/indra/llmath/m4math.cpp
index bad4deb4de..6a1b4143cf 100644
--- a/indra/llmath/m4math.cpp
+++ b/indra/llmath/m4math.cpp
@@ -858,25 +858,25 @@ LLSD LLMatrix4::getValue() const
 
 void LLMatrix4::setValue(const LLSD& data) 
 {
-	mMatrix[0][0] = data[0].asReal();
-	mMatrix[0][1] = data[1].asReal();
-	mMatrix[0][2] = data[2].asReal();
-	mMatrix[0][3] = data[3].asReal();
-
-	mMatrix[1][0] = data[4].asReal();
-	mMatrix[1][1] = data[5].asReal();
-	mMatrix[1][2] = data[6].asReal();
-	mMatrix[1][3] = data[7].asReal();
-
-	mMatrix[2][0] = data[8].asReal();
-	mMatrix[2][1] = data[9].asReal();
-	mMatrix[2][2] = data[10].asReal();
-	mMatrix[2][3] = data[11].asReal();
-
-	mMatrix[3][0] = data[12].asReal();
-	mMatrix[3][1] = data[13].asReal();
-	mMatrix[3][2] = data[14].asReal();
-	mMatrix[3][3] = data[15].asReal();
+	mMatrix[0][0] = (F32)data[0].asReal();
+	mMatrix[0][1] = (F32)data[1].asReal();
+	mMatrix[0][2] = (F32)data[2].asReal();
+	mMatrix[0][3] = (F32)data[3].asReal();
+
+	mMatrix[1][0] = (F32)data[4].asReal();
+	mMatrix[1][1] = (F32)data[5].asReal();
+	mMatrix[1][2] = (F32)data[6].asReal();
+	mMatrix[1][3] = (F32)data[7].asReal();
+
+	mMatrix[2][0] = (F32)data[8].asReal();
+	mMatrix[2][1] = (F32)data[9].asReal();
+	mMatrix[2][2] = (F32)data[10].asReal();
+	mMatrix[2][3] = (F32)data[11].asReal();
+
+	mMatrix[3][0] = (F32)data[12].asReal();
+	mMatrix[3][1] = (F32)data[13].asReal();
+	mMatrix[3][2] = (F32)data[14].asReal();
+	mMatrix[3][3] = (F32)data[15].asReal();
 }
 
 
diff --git a/indra/llmath/tests/alignment_test.cpp b/indra/llmath/tests/alignment_test.cpp
new file mode 100644
index 0000000000..5ee3c45502
--- /dev/null
+++ b/indra/llmath/tests/alignment_test.cpp
@@ -0,0 +1,141 @@
+/**
+ * @file v3dmath_test.cpp
+ * @author Vir
+ * @date 2011-12
+ * @brief v3dmath test cases.
+ *
+ * $LicenseInfo:firstyear=2011&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2011, Linden Research, Inc.
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * 
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
+ * $/LicenseInfo$
+ */
+
+// Tests related to allocating objects with alignment constraints, particularly for SSE support.
+
+#include "linden_common.h"
+#include "../test/lltut.h"
+#include "../llmath.h"
+#include "../llsimdmath.h"
+#include "../llvector4a.h"
+
+namespace tut
+{
+
+#define is_aligned(ptr,alignment) ((reinterpret_cast<uintptr_t>(ptr))%(alignment)==0)
+#define is_aligned_relative(ptr,base_ptr,alignment) ((reinterpret_cast<uintptr_t>(ptr)-reinterpret_cast<uintptr_t>(base_ptr))%(alignment)==0)
+
+struct alignment_test {};
+
+typedef test_group<alignment_test> alignment_test_t;
+typedef alignment_test_t::object alignment_test_object_t;
+tut::alignment_test_t tut_alignment_test("LLAlignment");
+
+LL_ALIGN_PREFIX(16)
+class MyVector4a
+{
+public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void *p)
+	{
+		ll_aligned_free_16(p);
+	}
+
+	void* operator new[](size_t count)
+	{	// try to allocate count bytes for an array
+		return ll_aligned_malloc_16(count);
+	}
+
+	void operator delete[](void *p)
+	{
+		ll_aligned_free_16(p);
+	}
+
+	LLQuad mQ;
+} LL_ALIGN_POSTFIX(16);
+
+
+// Verify that aligned allocators perform as advertised.
+template<> template<>
+void alignment_test_object_t::test<1>()
+{
+#   ifdef LL_DEBUG
+//	skip("This test fails on Windows when compiled in debug mode.");
+#   endif
+	
+	const int num_tests = 7;
+	void *align_ptr;
+	for (int i=0; i<num_tests; i++)
+	{
+		align_ptr = ll_aligned_malloc_16(sizeof(MyVector4a));
+		ensure("ll_aligned_malloc_16 failed", is_aligned(align_ptr,16));
+
+		align_ptr = ll_aligned_realloc_16(align_ptr,2*sizeof(MyVector4a), sizeof(MyVector4a));
+		ensure("ll_aligned_realloc_16 failed", is_aligned(align_ptr,16));
+
+		ll_aligned_free_16(align_ptr);
+
+		align_ptr = ll_aligned_malloc_32(sizeof(MyVector4a));
+		ensure("ll_aligned_malloc_32 failed", is_aligned(align_ptr,32));
+		ll_aligned_free_32(align_ptr);
+	}
+}
+
+// In-place allocation of objects and arrays.
+template<> template<>
+void alignment_test_object_t::test<2>()
+{
+	MyVector4a vec1;
+	ensure("LLAlignment vec1 unaligned", is_aligned(&vec1,16));
+	
+	MyVector4a veca[12];
+	ensure("LLAlignment veca unaligned", is_aligned(veca,16));
+}
+
+// Heap allocation of objects and arrays.
+template<> template<>
+void alignment_test_object_t::test<3>()
+{
+#   ifdef LL_DEBUG
+//	skip("This test fails on Windows when compiled in debug mode.");
+#   endif
+	
+	const int ARR_SIZE = 7;
+	for(int i=0; i<ARR_SIZE; i++)
+	{
+		MyVector4a *vecp = new MyVector4a;
+		ensure("LLAlignment vecp unaligned", is_aligned(vecp,16));
+		delete vecp;
+	}
+
+	MyVector4a *veca = new MyVector4a[ARR_SIZE];
+	//std::cout << "veca base is " << (S32) veca << std::endl;
+	ensure("LLAligment veca base", is_aligned(veca,16));
+	for(int i=0; i<ARR_SIZE; i++)
+	{
+		std::cout << "veca[" << i << "]" << std::endl;
+		ensure("LLAlignment veca member unaligned", is_aligned(&veca[i],16));
+	}
+	delete [] veca; 
+}
+
+}
diff --git a/indra/llmath/v3color.h b/indra/llmath/v3color.h
index 56cb2ae73e..daf3a6857b 100644
--- a/indra/llmath/v3color.h
+++ b/indra/llmath/v3color.h
@@ -33,6 +33,7 @@ class LLVector4;
 #include "llerror.h"
 #include "llmath.h"
 #include "llsd.h"
+#include <string.h>
 
 //  LLColor3 = |r g b|