Diffstat (limited to 'indra/llmath')
-rw-r--r-- | indra/llmath/llmatrix4a.h     | 41
-rw-r--r-- | indra/llmath/lloctree.h       | 13
-rw-r--r-- | indra/llmath/llrigginginfo.h  | 29
-rw-r--r-- | indra/llmath/llvector4a.h     | 11
-rw-r--r-- | indra/llmath/llvector4a.inl   |  4
-rw-r--r-- | indra/llmath/llvolume.cpp     | 25
-rw-r--r-- | indra/llmath/llvolumeoctree.h | 27
-rw-r--r-- | indra/llmath/m4math.cpp       |  9
-rw-r--r-- | indra/llmath/m4math.h         |  2
-rw-r--r-- | indra/llmath/v3math.cpp       |  6
-rw-r--r-- | indra/llmath/v3math.h         |  5
11 files changed, 101 insertions, 71 deletions
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
index 7ba347062f..5291a05607 100644
--- a/indra/llmath/llmatrix4a.h
+++ b/indra/llmath/llmatrix4a.h
@@ -36,6 +36,26 @@ class LLMatrix4a
 public:
     LL_ALIGN_16(LLVector4a mMatrix[4]);
 
+    LLMatrix4a()
+    {
+
+    }
+
+    explicit LLMatrix4a(const LLMatrix4& val)
+    {
+        loadu(val);
+    }
+
+    inline F32* getF32ptr()
+    {
+        return (F32*) &mMatrix;
+    }
+
+    inline const F32* getF32ptr() const
+    {
+        return (F32*)&mMatrix;
+    }
+
     inline void clear()
     {
         mMatrix[0].clear();
@@ -44,6 +64,14 @@ public:
         mMatrix[3].clear();
     }
 
+    inline void setIdentity()
+    {
+        mMatrix[0].set(1.f, 0.f, 0.f, 0.f);
+        mMatrix[1].set(0.f, 1.f, 0.f, 0.f);
+        mMatrix[2].set(0.f, 0.f, 1.f, 0.f);
+        mMatrix[3].set(0.f, 0.f, 0.f, 1.f);
+    }
+
     inline void loadu(const LLMatrix4& src)
     {
         mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]);
@@ -105,7 +133,7 @@ public:
         mMatrix[3].setAdd(a.mMatrix[3],d3);
     }
 
-    inline void rotate(const LLVector4a& v, LLVector4a& res)
+    inline void rotate(const LLVector4a& v, LLVector4a& res) const
     {
         LLVector4a y,z;
 
@@ -151,6 +179,8 @@ public:
     {
         affineTransformSSE(v,res);
     }
+
+    const LLVector4a& getTranslation() const { return mMatrix[3]; }
 };
 
 inline LLVector4a rowMul(const LLVector4a &row, const LLMatrix4a &mat)
@@ -176,6 +206,15 @@ inline void matMul(const LLMatrix4a &a, const LLMatrix4a &b, LLMatrix4a &res)
     res.mMatrix[3] = row3;
 }
 
+//Faster version of matMul wehere res must not be a or b
+inline void matMulUnsafe(const LLMatrix4a &a, const LLMatrix4a &b, LLMatrix4a &res)
+{
+    res.mMatrix[0] = rowMul(a.mMatrix[0], b);
+    res.mMatrix[1] = rowMul(a.mMatrix[1], b);
+    res.mMatrix[2] = rowMul(a.mMatrix[2], b);
+    res.mMatrix[3] = rowMul(a.mMatrix[3], b);
+}
+
 inline std::ostream& operator<<(std::ostream& s, const LLMatrix4a& m)
 {
     s << "[" << m.mMatrix[0] << ", " << m.mMatrix[1] << ", " << m.mMatrix[2] << ", " << m.mMatrix[3] << "]";
diff --git a/indra/llmath/lloctree.h b/indra/llmath/lloctree.h
index 0e2f62f9db..8c4a1304b4 100644
--- a/indra/llmath/lloctree.h
+++ b/indra/llmath/lloctree.h
@@ -74,8 +74,9 @@ public:
 };
 
 template <class T>
-class LLOctreeNode : public LLTreeNode<T>
+class alignas(16) LLOctreeNode : public LLTreeNode<T>
 {
+    LL_ALIGN_NEW
 public:
 
     typedef LLOctreeTraveler<T> oct_traveler;
@@ -91,16 +92,6 @@ public:
     typedef LLOctreeNode<T> oct_node;
     typedef LLOctreeListener<T> oct_listener;
 
-    void* operator new(size_t size)
-    {
-        return ll_aligned_malloc_16(size);
-    }
-
-    void operator delete(void* ptr)
-    {
-        ll_aligned_free_16(ptr);
-    }
-
     LLOctreeNode(  const LLVector4a& center,
                    const LLVector4a& size,
                    BaseType* parent,
diff --git a/indra/llmath/llrigginginfo.h b/indra/llmath/llrigginginfo.h
index b3d6bc2d19..059c6ae082 100644
--- a/indra/llmath/llrigginginfo.h
+++ b/indra/llmath/llrigginginfo.h
@@ -34,9 +34,9 @@
 // Extents are in joint space
 // isRiggedTo is based on the state of all currently associated rigged meshes
 
-LL_ALIGN_PREFIX(16)
-class LLJointRiggingInfo
+class alignas(16) LLJointRiggingInfo
 {
+    LL_ALIGN_NEW
 public:
     LLJointRiggingInfo();
     bool isRiggedTo() const;
@@ -45,31 +45,10 @@ public:
     const LLVector4a *getRiggedExtents() const;
     void merge(const LLJointRiggingInfo& other);
 
-    void* operator new(size_t size)
-    {
-        return ll_aligned_malloc_16(size);
-    }
-
-    void operator delete(void* ptr)
-    {
-        ll_aligned_free_16(ptr);
-    }
-
-    void* operator new[](size_t size)
-    {
-        return ll_aligned_malloc_16(size);
-    }
-
-    void operator delete[](void* ptr)
-    {
-        ll_aligned_free_16(ptr);
-    }
-
-
 private:
-    LL_ALIGN_16(LLVector4a mRiggedExtents[2]);
+    LLVector4a mRiggedExtents[2];
     bool mIsRiggedTo;
-} LL_ALIGN_POSTFIX(16);
+};
 
 // For storing all the rigging info associated with a given avatar or
 // object, keyed by joint_num.
diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h
index 27abf39537..53c8f604f6 100644
--- a/indra/llmath/llvector4a.h
+++ b/indra/llmath/llvector4a.h
@@ -46,11 +46,10 @@ class LLRotation;
 // of this writing, July 08, 2010) about getting it implemented before you resort to
 // LLVector3/LLVector4.
 /////////////////////////////////
 
-struct LLVector4a;
-LL_ALIGN_PREFIX(16)
-struct LLVector4a
+class alignas(16) LLVector4a
 {
+    LL_ALIGN_NEW
 public:
 
     ///////////////////////////////////
@@ -138,10 +137,10 @@ public:
     // BASIC GET/SET
     ////////////////////////////////////
 
-    // Return a "this" as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
+    // Return a "this" as an F32 pointer.
     inline F32* getF32ptr();
 
-    // Return a "this" as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
+    // Return a "this" as a const F32 pointer.
     inline const F32* const getF32ptr() const;
 
     // Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates
@@ -324,7 +323,7 @@ public:
 
 
 private:
     LLQuad mQ;
-} LL_ALIGN_POSTFIX(16);
+};
 
 inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 69d3d01efe..8be1c1b114 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -58,13 +58,13 @@ inline void LLVector4a::store4a(F32* dst) const
 // BASIC GET/SET
 ////////////////////////////////////
 
-// Return a "this" as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
+// Return a "this" as an F32 pointer.
 F32* LLVector4a::getF32ptr()
 {
     return (F32*) &mQ;
 }
 
-// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
+// Return a "this" as a const F32 pointer.
 const F32* const LLVector4a::getF32ptr() const
 {
     return (const F32* const) &mQ;
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index e085fa6ada..130f30bedc 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -383,6 +383,7 @@ public:
 
     virtual void visit(const LLOctreeNode<LLVolumeTriangle>* branch)
     { //this is a depth first traversal, so it's safe to assum all children have complete
         //bounding data
+        LL_PROFILE_ZONE_SCOPED
         LLVolumeOctreeListener* node = (LLVolumeOctreeListener*) branch->getListener(0);
 
@@ -822,6 +823,8 @@ S32 LLProfile::getNumPoints(const LLProfileParams& params, BOOL path_open,F32 de
 BOOL LLProfile::generate(const LLProfileParams& params, BOOL path_open,F32 detail, S32 split,
                          BOOL is_sculpted, S32 sculpt_size)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     if ((!mDirty) && (!is_sculpted))
     {
         return FALSE;
@@ -1302,6 +1305,8 @@ S32 LLPath::getNumNGonPoints(const LLPathParams& params, S32 sides, F32 startOff
 
 void LLPath::genNGon(const LLPathParams& params, S32 sides, F32 startOff, F32 end_scale, F32 twist_scale)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     // Generates a circular path, starting at (1, 0, 0), counterclockwise along the xz plane.
     static const F32 tableScale[] = { 1, 1, 1, 0.5f, 0.707107f, 0.53f, 0.525f, 0.5f };
 
@@ -1536,6 +1541,8 @@ S32 LLPath::getNumPoints(const LLPathParams& params, F32 detail)
 BOOL LLPath::generate(const LLPathParams& params, F32 detail, S32 split,
                       BOOL is_sculpted, S32 sculpt_size)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     if ((!mDirty) && (!is_sculpted))
     {
         return FALSE;
@@ -2112,6 +2119,8 @@ LLVolume::~LLVolume()
 
 BOOL LLVolume::generate()
 {
+    LL_PROFILE_ZONE_SCOPED
+
     LL_CHECK_MEMORY
     llassert_always(mProfilep);
 
@@ -2370,6 +2379,8 @@ bool LLVolumeFace::VertexData::compareNormal(const LLVolumeFace::VertexData& rhs
 
 bool LLVolume::unpackVolumeFaces(std::istream& is, S32 size)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     //input stream is now pointing at a zlib compressed block of LLSD
     //decompress block
     LLSD mdl;
@@ -2755,6 +2766,8 @@ S32 LLVolume::getNumFaces() const
 
 void LLVolume::createVolumeFaces()
 {
+    LL_PROFILE_ZONE_SCOPED
+
     if (mGenerateSingleFace)
     {
         // do nothing
@@ -3720,6 +3733,8 @@ void LLVolume::generateSilhouetteVertices(std::vector<LLVector3> &vertices,
                                           const LLMatrix3& norm_mat_in,
                                           S32 face_mask)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     LLMatrix4a mat;
     mat.loadu(mat_in);
 
@@ -4846,6 +4861,8 @@ void LLVolumeFace::freeData()
 
 BOOL LLVolumeFace::create(LLVolume* volume, BOOL partial_build)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     //tree for this face is no longer valid
     delete mOctree;
     mOctree = NULL;
@@ -5514,6 +5531,8 @@ bool LLVolumeFace::cacheOptimize()
 
 void LLVolumeFace::createOctree(F32 scaler, const LLVector4a& center, const LLVector4a& size)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     if (mOctree)
     {
         return;
@@ -6287,6 +6306,8 @@ void CalculateTangentArray(U32 vertexCount, const LLVector4a *vertex, const LLVe
 
 void LLVolumeFace::createTangents()
 {
+    LL_PROFILE_ZONE_SCOPED
+
     if (!mTangents)
     {
         allocateTangents(mNumVertices);
@@ -6482,6 +6503,8 @@ void LLVolumeFace::fillFromLegacyData(std::vector<LLVolumeFace::VertexData>& v,
 
 BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     LL_CHECK_MEMORY
     BOOL flat = mTypeMask & FLAT_MASK;
 
@@ -6974,6 +6997,8 @@ BOOL LLVolumeFace::createSide(LLVolume* volume, BOOL partial_build)
 
 void CalculateTangentArray(U32 vertexCount, const LLVector4a *vertex, const LLVector4a *normal, const LLVector2 *texcoord, U32 triangleCount, const U16* index_array, LLVector4a *tangent)
 {
+    LL_PROFILE_ZONE_SCOPED
+
     //LLVector4a *tan1 = new LLVector4a[vertexCount * 2];
     LLVector4a* tan1 = (LLVector4a*) ll_aligned_malloc_16(vertexCount*2*sizeof(LLVector4a));
     // new(tan1) LLVector4a;
diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h
index 13150028d8..b2bc440368 100644
--- a/indra/llmath/llvolumeoctree.h
+++ b/indra/llmath/llvolumeoctree.h
@@ -34,19 +34,10 @@
 #include "llvolume.h"
 #include "llvector4a.h"
 
-class LLVolumeTriangle : public LLRefCount
+class alignas(16) LLVolumeTriangle : public LLRefCount
 {
+    LL_ALIGN_NEW
 public:
-    void* operator new(size_t size)
-    {
-        return ll_aligned_malloc_16(size);
-    }
-
-    void operator delete(void* ptr)
-    {
-        ll_aligned_free_16(ptr);
-    }
-
     LLVolumeTriangle()
     {
         mBinIndex = -1;
@@ -86,20 +77,10 @@ public:
 
 };
 
-class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
+class alignas(16) LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
 {
+    LL_ALIGN_NEW
 public:
-
-    void* operator new(size_t size)
-    {
-        return ll_aligned_malloc_16(size);
-    }
-
-    void operator delete(void* ptr)
-    {
-        ll_aligned_free_16(ptr);
-    }
-
     LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);
     ~LLVolumeOctreeListener();
 
diff --git a/indra/llmath/m4math.cpp b/indra/llmath/m4math.cpp
index 3baf1bad18..6e40dae30b 100644
--- a/indra/llmath/m4math.cpp
+++ b/indra/llmath/m4math.cpp
@@ -32,8 +32,7 @@
 #include "m4math.h"
 #include "m3math.h"
 #include "llquaternion.h"
-
-
+#include "llmatrix4a.h"
 
 // LLMatrix4
 
@@ -115,6 +114,12 @@ LLMatrix4::LLMatrix4(const LLQuaternion &q)
     *this = initRotation(q);
 }
 
+LLMatrix4::LLMatrix4(const LLMatrix4a& mat)
+    : LLMatrix4(mat.getF32ptr())
+{
+
+}
+
 LLMatrix4::LLMatrix4(const LLQuaternion &q, const LLVector4 &pos)
 {
     *this = initRotTrans(q, pos);
diff --git a/indra/llmath/m4math.h b/indra/llmath/m4math.h
index bf60adb9b6..b9da970cde 100644
--- a/indra/llmath/m4math.h
+++ b/indra/llmath/m4math.h
@@ -32,6 +32,7 @@
 class LLVector4;
 class LLMatrix3;
 class LLQuaternion;
+class LLMatrix4a;
 
 // NOTA BENE: Currently assuming a right-handed, x-forward, y-left, z-up universe
 
@@ -104,6 +105,7 @@ public:
     explicit LLMatrix4(const F32 *mat);         // Initializes Matrix to values in mat
     explicit LLMatrix4(const LLMatrix3 &mat);   // Initializes Matrix to values in mat and sets position to (0,0,0)
     explicit LLMatrix4(const LLQuaternion &q);  // Initializes Matrix with rotation q and sets position to (0,0,0)
+    explicit LLMatrix4(const LLMatrix4a& mat);
 
     LLMatrix4(const LLMatrix3 &mat, const LLVector4 &pos); // Initializes Matrix to values in mat and pos
 
diff --git a/indra/llmath/v3math.cpp b/indra/llmath/v3math.cpp
index b04c67d926..93010d2250 100644
--- a/indra/llmath/v3math.cpp
+++ b/indra/llmath/v3math.cpp
@@ -316,6 +316,12 @@ LLVector3::LLVector3(const LLVector4 &vec)
     mV[VZ] = (F32)vec.mV[VZ];
 }
 
+LLVector3::LLVector3(const LLVector4a& vec)
+    : LLVector3(vec.getF32ptr())
+{
+
+}
+
 LLVector3::LLVector3(const LLSD& sd)
 {
     setValue(sd);
diff --git a/indra/llmath/v3math.h b/indra/llmath/v3math.h
index 6f857d7061..068f489020 100644
--- a/indra/llmath/v3math.h
+++ b/indra/llmath/v3math.h
@@ -33,6 +33,7 @@
 #include "llsd.h"
 class LLVector2;
 class LLVector4;
+class LLVector4a;
 class LLMatrix3;
 class LLMatrix4;
 class LLVector3d;
@@ -62,7 +63,9 @@ class LLVector3
     explicit LLVector3(const LLVector2 &vec);    // Initializes LLVector3 to (vec[0]. vec[1], 0)
     explicit LLVector3(const LLVector3d &vec);   // Initializes LLVector3 to (vec[0]. vec[1], vec[2])
     explicit LLVector3(const LLVector4 &vec);    // Initializes LLVector4 to (vec[0]. vec[1], vec[2])
-    explicit LLVector3(const LLSD& sd);
+    explicit LLVector3(const LLVector4a& vec);   // Initializes LLVector4 to (vec[0]. vec[1], vec[2])
+    explicit LLVector3(const LLSD& sd);
+
     LLSD getValue() const;
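
For readers skimming the patch, the following standalone sketch (not part of the commit) shows how the additions above fit together: the new LLMatrix4a default constructor, setIdentity(), getTranslation(), getF32ptr(), matMulUnsafe(), and the new explicit conversions LLMatrix4(const LLMatrix4a&), LLMatrix4a(const LLMatrix4&), and LLVector3(const LLVector4a&). The function name example_new_llmath_api is hypothetical; everything else it uses appears in the diff above or in the headers this patch touches.

#include "llmatrix4a.h"   // LLMatrix4a, matMul, matMulUnsafe
#include "m4math.h"       // LLMatrix4
#include "v3math.h"       // LLVector3

// Hypothetical usage sketch -- not part of the patch.
void example_new_llmath_api()
{
    // New default constructor plus setIdentity() replace hand-rolled setup.
    LLMatrix4a model;
    model.setIdentity();

    LLMatrix4a view;
    view.setIdentity();

    // matMulUnsafe() writes each result row directly, so the output must not
    // alias either input (see the comment added in llmatrix4a.h); matMul()
    // remains the general-purpose path.
    LLMatrix4a model_view;
    matMulUnsafe(model, view, model_view);   // distinct output: ok
    matMul(model, view, model);              // output aliases an input: use matMul()

    // New explicit conversions between the scalar and SIMD matrix types.
    LLMatrix4 scalar_mat(model_view);        // LLMatrix4(const LLMatrix4a&)
    LLMatrix4a simd_mat(scalar_mat);         // LLMatrix4a(const LLMatrix4&)

    // getTranslation() exposes the fourth row, and LLVector3 can now be
    // constructed directly from an LLVector4a.
    LLVector3 position(simd_mat.getTranslation());

    // getF32ptr() gives raw float access to the 16 matrix elements.
    const F32* raw = simd_mat.getF32ptr();
    (void) raw;
    (void) position;
}

The same pattern holds for the allocation change: classes that previously defined their own aligned operator new/delete (LLOctreeNode, LLJointRiggingInfo, LLVolumeTriangle, LLVolumeOctreeListener) now declare alignas(16) and rely on the LL_ALIGN_NEW macro, which is defined outside this patch.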