-rw-r--r--   indra/llcommon/llmemory.h    | 24
-rwxr-xr-x   indra/newview/llface.cpp     |  3
-rw-r--r--   indra/newview/llpolymesh.cpp | 26
3 files changed, 42 insertions(+), 11 deletions(-)
diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index 4938775e2b..61e30f11cc 100644
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -201,24 +201,36 @@ inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
 	if (bytes > 64)
 	{
+
+		// Find start of 64b aligned area within block
+		//
 		void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
 
 		//at least 64 bytes before the end of the destination, switch to 16 byte copies
 		void* end_64 = end-64;
-
+
+		// Prefetch the head of the 64b area now
+		//
 		_mm_prefetch((char*)begin_64, _MM_HINT_NTA);
 		_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
 		_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
 		_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
-
+
+		// Copy 16b chunks until we're 64b aligned
+		//
 		while (dst < begin_64)
 		{
 			_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
-			dst += 4;
-			src += 4;
+			dst += 16;
+			src += 16;
 		}
-
+
+		// Copy 64b chunks up to your tail
+		//
+		// might be good to shmoo the 512b prefetch offset
+		// (characterize performance for various values)
+		//
 		while (dst < end_64)
 		{
 			_mm_prefetch((char*)src + 512, _MM_HINT_NTA);
@@ -232,6 +244,8 @@ inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
 		}
 	}
 
+	// Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
+	//
 	while (dst < end)
 	{
 		_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
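Note on the llmemory.h change: the substantive fix is the stride in the head-alignment loop. _mm_load_ps and _mm_store_ps move four F32s (16 bytes) per iteration, but the old code advanced dst and src by only 4 bytes, so after the first iteration every aligned store targeted an address that was no longer 16-byte aligned (which typically faults) while the source window overlapped bytes already copied. Below is a minimal standalone sketch of the corrected stride, checked against the source with memcmp; the helper name, buffer size, and use of C++17 aligned_alloc are illustrative assumptions, not the viewer's code.

// Minimal sketch (not the viewer's function) of the corrected 16-byte stride.
// Assumes the contract ll_memcpy_nonaliased_aligned_16 documents: buffers do
// not overlap, both are 16-byte aligned, and the byte count is a multiple of 16.
#include <xmmintrin.h>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static void copy_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
{
    assert(bytes % 16 == 0);
    assert((reinterpret_cast<uintptr_t>(dst) & 15) == 0);
    assert((reinterpret_cast<uintptr_t>(src) & 15) == 0);
    for (char* end = dst + bytes; dst < end; dst += 16, src += 16) // 16, not 4
    {
        // Each load/store pair moves 16 bytes and requires 16-byte-aligned
        // addresses; the old "dst += 4" broke that after the first iteration.
        _mm_store_ps(reinterpret_cast<float*>(dst),
                     _mm_load_ps(reinterpret_cast<const float*>(src)));
    }
}

int main()
{
    const size_t bytes = 1024;
    // aligned_alloc is C++17 (use _aligned_malloc on MSVC)
    char* src = static_cast<char*>(aligned_alloc(16, bytes));
    char* dst = static_cast<char*>(aligned_alloc(16, bytes));
    for (size_t i = 0; i < bytes; ++i) src[i] = static_cast<char>(i * 31);
    copy_aligned_16(dst, src, bytes);
    printf("%s\n", memcmp(dst, src, bytes) == 0 ? "ok" : "MISMATCH");
    free(src);
    free(dst);
    return 0;
}

The viewer's real function adds the _MM_HINT_NTA prefetches and the 64-byte inner loop shown in the diff; the sketch keeps only the 16-byte path the bug lived in.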
diff --git a/indra/newview/llface.cpp b/indra/newview/llface.cpp
index 86e5f20812..5f86205175 100755
--- a/indra/newview/llface.cpp
+++ b/indra/newview/llface.cpp
@@ -64,6 +64,8 @@ BOOL LLFace::sSafeRenderSelect = TRUE; // FALSE
 
 #define DOTVEC(a,b) (a.mV[0]*b.mV[0] + a.mV[1]*b.mV[1] + a.mV[2]*b.mV[2])
 
+//#pragma GCC diagnostic ignored "-Wuninitialized"
+
 /*
 For each vertex, given:
 	B - binormal
@@ -1982,6 +1984,7 @@ BOOL LLFace::getGeometryVolume(const LLVolume& volume,
 
 			//_mm_prefetch((char*)dst, _MM_HINT_NTA);
 
+			LLVector4a res0; //,res1,res2,res3;
 			LLVector4a texIdx;
diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp
index 5f5258bbce..916f3d8e06 100644
--- a/indra/newview/llpolymesh.cpp
+++ b/indra/newview/llpolymesh.cpp
@@ -983,12 +983,26 @@ LLVector4a *LLPolyMesh::getScaledBinormals()
 //-----------------------------------------------------------------------------
 void LLPolyMesh::initializeForMorph()
 {
-	LLVector4a::memcpyNonAliased16((F32*) mCoords, (F32*) mSharedData->mBaseCoords, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mNormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mScaledNormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mBinormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mScaledBinormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mTexCoords, (F32*) mSharedData->mTexCoords, sizeof(LLVector2) * (mSharedData->mNumVertices + mSharedData->mNumVertices%2));
+	// Must insure that src and dst of copies below
+	// are actually 16b aligned...the 16b mod 0 size
+	// is assumed from the data being LLVector4a
+	//
+	ll_assert_aligned(mCoords,16);
+	ll_assert_aligned(mNormals,16);
+	ll_assert_aligned(mScaledNormals,16);
+	ll_assert_aligned(mBinormals,16);
+	ll_assert_aligned(mScaledBinormals,16);
+	ll_assert_aligned(mTexCoords,16);
+	ll_assert_aligned(mSharedData->mBaseCoords,16);
+	ll_assert_aligned(mSharedData->mBaseNormals,16);
+	ll_assert_aligned(mSharedData->mTexCoords,16);
+
+	ll_memcpy_nonaliased_aligned_16((char*)mCoords, (char*)mSharedData->mBaseCoords, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mNormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mScaledNormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mBinormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mScaledBinormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mTexCoords, (char*)mSharedData->mTexCoords, sizeof(LLVector2) * (mSharedData->mNumVertices + mSharedData->mNumVertices%2));
 
 	for (U32 i = 0; i < mSharedData->mNumVertices; ++i)
 	{
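Note on the llpolymesh.cpp change: initializeForMorph() now calls the repaired ll_memcpy_nonaliased_aligned_16 directly and asserts its preconditions up front. The copies are only safe when both pointers are 16-byte aligned and the byte count is a multiple of 16; the LLVector4a element size covers the size requirement for most of the copies, and the tex-coord copy rounds the vertex count up to even because sizeof(LLVector2) is 8 bytes (15 vertices would give 120 bytes, not a multiple of 16, while rounding to 16 vertices gives 128). Below is a sketch of the kind of checks involved; assert_aligned_sketch and texcoord_copy_bytes are hypothetical stand-ins for illustration, not the viewer's ll_assert_aligned.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for ll_assert_aligned (the real helper lives in
// llcommon); checks that ptr sits on an 'alignment'-byte boundary.
inline void assert_aligned_sketch(const void* ptr, std::uintptr_t alignment)
{
    // Power-of-two alignments let a mask of the low bits do the test.
    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
    assert((reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0);
}

// Why the diff copies (mNumVertices + mNumVertices % 2) tex coords: rounding
// an odd count up to even keeps the 8-byte-per-LLVector2 total a multiple of 16.
inline std::size_t texcoord_copy_bytes(std::size_t num_vertices)
{
    return 8 /* sizeof(LLVector2): two F32s */ * (num_vertices + num_vertices % 2);
}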