-rw-r--r--   indra/llcommon/llmemory.h    | 24
-rwxr-xr-x   indra/newview/llface.cpp     |  3
-rw-r--r--   indra/newview/llpolymesh.cpp | 26
3 files changed, 42 insertions(+), 11 deletions(-)
diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index 4938775e2b..61e30f11cc 100644
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -201,24 +201,36 @@ inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
 	if (bytes > 64)
 	{
+
+		// Find start of 64b aligned area within block
+		//
 		void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
 
 		//at least 64 bytes before the end of the destination, switch to 16 byte copies
 		void* end_64 = end-64;
-
+
+		// Prefetch the head of the 64b area now
+		//
 		_mm_prefetch((char*)begin_64, _MM_HINT_NTA);
 		_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
 		_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
 		_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
-
+
+		// Copy 16b chunks until we're 64b aligned
+		//
 		while (dst < begin_64)
 		{
 			_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
-			dst += 4;
-			src += 4;
+			dst += 16;
+			src += 16;
 		}
-
+
+		// Copy 64b chunks up to your tail
+		//
+		// might be good to shmoo the 512b prefetch offset
+		// (characterize performance for various values)
+		//
 		while (dst < end_64)
 		{
 			_mm_prefetch((char*)src + 512, _MM_HINT_NTA);
@@ -232,6 +244,8 @@ inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
 		}
 	}
 
+	// Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
+	//
 	while (dst < end)
 	{
 		_mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
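Note on the llmemory.h change: the substantive fix is the stride in the head-alignment loop. _mm_load_ps and _mm_store_ps move four F32s (16 bytes) per iteration, but the old code advanced dst and src by only 4 bytes, so after the first iteration every aligned store targeted an address that was no longer 16-byte aligned (which typically faults) while the source window overlapped bytes already copied. Below is a minimal standalone sketch of the corrected stride, checked against the source with memcmp; the helper name, buffer size, and use of C++17 aligned_alloc are illustrative assumptions, not the viewer's code.

// Minimal sketch (not the viewer's function) of the corrected 16-byte stride.
// Assumes the contract ll_memcpy_nonaliased_aligned_16 documents: buffers do
// not overlap, both are 16-byte aligned, and the byte count is a multiple of 16.
#include <xmmintrin.h>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static void copy_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
{
    assert(bytes % 16 == 0);
    assert((reinterpret_cast<uintptr_t>(dst) & 15) == 0);
    assert((reinterpret_cast<uintptr_t>(src) & 15) == 0);
    for (char* end = dst + bytes; dst < end; dst += 16, src += 16) // 16, not 4
    {
        // Each load/store pair moves 16 bytes and requires 16-byte-aligned
        // addresses; the old "dst += 4" broke that after the first iteration.
        _mm_store_ps(reinterpret_cast<float*>(dst),
                     _mm_load_ps(reinterpret_cast<const float*>(src)));
    }
}

int main()
{
    const size_t bytes = 1024;
    // aligned_alloc is C++17 (use _aligned_malloc on MSVC)
    char* src = static_cast<char*>(aligned_alloc(16, bytes));
    char* dst = static_cast<char*>(aligned_alloc(16, bytes));
    for (size_t i = 0; i < bytes; ++i) src[i] = static_cast<char>(i * 31);
    copy_aligned_16(dst, src, bytes);
    printf("%s\n", memcmp(dst, src, bytes) == 0 ? "ok" : "MISMATCH");
    free(src);
    free(dst);
    return 0;
}

The viewer's real function adds the _MM_HINT_NTA prefetches and the 64-byte inner loop shown in the diff; the sketch keeps only the 16-byte path the bug lived in.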
diff --git a/indra/newview/llface.cpp b/indra/newview/llface.cpp
index 86e5f20812..5f86205175 100755
--- a/indra/newview/llface.cpp
+++ b/indra/newview/llface.cpp
@@ -64,6 +64,8 @@ BOOL LLFace::sSafeRenderSelect = TRUE; // FALSE
 
 #define DOTVEC(a,b) (a.mV[0]*b.mV[0] + a.mV[1]*b.mV[1] + a.mV[2]*b.mV[2])
 
+//#pragma GCC diagnostic ignored "-Wuninitialized"
+
 /*
 For each vertex, given:
 	B - binormal
@@ -1982,6 +1984,7 @@ BOOL LLFace::getGeometryVolume(const LLVolume& volume,
 
 			//_mm_prefetch((char*)dst, _MM_HINT_NTA);
 
+			LLVector4a res0; //,res1,res2,res3;
 			LLVector4a texIdx;
diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp
index 5f5258bbce..916f3d8e06 100644
--- a/indra/newview/llpolymesh.cpp
+++ b/indra/newview/llpolymesh.cpp
@@ -983,12 +983,26 @@ LLVector4a *LLPolyMesh::getScaledBinormals()
 //-----------------------------------------------------------------------------
 void LLPolyMesh::initializeForMorph()
 {
-	LLVector4a::memcpyNonAliased16((F32*) mCoords, (F32*) mSharedData->mBaseCoords, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mNormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mScaledNormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mBinormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mScaledBinormals, (F32*) mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
-	LLVector4a::memcpyNonAliased16((F32*) mTexCoords, (F32*) mSharedData->mTexCoords, sizeof(LLVector2) * (mSharedData->mNumVertices + mSharedData->mNumVertices%2));
+	// Must insure that src and dst of copies below
+	// are actually 16b aligned...the 16b mod 0 size
+	// is assumed from the data being LLVector4a
+	//
+	ll_assert_aligned(mCoords,16);
+	ll_assert_aligned(mNormals,16);
+	ll_assert_aligned(mScaledNormals,16);
+	ll_assert_aligned(mBinormals,16);
+	ll_assert_aligned(mScaledBinormals,16);
+	ll_assert_aligned(mTexCoords,16);
+	ll_assert_aligned(mSharedData->mBaseCoords,16);
+	ll_assert_aligned(mSharedData->mBaseNormals,16);
+	ll_assert_aligned(mSharedData->mTexCoords,16);
+
+	ll_memcpy_nonaliased_aligned_16((char*)mCoords, (char*)mSharedData->mBaseCoords, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mNormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mScaledNormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mBinormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mScaledBinormals, (char*)mSharedData->mBaseNormals, sizeof(LLVector4a) * mSharedData->mNumVertices);
+	ll_memcpy_nonaliased_aligned_16((char*)mTexCoords, (char*)mSharedData->mTexCoords, sizeof(LLVector2) * (mSharedData->mNumVertices + mSharedData->mNumVertices%2));
 
 	for (U32 i = 0; i < mSharedData->mNumVertices; ++i)
 	{
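Note on the llpolymesh.cpp change: initializeForMorph() now calls the repaired ll_memcpy_nonaliased_aligned_16 directly and asserts its preconditions up front. The copies are only safe when both pointers are 16-byte aligned and the byte count is a multiple of 16; the LLVector4a element size covers the size requirement for most of the copies, and the tex-coord copy rounds the vertex count up to even because sizeof(LLVector2) is 8 bytes (15 vertices would give 120 bytes, not a multiple of 16, while rounding to 16 vertices gives 128). Below is a sketch of the kind of checks involved; assert_aligned_sketch and texcoord_copy_bytes are hypothetical stand-ins for illustration, not the viewer's ll_assert_aligned.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for ll_assert_aligned (the real helper lives in
// llcommon); checks that ptr sits on an 'alignment'-byte boundary.
inline void assert_aligned_sketch(const void* ptr, std::uintptr_t alignment)
{
    // Power-of-two alignments let a mask of the low bits do the test.
    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
    assert((reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0);
}

// Why the diff copies (mNumVertices + mNumVertices % 2) tex coords: rounding
// an odd count up to even keeps the 8-byte-per-LLVector2 total a multiple of 16.
inline std::size_t texcoord_copy_bytes(std::size_t num_vertices)
{
    return 8 /* sizeof(LLVector2): two F32s */ * (num_vertices + num_vertices % 2);
}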