From 5d2fea6262d91eb8d3c06d97a160ca9373b96889 Mon Sep 17 00:00:00 2001 From: "Graham Madarasz (Graham Linden)" Date: Wed, 13 Mar 2013 10:42:40 -0700 Subject: Move fast memcpy to llcommon and use it in llalignedarray pushback on all platforms. Code Review: DaveP --- indra/llcommon/llalignedarray.h | 16 +----- indra/llcommon/llmemory.h | 116 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 112 insertions(+), 20 deletions(-) (limited to 'indra/llcommon') diff --git a/indra/llcommon/llalignedarray.h b/indra/llcommon/llalignedarray.h index 5e04e8050f..ed8fd31205 100644 --- a/indra/llcommon/llalignedarray.h +++ b/indra/llcommon/llalignedarray.h @@ -29,10 +29,6 @@ #include "llmemory.h" -#if LL_WINDOWS -#include "llvector4a.h" // for 16b fast copy -#endif - template class LLAlignedArray { @@ -81,11 +77,7 @@ void LLAlignedArray::push_back(const T& elem) T* new_buf = (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment); if (mArray) { -#if LL_WINDOWS - LLVector4a::memcpyNonAliased16((F32*) new_buf, (F32*) mArray, sizeof(T)*mElementCount); -#else - memcpy((F32*)new_buf, (F32*)mArray, sizeof(T)*mElementCount); -#endif + ll_memcpy_nonaliased_aligned_16((char*)new_buf, (char*)mArray, sizeof(T)*mElementCount); } old_buf = mArray; mArray = new_buf; @@ -106,11 +98,7 @@ void LLAlignedArray::resize(U32 size) T* new_buf = mCapacity > 0 ? (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment) : NULL; if (mArray) { -#if LL_WINDOWS - LLVector4a::memcpyNonAliased16((F32*) new_buf, (F32*) mArray, sizeof(T)*mElementCount); -#else - memcpy((F32*) new_buf, (F32*) mArray, sizeof(T)*mElementCount); -#endif + ll_memcpy_nonaliased_aligned_16((char*) new_buf, (char*) mArray, sizeof(T)*mElementCount); ll_aligned_free(mArray); } diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h index 46cabfadcd..61e30f11cc 100644 --- a/indra/llcommon/llmemory.h +++ b/indra/llcommon/llmemory.h @@ -36,6 +36,44 @@ class LLMutex ; #define LL_CHECK_MEMORY #endif +LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment); + +#ifdef SHOW_ASSERT +#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast(ptr),((U32)alignment)) +#else +#define ll_assert_aligned(ptr,alignment) +#endif + +#include + +template T* LL_NEXT_ALIGNED_ADDRESS(T* address) +{ + return reinterpret_cast( + (reinterpret_cast(address) + 0xF) & ~0xF); +} + +template T* LL_NEXT_ALIGNED_ADDRESS_64(T* address) +{ + return reinterpret_cast( + (reinterpret_cast(address) + 0x3F) & ~0x3F); +} + +#if LL_LINUX || LL_DARWIN + +#define LL_ALIGN_PREFIX(x) +#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x))) + +#elif LL_WINDOWS + +#define LL_ALIGN_PREFIX(x) __declspec(align(x)) +#define LL_ALIGN_POSTFIX(x) + +#else +#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined" +#endif + +#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16) + inline void* ll_aligned_malloc( size_t size, int align ) { #if defined(LL_WINDOWS) @@ -144,6 +182,78 @@ inline void ll_aligned_free_32(void *p) #endif } + +// Copy words 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP. +// Source and dest must be 16-byte aligned and size must be multiple of 16. +// +inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes) +{ + assert(src != NULL); + assert(dst != NULL); + assert(bytes > 0); + assert((bytes % sizeof(F32))== 0); + ll_assert_aligned(src,16); + ll_assert_aligned(dst,16); + assert((src < dst) ? ((src + bytes) < dst) : ((dst + bytes) < src)); + assert(bytes%16==0); + + char* end = dst + bytes; + + if (bytes > 64) + { + + // Find start of 64b aligned area within block + // + void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst); + + //at least 64 bytes before the end of the destination, switch to 16 byte copies + void* end_64 = end-64; + + // Prefetch the head of the 64b area now + // + _mm_prefetch((char*)begin_64, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA); + + // Copy 16b chunks until we're 64b aligned + // + while (dst < begin_64) + { + + _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src)); + dst += 16; + src += 16; + } + + // Copy 64b chunks up to your tail + // + // might be good to shmoo the 512b prefetch offset + // (characterize performance for various values) + // + while (dst < end_64) + { + _mm_prefetch((char*)src + 512, _MM_HINT_NTA); + _mm_prefetch((char*)dst + 512, _MM_HINT_NTA); + _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src)); + _mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16))); + _mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32))); + _mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48))); + dst += 64; + src += 64; + } + } + + // Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies) + // + while (dst < end) + { + _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src)); + dst += 16; + src += 16; + } +} + #ifndef __DEBUG_PRIVATE_MEM__ #define __DEBUG_PRIVATE_MEM__ 0 #endif @@ -552,13 +662,7 @@ void LLPrivateMemoryPoolTester::operator delete[](void* addr) // LLSingleton moved to llsingleton.h -LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment); -#ifdef SHOW_ASSERT -#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast(ptr),((U32)alignment)) -#else -#define ll_assert_aligned(ptr,alignment) -#endif #endif -- cgit v1.2.3