path: root/indra/llcommon/llmemory.h
author    Oz Linden <oz@lindenlab.com>    2013-11-19 17:59:55 -0500
committer Oz Linden <oz@lindenlab.com>    2013-11-19 17:59:55 -0500
commit    0031e9a97be1bf6e9fe773c23506494d09ce91ae (patch)
tree      220f195c82174b7cc8e94dceb2553e59fe5837a5 /indra/llcommon/llmemory.h
parent    b7edc965bc77ab21e9a1e3f6b424299a50053529 (diff)
parent    ebc9bcbf69f7a519677a6522979a6bf6cbb04bb8 (diff)
merge up to 3.6.10-release; some of the storm-68 changes lost
Diffstat (limited to 'indra/llcommon/llmemory.h')
-rwxr-xr-x [-rw-r--r--]  indra/llcommon/llmemory.h | 185
1 file changed, 169 insertions(+), 16 deletions(-)
diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index bbbdaa6497..61e30f11cc 100644..100755
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -26,27 +26,85 @@
#ifndef LLMEMORY_H
#define LLMEMORY_H
-#include "llmemtype.h"
-#if LL_DEBUG
+#include "linden_common.h"
+
+class LLMutex ;
+
+#if LL_WINDOWS && LL_DEBUG
+#define LL_CHECK_MEMORY llassert(_CrtCheckMemory());
+#else
+#define LL_CHECK_MEMORY
+#endif
+
+LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
+
+#ifdef SHOW_ASSERT
+#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
+#else
+#define ll_assert_aligned(ptr,alignment)
+#endif
+
+#include <xmmintrin.h>
+
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
+{
+ return reinterpret_cast<T*>(
+ (reinterpret_cast<uintptr_t>(address) + 0xF) & ~0xF);
+}
+
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
+{
+ return reinterpret_cast<T*>(
+ (reinterpret_cast<uintptr_t>(address) + 0x3F) & ~0x3F);
+}
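
The rounding trick above is: add (alignment - 1), then mask off the low bits, so an address that is already aligned comes back unchanged. A minimal standalone sketch of the same arithmetic (plain C++, independent of the viewer headers; names are illustrative):

#include <cassert>
#include <cstdint>

// Round ptr up to the next 16-byte boundary, same arithmetic as LL_NEXT_ALIGNED_ADDRESS.
inline char* next_aligned_16(char* ptr)
{
    return reinterpret_cast<char*>(
        (reinterpret_cast<std::uintptr_t>(ptr) + 0xF) & ~std::uintptr_t(0xF));
}

int main()
{
    char buffer[64];
    char* p = next_aligned_16(buffer + 1);
    assert(reinterpret_cast<std::uintptr_t>(p) % 16 == 0); // now 16-byte aligned
    assert(p - (buffer + 1) < 16);                         // moved forward by at most 15 bytes
    return 0;
}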
+
+#if LL_LINUX || LL_DARWIN
+
+#define LL_ALIGN_PREFIX(x)
+#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
+
+#elif LL_WINDOWS
+
+#define LL_ALIGN_PREFIX(x) __declspec(align(x))
+#define LL_ALIGN_POSTFIX(x)
+
+#else
+#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
+#endif
+
+#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
+
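
A brief sketch of how the LL_ALIGN_16 wrapper is meant to be used on a data member (the class name here is made up for illustration; F32 comes in via linden_common.h). The PREFIX/POSTFIX pair places the attribute where each compiler expects it:

#include "llmemory.h"

// Hypothetical example type: the matrix storage is forced onto a 16-byte
// boundary so it can be read with _mm_load_ps without faulting.
class LLAlignedExample
{
public:
    LL_ALIGN_16(F32 mMatrix[4][4]);  // __declspec(align(16)) F32 ... on Windows,
                                     // F32 ... __attribute__((aligned(16))) on Linux/Mac
};

Note that heap instances of such a class still need to come from an aligned allocator (for example ll_aligned_malloc_16 below); the attribute only guarantees alignment relative to the start of the object.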
inline void* ll_aligned_malloc( size_t size, int align )
{
+#if defined(LL_WINDOWS)
+ return _aligned_malloc(size, align);
+#else
void* mem = malloc( size + (align - 1) + sizeof(void*) );
char* aligned = ((char*)mem) + sizeof(void*);
aligned += align - ((uintptr_t)aligned & (align - 1));
((void**)aligned)[-1] = mem;
return aligned;
+#endif
}
inline void ll_aligned_free( void* ptr )
{
- free( ((void**)ptr)[-1] );
+#if defined(LL_WINDOWS)
+ _aligned_free(ptr);
+#else
+ if (ptr)
+ {
+ free( ((void**)ptr)[-1] );
+ }
+#endif
}
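
On the non-Windows path above, ll_aligned_malloc over-allocates by align-1 bytes plus one pointer and stashes the raw malloc() result immediately before the returned address, which is how ll_aligned_free finds the block to release. A minimal usage sketch (size and alignment are illustrative):

#include "llmemory.h"

void aligned_scratch_demo()
{
    // 1 KB aligned to a 64-byte boundary (e.g. a cache line).
    void* buf = ll_aligned_malloc(1024, 64);
    ll_assert_aligned(buf, 64);   // compiles to nothing unless SHOW_ASSERT is defined

    // ... fill and use buf ...

    // Always release with the matching free: plain free() would be handed the
    // adjusted pointer, not the address malloc() actually returned.
    ll_aligned_free(buf);
}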
+#if !LL_USE_TCMALLOC
inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
{
#if defined(LL_WINDOWS)
- return _mm_malloc(size, 16);
+ return _aligned_malloc(size, 16);
#elif defined(LL_DARWIN)
return malloc(size); // default osx malloc is 16 byte aligned.
#else
@@ -61,7 +119,7 @@ inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed wi
inline void ll_aligned_free_16(void *p)
{
#if defined(LL_WINDOWS)
- _mm_free(p);
+ _aligned_free(p);
#elif defined(LL_DARWIN)
return free(p);
#else
@@ -69,10 +127,39 @@ inline void ll_aligned_free_16(void *p)
#endif
}
+inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // returned hunk MUST be freed with ll_aligned_free_16().
+{
+#if defined(LL_WINDOWS)
+ return _aligned_realloc(ptr, size, 16);
+#elif defined(LL_DARWIN)
+ return realloc(ptr,size); // default osx malloc is 16 byte aligned.
+#else
+ //FIXME: memcpy is SLOW
+ void* ret = ll_aligned_malloc_16(size);
+ if (ptr)
+ {
+ if (ret)
+ {
+ // Only copy the size of the smallest memory block to avoid memory corruption.
+ memcpy(ret, ptr, llmin(old_size, size));
+ }
+ ll_aligned_free_16(ptr);
+ }
+ return ret;
+#endif
+}
+
+#else // USE_TCMALLOC
+// ll_aligned_foo_16 are not needed with tcmalloc
+#define ll_aligned_malloc_16 malloc
+#define ll_aligned_realloc_16(a,b,c) realloc(a,b)
+#define ll_aligned_free_16 free
+#endif // USE_TCMALLOC
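
A sketch of the intended grow pattern for the 16-byte family (sizes are illustrative): ll_aligned_realloc_16 preserves min(old_size, size) bytes on every path, and when LL_USE_TCMALLOC is set the same three names simply resolve to malloc/realloc/free.

#include <string.h>
#include "llmemory.h"

void grow_buffer_demo()
{
    size_t old_size = 256;
    char* data = (char*)ll_aligned_malloc_16(old_size);
    memset(data, 0x5A, old_size);

    // Grow to 1024 bytes; the first 256 bytes survive on every code path
    // (_aligned_realloc on Windows, realloc on OS X, alloc+copy+free otherwise).
    size_t new_size = 1024;
    data = (char*)ll_aligned_realloc_16(data, new_size, old_size);

    ll_aligned_free_16(data);   // always pair with the _16 free
}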
+
inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().
{
#if defined(LL_WINDOWS)
- return _mm_malloc(size, 32);
+ return _aligned_malloc(size, 32);
#elif defined(LL_DARWIN)
return ll_aligned_malloc( size, 32 );
#else
@@ -87,22 +174,85 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi
inline void ll_aligned_free_32(void *p)
{
#if defined(LL_WINDOWS)
- _mm_free(p);
+ _aligned_free(p);
#elif defined(LL_DARWIN)
ll_aligned_free( p );
#else
free(p); // posix_memalign() is compatible with heap deallocator
#endif
}
-#else // LL_DEBUG
-// ll_aligned_foo are noops now that we use tcmalloc everywhere (tcmalloc aligns automatically at appropriate intervals)
-#define ll_aligned_malloc( size, align ) malloc(size)
-#define ll_aligned_free( ptr ) free(ptr)
-#define ll_aligned_malloc_16 malloc
-#define ll_aligned_free_16 free
-#define ll_aligned_malloc_32 malloc
-#define ll_aligned_free_32 free
-#endif // LL_DEBUG
+
+
+// Copy memory in 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP.
+// Source and dest must be 16-byte aligned and size must be a multiple of 16.
+//
+inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
+{
+ assert(src != NULL);
+ assert(dst != NULL);
+ assert(bytes > 0);
+ assert((bytes % sizeof(F32))== 0);
+ ll_assert_aligned(src,16);
+ ll_assert_aligned(dst,16);
+ assert((src < dst) ? ((src + bytes) < dst) : ((dst + bytes) < src));
+ assert(bytes%16==0);
+
+ char* end = dst + bytes;
+
+ if (bytes > 64)
+ {
+
+ // Find start of 64b aligned area within block
+ //
+ void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
+
+	// Stop the 64-byte loop 64 bytes before the end of the destination, then switch back to 16-byte copies
+	void* end_64 = end-64;
+
+ // Prefetch the head of the 64b area now
+ //
+ _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
+ _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
+ _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
+ _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
+
+ // Copy 16b chunks until we're 64b aligned
+ //
+ while (dst < begin_64)
+ {
+
+ _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+ dst += 16;
+ src += 16;
+ }
+
+	// Copy 64b chunks up to the 64b-aligned tail
+ //
+ // might be good to shmoo the 512b prefetch offset
+ // (characterize performance for various values)
+ //
+ while (dst < end_64)
+ {
+ _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
+ _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
+ _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+ _mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
+ _mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
+ _mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
+ dst += 64;
+ src += 64;
+ }
+ }
+
+ // Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
+ //
+ while (dst < end)
+ {
+ _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+ dst += 16;
+ src += 16;
+ }
+}
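
A usage sketch for the copy above (size chosen for illustration): both buffers must be 16-byte aligned, the byte count must be a multiple of 16, and the ranges must not overlap, which is exactly what the asserts at the top of the function check in debug builds.

#include <string.h>
#include "llmemory.h"

void fast_copy_demo()
{
    const size_t bytes = 256;   // multiple of 16

    char* src = (char*)ll_aligned_malloc_16(bytes);
    char* dst = (char*)ll_aligned_malloc_16(bytes);
    memset(src, 0xAB, bytes);

    // 16-byte SSE copies, switching to prefetched 64-byte blocks for the
    // middle of copies larger than 64 bytes.
    ll_memcpy_nonaliased_aligned_16(dst, src, bytes);

    ll_aligned_free_16(dst);
    ll_aligned_free_16(src);
}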
#ifndef __DEBUG_PRIVATE_MEM__
#define __DEBUG_PRIVATE_MEM__ 0
@@ -512,4 +662,7 @@ void LLPrivateMemoryPoolTester::operator delete[](void* addr)
// LLSingleton moved to llsingleton.h
+
+
+
#endif