Diffstat (limited to 'indra/llcommon/llmemory.h')
-rwxr-xr-x [-rw-r--r--]   indra/llcommon/llmemory.h | 292
1 file changed, 219 insertions(+), 73 deletions(-)
diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index e725bdd9fa..c4c9cc0566 100644..100755
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -27,30 +27,112 @@
 #define LLMEMORY_H
 #include "linden_common.h"
+#include "llunits.h"
+#if !LL_WINDOWS
+#include <stdint.h>
+#endif
 class LLMutex ;
 #if LL_WINDOWS && LL_DEBUG
 #define LL_CHECK_MEMORY llassert(_CrtCheckMemory());
 #else
-#define LL_CHECK_MEMORY
+#define LL_CHECK_MEMORY
 #endif
-inline void* ll_aligned_malloc( size_t size, int align )
-{
-    void* mem = malloc( size + (align - 1) + sizeof(void*) );
-    char* aligned = ((char*)mem) + sizeof(void*);
-    aligned += align - ((uintptr_t)aligned & (align - 1));
-    ((void**)aligned)[-1] = mem;
-    return aligned;
+#if LL_WINDOWS
+#define LL_ALIGN_OF __alignof
+#else
+#define LL_ALIGN_OF __align_of__
+#endif
+
+#if LL_WINDOWS
+#define LL_DEFAULT_HEAP_ALIGN 8
+#elif LL_DARWIN
+#define LL_DEFAULT_HEAP_ALIGN 16
+#elif LL_LINUX
+#define LL_DEFAULT_HEAP_ALIGN 8
+#endif
+
+
+LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
+
+#ifdef SHOW_ASSERT
+#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
+#else
+#define ll_assert_aligned(ptr,alignment)
+#endif
+
+#include <xmmintrin.h>
+
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
+{
+    return reinterpret_cast<T*>(
+        (reinterpret_cast<uintptr_t>(address) + 0xF) & ~0xF);
 }
-inline void ll_aligned_free( void* ptr )
-{
-    free( ((void**)ptr)[-1] );
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
+{
+    return reinterpret_cast<T*>(
+        (reinterpret_cast<uintptr_t>(address) + 0x3F) & ~0x3F);
 }
+#if LL_LINUX || LL_DARWIN
+
+#define LL_ALIGN_PREFIX(x)
+#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
+
+#elif LL_WINDOWS
+
+#define LL_ALIGN_PREFIX(x) __declspec(align(x))
+#define LL_ALIGN_POSTFIX(x)
+
+#else
+#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
+#endif
+
+#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
+
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+    // for enable buffer overrun detection predefine LL_DEBUG_BUFFER_OVERRUN in current library
+    // change preprocessro code to: #if 1 && defined(LL_WINDOWS)
+
+#if 0 && defined(LL_WINDOWS)
+    void* ll_aligned_malloc_fallback( size_t size, int align );
+    void ll_aligned_free_fallback( void* ptr );
+//------------------------------------------------------------------------------------------------
+#else
+    inline void* ll_aligned_malloc_fallback( size_t size, int align )
+    {
+    #if defined(LL_WINDOWS)
+        return _aligned_malloc(size, align);
+    #else
+        void* mem = malloc( size + (align - 1) + sizeof(void*) );
+        char* aligned = ((char*)mem) + sizeof(void*);
+        aligned += align - ((uintptr_t)aligned & (align - 1));
+
+        ((void**)aligned)[-1] = mem;
+        return aligned;
+    #endif
+    }
+
+    inline void ll_aligned_free_fallback( void* ptr )
+    {
+    #if defined(LL_WINDOWS)
+        _aligned_free(ptr);
+    #else
+        if (ptr)
+        {
+            free( ((void**)ptr)[-1] );
+        }
+    #endif
+    }
+#endif
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+
 #if !LL_USE_TCMALLOC
 inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
 {
@@ -112,7 +194,7 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi
 #if defined(LL_WINDOWS)
     return _aligned_malloc(size, 32);
 #elif defined(LL_DARWIN)
-    return ll_aligned_malloc( size, 32 );
+    return ll_aligned_malloc_fallback( size, 32 );
 #else
     void *rtn;
     if (LL_LIKELY(0 == posix_memalign(&rtn, 32, size)))
@@ -127,12 +209,127 @@ inline void ll_aligned_free_32(void *p)
 #if defined(LL_WINDOWS)
     _aligned_free(p);
 #elif defined(LL_DARWIN)
-    ll_aligned_free( p );
+    ll_aligned_free_fallback( p );
 #else
     free(p); // posix_memalign() is compatible with heap deallocator
 #endif
 }
+// general purpose dispatch functions that are forced inline so they can compile down to a single call
+template<size_t ALIGNMENT>
+LL_FORCE_INLINE void* ll_aligned_malloc(size_t size)
+{
+    if (LL_DEFAULT_HEAP_ALIGN % ALIGNMENT == 0)
+    {
+        return malloc(size);
+    }
+    else if (ALIGNMENT == 16)
+    {
+        return ll_aligned_malloc_16(size);
+    }
+    else if (ALIGNMENT == 32)
+    {
+        return ll_aligned_malloc_32(size);
+    }
+    else
+    {
+        return ll_aligned_malloc_fallback(size, ALIGNMENT);
+    }
+}
+
+template<size_t ALIGNMENT>
+LL_FORCE_INLINE void ll_aligned_free(void* ptr)
+{
+    if (ALIGNMENT == LL_DEFAULT_HEAP_ALIGN)
+    {
+        free(ptr);
+    }
+    else if (ALIGNMENT == 16)
+    {
+        ll_aligned_free_16(ptr);
+    }
+    else if (ALIGNMENT == 32)
+    {
+        return ll_aligned_free_32(ptr);
+    }
+    else
+    {
+        return ll_aligned_free_fallback(ptr);
+    }
+}
+
+// Copy words 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP.
+// Source and dest must be 16-byte aligned and size must be multiple of 16.
+//
+inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
+{
+    assert(src != NULL);
+    assert(dst != NULL);
+    assert(bytes > 0);
+    assert((bytes % sizeof(F32))== 0);
+    ll_assert_aligned(src,16);
+    ll_assert_aligned(dst,16);
+
+    assert((src < dst) ? ((src + bytes) <= dst) : ((dst + bytes) <= src));
+    assert(bytes%16==0);
+
+    char* end = dst + bytes;
+
+    if (bytes > 64)
+    {
+
+        // Find start of 64b aligned area within block
+        //
+        void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
+
+        //at least 64 bytes before the end of the destination, switch to 16 byte copies
+        void* end_64 = end-64;
+
+        // Prefetch the head of the 64b area now
+        //
+        _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
+
+        // Copy 16b chunks until we're 64b aligned
+        //
+        while (dst < begin_64)
+        {
+
+            _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+            dst += 16;
+            src += 16;
+        }
+
+        // Copy 64b chunks up to your tail
+        //
+        // might be good to shmoo the 512b prefetch offset
+        // (characterize performance for various values)
+        //
+        while (dst < end_64)
+        {
+            _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
+            _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
+            _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+            _mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
+            _mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
+            _mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
+            dst += 64;
+            src += 64;
+        }
+    }
+
+    // Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
+    //
+    while (dst < end)
+    {
+        _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+        dst += 16;
+        src += 16;
+    }
+}
+
 #ifndef __DEBUG_PRIVATE_MEM__
 #define __DEBUG_PRIVATE_MEM__ 0
 #endif
@@ -148,70 +345,25 @@ public:
     static U64 getCurrentRSS();
     static U32 getWorkingSetSize();
    static void* tryToAlloc(void* address, U32 size);
-    static void initMaxHeapSizeGB(F32 max_heap_size_gb, BOOL prevent_heap_failure);
+    static void initMaxHeapSizeGB(F32Gigabytes max_heap_size, BOOL prevent_heap_failure);
     static void updateMemoryInfo() ;
     static void logMemoryInfo(BOOL update = FALSE);
     static bool isMemoryPoolLow();
-    static U32 getAvailableMemKB() ;
-    static U32 getMaxMemKB() ;
-    static U32 getAllocatedMemKB() ;
+    static U32Kilobytes getAvailableMemKB() ;
+    static U32Kilobytes getMaxMemKB() ;
+    static U32Kilobytes getAllocatedMemKB() ;
 private:
     static char* reserveMem;
-    static U32 sAvailPhysicalMemInKB ;
-    static U32 sMaxPhysicalMemInKB ;
-    static U32 sAllocatedMemInKB;
-    static U32 sAllocatedPageSizeInKB ;
+    static U32Kilobytes sAvailPhysicalMemInKB ;
+    static U32Kilobytes sMaxPhysicalMemInKB ;
+    static U32Kilobytes sAllocatedMemInKB;
+    static U32Kilobytes sAllocatedPageSizeInKB ;
-    static U32 sMaxHeapSizeInKB;
+    static U32Kilobytes sMaxHeapSizeInKB;
     static BOOL sEnableMemoryFailurePrevention;
 };
-//----------------------------------------------------------------------------
-#if MEM_TRACK_MEM
-class LLMutex ;
-class LL_COMMON_API LLMemTracker
-{
-private:
-    LLMemTracker() ;
-    ~LLMemTracker() ;
-
-public:
-    static void release() ;
-    static LLMemTracker* getInstance() ;
-
-    void track(const char* function, const int line) ;
-    void preDraw(BOOL pause) ;
-    void postDraw() ;
-    const char* getNextLine() ;
-
-private:
-    static LLMemTracker* sInstance ;
-
-    char** mStringBuffer ;
-    S32 mCapacity ;
-    U32 mLastAllocatedMem ;
-    S32 mCurIndex ;
-    S32 mCounter;
-    S32 mDrawnIndex;
-    S32 mNumOfDrawn;
-    BOOL mPaused;
-    LLMutex* mMutexp ;
-};
-
-#define MEM_TRACK_RELEASE LLMemTracker::release() ;
-#define MEM_TRACK LLMemTracker::getInstance()->track(__FUNCTION__, __LINE__) ;
-
-#else // MEM_TRACK_MEM
-
-#define MEM_TRACK_RELEASE
-#define MEM_TRACK
-
-#endif // MEM_TRACK_MEM
-
-//----------------------------------------------------------------------------
-
-
 //
 //class LLPrivateMemoryPool defines a private memory pool for an application to use, so the application does not
 //need to access the heap directly fro each memory allocation. Throught this, the allocation speed is faster,
@@ -541,13 +693,7 @@ void LLPrivateMemoryPoolTester::operator delete[](void* addr)
 // LLSingleton moved to llsingleton.h
-LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
-#ifdef SHOW_ASSERT
-#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
-#else
-#define ll_assert_aligned(ptr,alignment)
-#endif
 #endif
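
Editor's note: the non-Windows branch of ll_aligned_malloc_fallback() above over-allocates and stashes the raw malloc() pointer in the void* slot immediately before the aligned payload, so the matching free can recover it. The following standalone sketch illustrates the same scheme outside the viewer headers; the names are illustrative only, and a full 'align' bytes of slack is reserved so the adjustment below stays in bounds even when the starting address is already aligned.

#include <cstdlib>
#include <cstdint>

// 'align' must be a power of two.
static void* aligned_malloc_sketch(size_t size, size_t align)
{
    void* raw = std::malloc(size + align + sizeof(void*));
    if (!raw) return NULL;

    // Leave room for the stashed pointer, then bump up to the next 'align' boundary.
    char* payload = static_cast<char*>(raw) + sizeof(void*);
    payload += align - (reinterpret_cast<uintptr_t>(payload) & (align - 1));

    reinterpret_cast<void**>(payload)[-1] = raw;   // remember the real allocation
    return payload;
}

static void aligned_free_sketch(void* payload)
{
    if (payload)
    {
        std::free(reinterpret_cast<void**>(payload)[-1]);   // free via the stashed pointer
    }
}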
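
Editor's note: LL_ALIGN_PREFIX/LL_ALIGN_POSTFIX paper over the compiler-specific spellings of an alignment attribute (__declspec(align(n)) on MSVC, __attribute__((aligned(n))) on GCC/Clang), and LL_ALIGN_16 applies both around a declaration. A small usage sketch with a hypothetical struct that is not part of the commit:

// Hypothetical type: the member array is forced onto a 16-byte boundary,
// so it can be handed directly to the SSE loads/stores used in this header.
struct ExampleVec4
{
    LL_ALIGN_16(F32 mData[4]);  // expands to the correct prefix/postfix form per compiler
};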
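
Editor's note: the templated ll_aligned_malloc<ALIGNMENT>/ll_aligned_free<ALIGNMENT> pair added here resolves to a single call at compile time: plain malloc/free when the default heap alignment already suffices, the 16/32-byte helpers otherwise, and the fallback for anything else. A minimal usage sketch; the buffer size is arbitrary.

// Allocate a 16-byte-aligned scratch buffer and release it with the matching
// alignment so the corresponding deallocator is chosen.
void* scratch = ll_aligned_malloc<16>(4096);   // resolves to the cheapest 16-byte allocator for the platform
ll_assert_aligned(scratch, 16);                // compiles away unless SHOW_ASSERT is defined
// ... use the buffer ...
ll_aligned_free<16>(scratch);                  // alignment must mirror the allocation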
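
Editor's note: ll_memcpy_nonaliased_aligned_16() has the strict contract its asserts spell out: both pointers 16-byte aligned, buffers non-overlapping, and a byte count that is a non-zero multiple of 16. A hedged sketch of a conforming call; the size is illustrative.

const size_t BYTES = 256;   // must be a non-zero multiple of 16

char* src = static_cast<char*>(ll_aligned_malloc_16(BYTES));   // satisfies the alignment precondition
char* dst = static_cast<char*>(ll_aligned_malloc_16(BYTES));

// ... fill src ...

ll_memcpy_nonaliased_aligned_16(dst, src, BYTES);   // dst first, then src; buffers must not overlap

ll_aligned_free_16(dst);
ll_aligned_free_16(src);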
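
Editor's note: the LLMemory hunk swaps raw U32/F32 quantities for the typed wrappers pulled in from llunits.h (U32Kilobytes, F32Gigabytes), so the unit is carried by the type rather than the variable name. Assuming these behave like the other LLUnit-style types (constructible from a scalar in their own unit), a call site would look roughly like:

// Cap the heap at 1.5 GB; the BOOL argument keeps the existing failure-prevention flag.
LLMemory::initMaxHeapSizeGB(F32Gigabytes(1.5f), FALSE);

U32Kilobytes avail = LLMemory::getAvailableMemKB();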