Diffstat (limited to 'indra/llcommon/llmemory.h')
-rw-r--r-- | indra/llcommon/llmemory.h | 836 |
1 files changed, 418 insertions, 418 deletions
diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index ea360881c6..2c3f66fab8 100644
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -1,418 +1,418 @@
-/**
- * @file llmemory.h
- * @brief Memory allocation/deallocation header-stuff goes here.
- *
- * $LicenseInfo:firstyear=2002&license=viewerlgpl$
- * Second Life Viewer Source Code
- * Copyright (C) 2010, Linden Research, Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License only.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
- * $/LicenseInfo$
- */
-#ifndef LLMEMORY_H
-#define LLMEMORY_H
-
-#include "linden_common.h"
-#include "llunits.h"
-#include "stdtypes.h"
-#if !LL_WINDOWS
-#include <stdint.h>
-#endif
-
-class LLMutex ;
-
-#if LL_WINDOWS && LL_DEBUG
-#define LL_CHECK_MEMORY llassert(_CrtCheckMemory());
-#else
-#define LL_CHECK_MEMORY
-#endif
-
-
-#if LL_WINDOWS
-#define LL_ALIGN_OF __alignof
-#else
-#define LL_ALIGN_OF __alignof__
-#endif
-
-#if LL_WINDOWS
-#define LL_DEFAULT_HEAP_ALIGN 8
-#elif LL_DARWIN
-#define LL_DEFAULT_HEAP_ALIGN 16
-#elif LL_LINUX
-#define LL_DEFAULT_HEAP_ALIGN 8
-#endif
-
-
-LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
-
-#ifdef SHOW_ASSERT
-// This is incredibly expensive - in profiling Windows RWD builds, 30%
-// of CPU time was in alignment checks.
-//#define ASSERT_ALIGNMENT
-#endif
-
-#ifdef ASSERT_ALIGNMENT
-#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(uintptr_t(ptr),((U32)alignment))
-#else
-#define ll_assert_aligned(ptr,alignment)
-#endif
-
-#include <xmmintrin.h>
-
-template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
-{
- return reinterpret_cast<T*>(
- (uintptr_t(address) + 0xF) & ~0xF);
-}
-
-template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
-{
- return reinterpret_cast<T*>(
- (uintptr_t(address) + 0x3F) & ~0x3F);
-}
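
A small sketch (the buffer and offset are arbitrary examples, not from the source) of what these helpers compute: they round a pointer up to the next 16- or 64-byte boundary, and leave pointers that are already aligned unchanged.

    char buffer[256];
    char* p   = buffer + 3;                       // deliberately misaligned
    char* p16 = LL_NEXT_ALIGNED_ADDRESS(p);       // next multiple of 16 at or above p
    char* p64 = LL_NEXT_ALIGNED_ADDRESS_64(p);    // next multiple of 64 at or above p
    ll_assert_aligned(p16, 16);
    ll_assert_aligned(p64, 64);
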
-
-#if LL_LINUX || LL_DARWIN
-
-#define LL_ALIGN_PREFIX(x)
-#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
-
-#elif LL_WINDOWS
-
-#define LL_ALIGN_PREFIX(x) __declspec(align(x))
-#define LL_ALIGN_POSTFIX(x)
-
-#else
-#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
-#endif
-
-#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
-
-#define LL_ALIGN_NEW \
-public: \
- void* operator new(size_t size) \
- { \
- return ll_aligned_malloc_16(size); \
- } \
- \
- void operator delete(void* ptr) \
- { \
- ll_aligned_free_16(ptr); \
- } \
- \
- void* operator new[](size_t size) \
- { \
- return ll_aligned_malloc_16(size); \
- } \
- \
- void operator delete[](void* ptr) \
- { \
- ll_aligned_free_16(ptr); \
- }
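
As an illustration of how these macros are meant to be combined, here is a minimal sketch, assuming llmemory.h and the F32 typedef are in scope; LLExampleVec4 is a hypothetical name, not a class from the viewer source. The member carries a 16-byte alignment attribute and LL_ALIGN_NEW routes heap allocation through the aligned allocators so the alignment also holds for new'd instances.

    class LLExampleVec4              // hypothetical, for illustration only
    {
        LL_ALIGN_NEW                 // new/delete use ll_aligned_malloc_16/ll_aligned_free_16
    public:
        LL_ALIGN_16(F32 mData[4]);   // member declared with 16-byte alignment
    };

    // LLExampleVec4* v = new LLExampleVec4();  // heap storage is now 16-byte aligned
    // delete v;
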
-
-
-//------------------------------------------------------------------------------------------------
-//------------------------------------------------------------------------------------------------
- // To enable buffer overrun detection, predefine LL_DEBUG_BUFFER_OVERRUN in the current library
- // and change the preprocessor line below to: #if 1 && defined(LL_WINDOWS)
-
-#if 0 && defined(LL_WINDOWS)
- void* ll_aligned_malloc_fallback( size_t size, int align );
- void ll_aligned_free_fallback( void* ptr );
-//------------------------------------------------------------------------------------------------
-#else
- inline void* ll_aligned_malloc_fallback( size_t size, int align )
- {
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- #if defined(LL_WINDOWS)
- void* ret = _aligned_malloc(size, align);
- #else
- char* aligned = NULL;
- void* mem = malloc( size + (align - 1) + sizeof(void*) );
- if (mem)
- {
- aligned = ((char*)mem) + sizeof(void*);
- aligned += align - ((uintptr_t)aligned & (align - 1));
-
- ((void**)aligned)[-1] = mem;
- }
- void* ret = aligned;
- #endif
- LL_PROFILE_ALLOC(ret, size);
- return ret;
- }
-
- inline void ll_aligned_free_fallback( void* ptr )
- {
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- LL_PROFILE_FREE(ptr);
- #if defined(LL_WINDOWS)
- _aligned_free(ptr);
- #else
- if (ptr)
- {
- free( ((void**)ptr)[-1] );
- }
- #endif
- }
-#endif
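
A brief usage sketch of the fallback pair (the size and alignment values are arbitrary examples): the two functions must be used together, because on non-Windows builds the original malloc() pointer is stashed immediately before the aligned address, and only ll_aligned_free_fallback() knows to look for it there.

    void* buf = ll_aligned_malloc_fallback(1024, 64);  // request 64-byte alignment
    if (buf)
    {
        ll_assert_aligned(buf, 64);      // no-op unless ASSERT_ALIGNMENT is defined
        ll_aligned_free_fallback(buf);   // never pass this pointer to plain free()
    }
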
-//------------------------------------------------------------------------------------------------
-//------------------------------------------------------------------------------------------------
-
-inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
-#if defined(LL_WINDOWS)
- void* ret = _aligned_malloc(size, 16);
-#elif defined(LL_DARWIN)
- void* ret = malloc(size); // default osx malloc is 16 byte aligned.
-#else
- void *ret;
- if (0 != posix_memalign(&ret, 16, size))
- return nullptr;
-#endif
- LL_PROFILE_ALLOC(ret, size);
- return ret;
-}
-
-inline void ll_aligned_free_16(void *p)
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- LL_PROFILE_FREE(p);
-#if defined(LL_WINDOWS)
- _aligned_free(p);
-#elif defined(LL_DARWIN)
- return free(p);
-#else
- free(p); // posix_memalign() is compatible with heap deallocator
-#endif
-}
-
-inline void* ll_aligned_realloc_16(void* ptr, size_t size, size_t old_size) // returned hunk MUST be freed with ll_aligned_free_16().
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- LL_PROFILE_FREE(ptr);
-#if defined(LL_WINDOWS)
- void* ret = _aligned_realloc(ptr, size, 16);
-#elif defined(LL_DARWIN)
- void* ret = realloc(ptr,size); // default osx malloc is 16 byte aligned.
-#else
- //FIXME: memcpy is SLOW
- void* ret = ll_aligned_malloc_16(size);
- if (ptr)
- {
- if (ret)
- {
- // Only copy the size of the smallest memory block to avoid memory corruption.
- memcpy(ret, ptr, llmin(old_size, size));
- }
- ll_aligned_free_16(ptr);
- }
-#endif
- LL_PROFILE_ALLOC(ret, size);
- return ret;
-}
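
A minimal sketch of the *_16 lifecycle (buffer sizes are arbitrary): old_size must be passed to ll_aligned_realloc_16() because the generic path can only preserve as many bytes as the caller reports.

    size_t old_size = 256;
    size_t new_size = 512;
    void* data = ll_aligned_malloc_16(old_size);
    data = ll_aligned_realloc_16(data, new_size, old_size);  // contents up to old_size preserved
    // ... use data ...
    ll_aligned_free_16(data);   // must pair with the *_16 allocators on every platform
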
-
-inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
-#if defined(LL_WINDOWS)
- void* ret = _aligned_malloc(size, 32);
-#elif defined(LL_DARWIN)
- void* ret = ll_aligned_malloc_fallback( size, 32 );
-#else
- void *ret;
- if (0 != posix_memalign(&ret, 32, size))
- return nullptr;
-#endif
- LL_PROFILE_ALLOC(ret, size);
- return ret;
-}
-
-inline void ll_aligned_free_32(void *p)
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- LL_PROFILE_FREE(p);
-#if defined(LL_WINDOWS)
- _aligned_free(p);
-#elif defined(LL_DARWIN)
- ll_aligned_free_fallback( p );
-#else
- free(p); // posix_memalign() is compatible with heap deallocator
-#endif
-}
-
-// general purpose dispatch functions that are forced inline so they can compile down to a single call
-template<size_t ALIGNMENT>
-LL_FORCE_INLINE void* ll_aligned_malloc(size_t size)
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- void* ret;
- if (LL_DEFAULT_HEAP_ALIGN % ALIGNMENT == 0)
- {
- ret = malloc(size);
- LL_PROFILE_ALLOC(ret, size);
- }
- else if (ALIGNMENT == 16)
- {
- ret = ll_aligned_malloc_16(size);
- }
- else if (ALIGNMENT == 32)
- {
- ret = ll_aligned_malloc_32(size);
- }
- else
- {
- ret = ll_aligned_malloc_fallback(size, ALIGNMENT);
- }
- return ret;
-}
-
-template<size_t ALIGNMENT>
-LL_FORCE_INLINE void ll_aligned_free(void* ptr)
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- if (ALIGNMENT == LL_DEFAULT_HEAP_ALIGN)
- {
- LL_PROFILE_FREE(ptr);
- free(ptr);
- }
- else if (ALIGNMENT == 16)
- {
- ll_aligned_free_16(ptr);
- }
- else if (ALIGNMENT == 32)
- {
- return ll_aligned_free_32(ptr);
- }
- else
- {
- return ll_aligned_free_fallback(ptr);
- }
-}
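
A short sketch of the dispatch templates (the sizes are arbitrary): because ALIGNMENT is a compile-time constant, the if/else chain folds away and each call reduces to a single allocator call, as the comment above states.

    void* a8  = ll_aligned_malloc<8>(128);    // plain malloc() where the default heap alignment suffices
    void* a32 = ll_aligned_malloc<32>(128);   // dispatched to ll_aligned_malloc_32()
    ll_aligned_free<8>(a8);
    ll_aligned_free<32>(a32);                 // free with the same ALIGNMENT used to allocate
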
-
-// Copy 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP.
-// Source and dest must be 16-byte aligned and size must be a multiple of 16.
-//
-inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
-{
- LL_PROFILE_ZONE_SCOPED_CATEGORY_MEMORY;
- assert(src != NULL);
- assert(dst != NULL);
- assert(bytes > 0);
- assert((bytes % sizeof(F32))== 0);
- ll_assert_aligned(src,16);
- ll_assert_aligned(dst,16);
-
- assert((src < dst) ? ((src + bytes) <= dst) : ((dst + bytes) <= src));
- assert(bytes%16==0);
-
- char* end = dst + bytes;
-
- if (bytes > 64)
- {
-
- // Find start of 64b aligned area within block
- //
- void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
-
- //at least 64 bytes before the end of the destination, switch to 16 byte copies
- void* end_64 = end-64;
-
- // Prefetch the head of the 64b area now
- //
- _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
- _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
- _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
- _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
-
- // Copy 16b chunks until we're 64b aligned
- //
- while (dst < begin_64)
- {
-
- _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
- dst += 16;
- src += 16;
- }
-
- // Copy 64b chunks up to the tail
- //
- // might be good to shmoo the 512b prefetch offset
- // (characterize performance for various values)
- //
- while (dst < end_64)
- {
- _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
- _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
- _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
- _mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
- _mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
- _mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
- dst += 64;
- src += 64;
- }
- }
-
- // Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
- //
- while (dst < end)
- {
- _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
- dst += 16;
- src += 16;
- }
-}
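
A usage sketch (the buffer size is an arbitrary example) that satisfies the preconditions asserted above: both pointers come from the 16-byte allocator, the byte count is a multiple of 16, and the buffers do not overlap.

    const size_t bytes = 4096;                        // multiple of 16
    char* src = (char*)ll_aligned_malloc_16(bytes);
    char* dst = (char*)ll_aligned_malloc_16(bytes);
    // ... fill src ...
    ll_memcpy_nonaliased_aligned_16(dst, src, bytes);
    ll_aligned_free_16(src);
    ll_aligned_free_16(dst);
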
-
-#ifndef __DEBUG_PRIVATE_MEM__
-#define __DEBUG_PRIVATE_MEM__ 0
-#endif
-
-class LL_COMMON_API LLMemory
-{
-public:
- // Return the resident set size of the current process, in bytes.
- // Return value is zero if not known.
- static U64 getCurrentRSS();
- static void* tryToAlloc(void* address, U32 size);
- static void initMaxHeapSizeGB(F32Gigabytes max_heap_size);
- static void updateMemoryInfo() ;
- static void logMemoryInfo(bool update = false);
-
- static U32Kilobytes getAvailableMemKB() ;
- static U32Kilobytes getMaxMemKB() ;
- static U32Kilobytes getAllocatedMemKB() ;
-private:
- static U32Kilobytes sAvailPhysicalMemInKB ;
- static U32Kilobytes sMaxPhysicalMemInKB ;
- static U32Kilobytes sAllocatedMemInKB;
- static U32Kilobytes sAllocatedPageSizeInKB ;
-
- static U32Kilobytes sMaxHeapSizeInKB;
-};
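
A sketch of typical use of the static interface (the call ordering is only an assumption about intended use): refresh the cached figures, then read them; getCurrentRSS() reports bytes and returns zero when the platform cannot supply the value.

    LLMemory::updateMemoryInfo();
    U64 rss_bytes       = LLMemory::getCurrentRSS();
    U32Kilobytes avail  = LLMemory::getAvailableMemKB();
    U32Kilobytes in_use = LLMemory::getAllocatedMemKB();
    LLMemory::logMemoryInfo();   // pass true to update and log in one call
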
-
-// LLRefCount moved to llrefcount.h
-
-// LLPointer moved to llpointer.h
-
-// LLSafeHandle moved to llsafehandle.h
-
-// LLSingleton moved to llsingleton.h
-
-
-
-
-#endif
+[418 inserted lines omitted: identical in content to the removed lines shown above]