Diffstat (limited to 'indra/llcommon')
-rw-r--r--   indra/llcommon/CMakeLists.txt   |   1
-rw-r--r--   indra/llcommon/llalignedarray.h | 139
-rw-r--r--   indra/llcommon/llmemory.h       | 116
3 files changed, 250 insertions, 6 deletions
diff --git a/indra/llcommon/CMakeLists.txt b/indra/llcommon/CMakeLists.txt
index e019c17280..0c2ceebb52 100644
--- a/indra/llcommon/CMakeLists.txt
+++ b/indra/llcommon/CMakeLists.txt
@@ -121,6 +121,7 @@ set(llcommon_HEADER_FILES
     linden_common.h
     linked_lists.h
     llaccountingcost.h
+    llalignedarray.h
     llallocator.h
     llallocator_heap_profile.h
     llagentconstants.h
diff --git a/indra/llcommon/llalignedarray.h b/indra/llcommon/llalignedarray.h
new file mode 100644
index 0000000000..ed8fd31205
--- /dev/null
+++ b/indra/llcommon/llalignedarray.h
@@ -0,0 +1,139 @@
+/**
+ * @file llalignedarray.h
+ * @brief A static array which obeys alignment restrictions and mimics std::vector accessors.
+ *
+ * $LicenseInfo:firstyear=2002&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2010, Linden Research, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_LLALIGNEDARRAY_H
+#define LL_LLALIGNEDARRAY_H
+
+#include "llmemory.h"
+
+template <class T, U32 alignment>
+class LLAlignedArray
+{
+public:
+    T* mArray;
+    U32 mElementCount;
+    U32 mCapacity;
+
+    LLAlignedArray();
+    ~LLAlignedArray();
+
+    void push_back(const T& elem);
+    U32 size() const { return mElementCount; }
+    void resize(U32 size);
+    T* append(S32 N);
+    T& operator[](int idx);
+    const T& operator[](int idx) const;
+};
+
+template <class T, U32 alignment>
+LLAlignedArray<T, alignment>::LLAlignedArray()
+{
+    llassert(alignment >= 16);
+    mArray = NULL;
+    mElementCount = 0;
+    mCapacity = 0;
+}
+
+template <class T, U32 alignment>
+LLAlignedArray<T, alignment>::~LLAlignedArray()
+{
+    ll_aligned_free(mArray);
+    mArray = NULL;
+    mElementCount = 0;
+    mCapacity = 0;
+}
+
+template <class T, U32 alignment>
+void LLAlignedArray<T, alignment>::push_back(const T& elem)
+{
+    T* old_buf = NULL;
+    if (mCapacity <= mElementCount)
+    {
+        mCapacity++;
+        mCapacity *= 2;
+        T* new_buf = (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment);
+        if (mArray)
+        {
+            ll_memcpy_nonaliased_aligned_16((char*)new_buf, (char*)mArray, sizeof(T)*mElementCount);
+        }
+        old_buf = mArray;
+        mArray = new_buf;
+    }
+
+    mArray[mElementCount++] = elem;
+
+    //delete old array here to prevent error on a.push_back(a[0])
+    ll_aligned_free(old_buf);
+}
+
+template <class T, U32 alignment>
+void LLAlignedArray<T, alignment>::resize(U32 size)
+{
+    if (mCapacity < size)
+    {
+        mCapacity = size+mCapacity*2;
+        T* new_buf = mCapacity > 0 ? (T*) ll_aligned_malloc(mCapacity*sizeof(T), alignment) : NULL;
+        if (mArray)
+        {
+            ll_memcpy_nonaliased_aligned_16((char*) new_buf, (char*) mArray, sizeof(T)*mElementCount);
+            ll_aligned_free(mArray);
+        }
+
+        /*for (U32 i = mElementCount; i < mCapacity; ++i)
+        {
+            new(new_buf+i) T();
+        }*/
+        mArray = new_buf;
+    }
+
+    mElementCount = size;
+}
+
+
+template <class T, U32 alignment>
+T& LLAlignedArray<T, alignment>::operator[](int idx)
+{
+    llassert(idx < mElementCount);
+    return mArray[idx];
+}
+
+template <class T, U32 alignment>
+const T& LLAlignedArray<T, alignment>::operator[](int idx) const
+{
+    llassert(idx < mElementCount);
+    return mArray[idx];
+}
+
+template <class T, U32 alignment>
+T* LLAlignedArray<T, alignment>::append(S32 N)
+{
+    U32 sz = size();
+    resize(sz+N);
+    return &((*this)[sz]);
+}
+
+#endif
+
diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h
index 46cabfadcd..61e30f11cc 100644
--- a/indra/llcommon/llmemory.h
+++ b/indra/llcommon/llmemory.h
@@ -36,6 +36,44 @@ class LLMutex ;
 #define LL_CHECK_MEMORY
 #endif
 
+LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
+
+#ifdef SHOW_ASSERT
+#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
+#else
+#define ll_assert_aligned(ptr,alignment)
+#endif
+
+#include <xmmintrin.h>
+
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
+{
+    return reinterpret_cast<T*>(
+        (reinterpret_cast<uintptr_t>(address) + 0xF) & ~0xF);
+}
+
+template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
+{
+    return reinterpret_cast<T*>(
+        (reinterpret_cast<uintptr_t>(address) + 0x3F) & ~0x3F);
+}
+
+#if LL_LINUX || LL_DARWIN
+
+#define LL_ALIGN_PREFIX(x)
+#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
+
+#elif LL_WINDOWS
+
+#define LL_ALIGN_PREFIX(x) __declspec(align(x))
+#define LL_ALIGN_POSTFIX(x)
+
+#else
+#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
+#endif
+
+#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
+
 inline void* ll_aligned_malloc( size_t size, int align )
 {
 #if defined(LL_WINDOWS)
@@ -144,6 +182,78 @@ inline void ll_aligned_free_32(void *p)
 {
 #endif
 }
+
+// Copy words 16-byte blocks from src to dst. Source and destination MUST NOT OVERLAP.
+// Source and dest must be 16-byte aligned and size must be multiple of 16.
+//
+inline void ll_memcpy_nonaliased_aligned_16(char* __restrict dst, const char* __restrict src, size_t bytes)
+{
+    assert(src != NULL);
+    assert(dst != NULL);
+    assert(bytes > 0);
+    assert((bytes % sizeof(F32))== 0);
+    ll_assert_aligned(src,16);
+    ll_assert_aligned(dst,16);
+    assert((src < dst) ? ((src + bytes) < dst) : ((dst + bytes) < src));
+    assert(bytes%16==0);
+
+    char* end = dst + bytes;
+
+    if (bytes > 64)
+    {
+
+        // Find start of 64b aligned area within block
+        //
+        void* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
+
+        //at least 64 bytes before the end of the destination, switch to 16 byte copies
+        void* end_64 = end-64;
+
+        // Prefetch the head of the 64b area now
+        //
+        _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
+
+        // Copy 16b chunks until we're 64b aligned
+        //
+        while (dst < begin_64)
+        {
+
+            _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+            dst += 16;
+            src += 16;
+        }
+
+        // Copy 64b chunks up to your tail
+        //
+        // might be good to shmoo the 512b prefetch offset
+        // (characterize performance for various values)
+        //
+        while (dst < end_64)
+        {
+            _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
+            _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
+            _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+            _mm_store_ps((F32*)(dst + 16), _mm_load_ps((F32*)(src + 16)));
+            _mm_store_ps((F32*)(dst + 32), _mm_load_ps((F32*)(src + 32)));
+            _mm_store_ps((F32*)(dst + 48), _mm_load_ps((F32*)(src + 48)));
+            dst += 64;
+            src += 64;
+        }
+    }
+
+    // Copy remainder 16b tail chunks (or ALL 16b chunks for sub-64b copies)
+    //
+    while (dst < end)
+    {
+        _mm_store_ps((F32*)dst, _mm_load_ps((F32*)src));
+        dst += 16;
+        src += 16;
+    }
+}
+
 #ifndef __DEBUG_PRIVATE_MEM__
 #define __DEBUG_PRIVATE_MEM__ 0
 #endif
@@ -552,13 +662,7 @@ void LLPrivateMemoryPoolTester::operator delete[](void* addr)
 
 // LLSingleton moved to llsingleton.h
 
-LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment);
-#ifdef SHOW_ASSERT
-#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment))
-#else
-#define ll_assert_aligned(ptr,alignment)
-#endif
 
 #endif
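
The new LLAlignedArray template mimics a small subset of std::vector (push_back, size, resize, append, operator[]) on top of ll_aligned_malloc/ll_aligned_free. A minimal usage sketch follows; Vec4 and build_positions are made-up names for illustration (the viewer itself stores 16-byte math types such as LLVector4a in these arrays), and the F32/U32 typedefs are assumed to come in through llmemory.h:

// Illustrative only -- not part of the commit above.
#include "llalignedarray.h"   // brings in llmemory.h

// A 16-byte element type; element size should stay a multiple of 16 bytes
// because the container copies its contents with ll_memcpy_nonaliased_aligned_16.
class LL_ALIGN_PREFIX(16) Vec4
{
public:
    F32 mV[4];
} LL_ALIGN_POSTFIX(16);

void build_positions()
{
    // Backing store is allocated with ll_aligned_malloc(..., 64).
    LLAlignedArray<Vec4, 64> positions;

    Vec4 p;
    p.mV[0] = 1.f; p.mV[1] = 2.f; p.mV[2] = 3.f; p.mV[3] = 1.f;

    positions.push_back(p);            // grows capacity geometrically
    positions.resize(16);              // element count, not bytes; new slots are uninitialized
    Vec4* tail = positions.append(4);  // make room for 4 more, returns the first new slot
    tail[0] = p;

    ll_assert_aligned(&positions[0], 64);  // storage honors the template alignment
}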
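The LL_ALIGN_PREFIX/LL_ALIGN_POSTFIX pair added to llmemory.h papers over the MSVC (__declspec(align(x))) versus GCC/Clang (__attribute__((aligned(x)))) syntax difference, and LL_NEXT_ALIGNED_ADDRESS rounds a pointer up to the next 16-byte boundary. A small sketch, with ExampleBlock and example_alignment as hypothetical names:

// Illustrative only -- not part of the commit above.
#include "llmemory.h"

struct ExampleBlock
{
    // Expands to the compiler-specific 16-byte alignment attribute on the member.
    LL_ALIGN_16(F32 mWeights[4]);
};

void example_alignment()
{
    char buffer[64];

    // Round an arbitrary pointer up to the next 16-byte boundary.
    char* aligned = LL_NEXT_ALIGNED_ADDRESS(buffer);

    ll_assert_aligned(aligned, 16);  // compiles away unless SHOW_ASSERT is defined
    (void) aligned;
}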
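ll_memcpy_nonaliased_aligned_16 is an SSE copy loop: it prefetches ahead, copies 16-byte chunks until the destination reaches a 64-byte boundary, streams 64-byte chunks, then finishes with 16-byte chunks. Its contract is strict: both pointers 16-byte aligned, byte count a positive multiple of 16, and non-overlapping ranges. A caller sketch under those assumptions (copy_vertex_block is a made-up name; each vertex here is taken to be four F32s, i.e. 16 bytes):

// Illustrative only -- not part of the commit above.
#include "llmemory.h"

void copy_vertex_block(const F32* src_positions, U32 vert_count)
{
    if (vert_count == 0)
    {
        return;  // the routine asserts bytes > 0
    }

    size_t bytes = vert_count * 4 * sizeof(F32);  // a multiple of 16 by construction

    // Destination must be 16-byte aligned; ll_aligned_malloc guarantees that here.
    F32* dst = (F32*) ll_aligned_malloc(bytes, 16);

    ll_assert_aligned(src_positions, 16);  // caller must supply an aligned source
    ll_memcpy_nonaliased_aligned_16((char*) dst, (const char*) src_positions, bytes);

    // ... use dst ...
    ll_aligned_free(dst);
}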
