diff options
36 files changed, 298 insertions, 108 deletions
| diff --git a/indra/cmake/Copy3rdPartyLibs.cmake b/indra/cmake/Copy3rdPartyLibs.cmake index 224e0a8b51..9f05c4cff2 100644 --- a/indra/cmake/Copy3rdPartyLibs.cmake +++ b/indra/cmake/Copy3rdPartyLibs.cmake @@ -57,10 +57,10 @@ if(WINDOWS)          libhunspell.dll          ) -    if(USE_GOOGLE_PERFTOOLS) +    if(USE_TCMALLOC)        set(debug_files ${debug_files} libtcmalloc_minimal-debug.dll)        set(release_files ${release_files} libtcmalloc_minimal.dll) -    endif(USE_GOOGLE_PERFTOOLS) +    endif(USE_TCMALLOC)      if (FMOD)        set(debug_files ${debug_files} fmod.dll) @@ -272,13 +272,16 @@ elseif(LINUX)          libopenal.so          libopenjpeg.so          libssl.so -        libtcmalloc_minimal.so          libuuid.so.16          libuuid.so.16.0.22          libssl.so.1.0.0          libfontconfig.so.1.4.4         ) +    if (USE_TCMALLOC) +      set(release_files ${release_files} "libtcmalloc_minimal.so") +    endif (USE_TCMALLOC) +      if (FMOD)        set(release_files ${release_files} "libfmod-3.75.so")      endif (FMOD) diff --git a/indra/cmake/GooglePerfTools.cmake b/indra/cmake/GooglePerfTools.cmake index d9f91193be..09501e0406 100644 --- a/indra/cmake/GooglePerfTools.cmake +++ b/indra/cmake/GooglePerfTools.cmake @@ -1,20 +1,34 @@  # -*- cmake -*-  include(Prebuilt) +# If you want to enable or disable TCMALLOC in viewer builds, this is the place. +# set ON or OFF as desired. +set (USE_TCMALLOC ON) +  if (STANDALONE)    include(FindGooglePerfTools)  else (STANDALONE)    if (WINDOWS) -    use_prebuilt_binary(tcmalloc) -    set(TCMALLOC_LIBRARIES  -        debug libtcmalloc_minimal-debug -        optimized libtcmalloc_minimal) +    if (USE_TCMALLOC) +       use_prebuilt_binary(tcmalloc) +       set(TCMALLOC_LIBRARIES  +         debug libtcmalloc_minimal-debug +         optimized libtcmalloc_minimal) +       set(TCMALLOC_LINK_FLAGS  "/INCLUDE:__tcmalloc") +    else (USE_TCMALLOC) +      set(TCMALLOC_LIBRARIES) +      set(TCMALLOC_LINK_FLAGS) +    endif (USE_TCMALLOC)      set(GOOGLE_PERFTOOLS_FOUND "YES")    endif (WINDOWS)    if (LINUX) -    use_prebuilt_binary(tcmalloc) -    set(TCMALLOC_LIBRARIES  -    tcmalloc) +    if (USE_TCMALLOC) +      use_prebuilt_binary(tcmalloc) +      set(TCMALLOC_LIBRARIES  +        tcmalloc) +    else (USE_TCMALLOC) +      set(TCMALLOC_LIBRARIES) +    endif (USE_TCMALLOC)      set(PROFILER_LIBRARIES profiler)      set(GOOGLE_PERFTOOLS_INCLUDE_DIR          ${LIBS_PREBUILT_DIR}/include) @@ -29,13 +43,19 @@ if (GOOGLE_PERFTOOLS_FOUND)  endif (GOOGLE_PERFTOOLS_FOUND)  if (WINDOWS) -    set(USE_GOOGLE_PERFTOOLS ON) +   set(USE_GOOGLE_PERFTOOLS ON)  endif (WINDOWS)  if (USE_GOOGLE_PERFTOOLS) -  set(TCMALLOC_FLAG -ULL_USE_TCMALLOC=1) +  if (USE_TCMALLOC) +    set(TCMALLOC_FLAG -DLL_USE_TCMALLOC=1) +  else (USE_TCMALLOC) +    set(TCMALLOC_FLAG -ULL_USE_TCMALLOC) +  endif (USE_TCMALLOC) +endif (USE_GOOGLE_PERFTOOLS) + +if (USE_GOOGLE_PERFTOOLS)    include_directories(${GOOGLE_PERFTOOLS_INCLUDE_DIR})    set(GOOGLE_PERFTOOLS_LIBRARIES ${TCMALLOC_LIBRARIES} ${STACKTRACE_LIBRARIES} ${PROFILER_LIBRARIES})  else (USE_GOOGLE_PERFTOOLS) -  set(TCMALLOC_FLAG -ULL_USE_TCMALLOC)  endif (USE_GOOGLE_PERFTOOLS) diff --git a/indra/cmake/LLAddBuildTest.cmake b/indra/cmake/LLAddBuildTest.cmake index 03ce46781c..543075db5b 100755 --- a/indra/cmake/LLAddBuildTest.cmake +++ b/indra/cmake/LLAddBuildTest.cmake @@ -214,6 +214,15 @@ FUNCTION(LL_ADD_INTEGRATION_TEST      SET_TARGET_PROPERTIES(INTEGRATION_TEST_${testname} PROPERTIES COMPILE_FLAGS -I"${TUT_INCLUDE_DIR}")    endif(STANDALONE) +  if (WINDOWS) +    SET_TARGET_PROPERTIES(INTEGRATION_TEST_${testname} +        PROPERTIES +        LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS ${TCMALLOC_LINK_FLAGS}" +        LINK_FLAGS_DEBUG "/NODEFAULTLIB:\"LIBCMT;LIBCMTD;MSVCRT\" /INCREMENTAL:NO" +        LINK_FLAGS_RELEASE "" +        ) +  endif (WINDOWS) +    # Add link deps to the executable    if(TEST_DEBUG)      message(STATUS "TARGET_LINK_LIBRARIES(INTEGRATION_TEST_${testname} ${libraries})") diff --git a/indra/llcommon/llallocator.cpp b/indra/llcommon/llallocator.cpp index 6f6abefc67..87654b5b97 100644 --- a/indra/llcommon/llallocator.cpp +++ b/indra/llcommon/llallocator.cpp @@ -27,7 +27,7 @@  #include "linden_common.h"  #include "llallocator.h" -#if LL_USE_TCMALLOC +#if (LL_USE_TCMALLOC && LL_USE_HEAP_PROFILER)  #include "google/heap-profiler.h"  #include "google/commandlineflags_public.h" diff --git a/indra/llcommon/llmemory.cpp b/indra/llcommon/llmemory.cpp index 3b9758f996..afaf366668 100644 --- a/indra/llcommon/llmemory.cpp +++ b/indra/llcommon/llmemory.cpp @@ -61,6 +61,18 @@ BOOL LLMemory::sEnableMemoryFailurePrevention = FALSE;  LLPrivateMemoryPoolManager::mem_allocation_info_t LLPrivateMemoryPoolManager::sMemAllocationTracker;  #endif +void ll_assert_aligned_func(uintptr_t ptr,U32 alignment) +{ +#ifdef SHOW_ASSERT +	// Redundant, place to set breakpoints. +	if (ptr%alignment!=0) +	{ +		llwarns << "alignment check failed" << llendl; +	} +	llassert(ptr%alignment==0); +#endif +} +  //static  void LLMemory::initClass()  { diff --git a/indra/llcommon/llmemory.h b/indra/llcommon/llmemory.h index bbbdaa6497..9dd776ff57 100644 --- a/indra/llcommon/llmemory.h +++ b/indra/llcommon/llmemory.h @@ -27,7 +27,6 @@  #define LLMEMORY_H  #include "llmemtype.h" -#if LL_DEBUG  inline void* ll_aligned_malloc( size_t size, int align )  {  	void* mem = malloc( size + (align - 1) + sizeof(void*) ); @@ -43,10 +42,11 @@ inline void ll_aligned_free( void* ptr )  	free( ((void**)ptr)[-1] );  } +#if !LL_USE_TCMALLOC  inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().  {  #if defined(LL_WINDOWS) -	return _mm_malloc(size, 16); +	return _aligned_malloc(size, 16);  #elif defined(LL_DARWIN)  	return malloc(size); // default osx malloc is 16 byte aligned.  #else @@ -58,21 +58,38 @@ inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed wi  #endif  } +inline void* ll_aligned_realloc_16(void* ptr, size_t size) // returned hunk MUST be freed with ll_aligned_free_16(). +{ +#if defined(LL_WINDOWS) +	return _aligned_realloc(ptr, size, 16); +#elif defined(LL_DARWIN) +	return realloc(ptr,size); // default osx malloc is 16 byte aligned. +#else +	return realloc(ptr,size); // FIXME not guaranteed to be aligned. +#endif +} +  inline void ll_aligned_free_16(void *p)  {  #if defined(LL_WINDOWS) -	_mm_free(p); +	_aligned_free(p);  #elif defined(LL_DARWIN)  	return free(p);  #else  	free(p); // posix_memalign() is compatible with heap deallocator  #endif  } +#else // USE_TCMALLOC +// ll_aligned_foo_16 are not needed with tcmalloc +#define ll_aligned_malloc_16 malloc +#define ll_aligned_realloc_16 realloc +#define ll_aligned_free_16 free +#endif // USE_TCMALLOC  inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().  {  #if defined(LL_WINDOWS) -	return _mm_malloc(size, 32); +	return _aligned_malloc(size, 32);  #elif defined(LL_DARWIN)  	return ll_aligned_malloc( size, 32 );  #else @@ -87,22 +104,13 @@ inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed wi  inline void ll_aligned_free_32(void *p)  {  #if defined(LL_WINDOWS) -	_mm_free(p); +	_aligned_free(p);  #elif defined(LL_DARWIN)  	ll_aligned_free( p );  #else  	free(p); // posix_memalign() is compatible with heap deallocator  #endif  } -#else // LL_DEBUG -// ll_aligned_foo are noops now that we use tcmalloc everywhere (tcmalloc aligns automatically at appropriate intervals) -#define ll_aligned_malloc( size, align ) malloc(size) -#define ll_aligned_free( ptr ) free(ptr) -#define ll_aligned_malloc_16 malloc -#define ll_aligned_free_16 free -#define ll_aligned_malloc_32 malloc -#define ll_aligned_free_32 free -#endif // LL_DEBUG  #ifndef __DEBUG_PRIVATE_MEM__  #define __DEBUG_PRIVATE_MEM__  0 @@ -512,4 +520,13 @@ void  LLPrivateMemoryPoolTester::operator delete[](void* addr)  // LLSingleton moved to llsingleton.h +LL_COMMON_API void ll_assert_aligned_func(uintptr_t ptr,U32 alignment); + +#ifdef SHOW_ASSERT +#define ll_assert_aligned(ptr,alignment) ll_assert_aligned_func(reinterpret_cast<uintptr_t>(ptr),((U32)alignment)) +#else +#define ll_assert_aligned(ptr,alignment) +#endif + +  #endif diff --git a/indra/llmath/CMakeLists.txt b/indra/llmath/CMakeLists.txt index b5e59c1ca3..5865ae030c 100644 --- a/indra/llmath/CMakeLists.txt +++ b/indra/llmath/CMakeLists.txt @@ -117,6 +117,7 @@ if (LL_TESTS)    # INTEGRATION TESTS    set(test_libs llmath llcommon ${LLCOMMON_LIBRARIES} ${WINDOWS_LIBRARIES})    # TODO: Some of these need refactoring to be proper Unit tests rather than Integration tests. +  LL_ADD_INTEGRATION_TEST(alignment "" "${test_libs}")    LL_ADD_INTEGRATION_TEST(llbbox llbbox.cpp "${test_libs}")    LL_ADD_INTEGRATION_TEST(llquaternion llquaternion.cpp "${test_libs}")    LL_ADD_INTEGRATION_TEST(mathmisc "" "${test_libs}") diff --git a/indra/llmath/llcamera.h b/indra/llmath/llcamera.h index ec67b91d05..0b591be622 100644 --- a/indra/llmath/llcamera.h +++ b/indra/llmath/llcamera.h @@ -60,7 +60,7 @@ static const F32 MAX_FIELD_OF_VIEW = 175.f * DEG_TO_RAD;  // roll(), pitch(), yaw()  // etc... - +LL_ALIGN_PREFIX(16)  class LLCamera  : 	public LLCoordFrame  { @@ -108,7 +108,7 @@ public:  	};  private: -	LLPlane mAgentPlanes[7];  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP +	LL_ALIGN_16(LLPlane mAgentPlanes[7]);  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP  	U8 mPlaneMask[8];         // 8 for alignment	  	F32 mView;					// angle between top and bottom frustum planes in radians. @@ -116,13 +116,13 @@ private:  	S32 mViewHeightInPixels;	// for ViewHeightInPixels() only  	F32 mNearPlane;  	F32 mFarPlane; -	LLPlane mLocalPlanes[4]; +	LL_ALIGN_16(LLPlane mLocalPlanes[4]);  	F32 mFixedDistance;			// Always return this distance, unless < 0  	LLVector3 mFrustCenter;		// center of frustum and radius squared for ultra-quick exclusion test  	F32 mFrustRadiusSquared; -	LLPlane mWorldPlanes[PLANE_NUM]; -	LLPlane mHorizPlanes[HORIZ_PLANE_NUM]; +	LL_ALIGN_16(LLPlane mWorldPlanes[PLANE_NUM]); +	LL_ALIGN_16(LLPlane mHorizPlanes[HORIZ_PLANE_NUM]);  	U32 mPlaneCount;  //defaults to 6, if setUserClipPlane is called, uses user supplied clip plane in @@ -208,7 +208,7 @@ protected:  	void calculateFrustumPlanes(F32 left, F32 right, F32 top, F32 bottom);  	void calculateFrustumPlanesFromWindow(F32 x1, F32 y1, F32 x2, F32 y2);  	void calculateWorldFrustumPlanes(); -}; +} LL_ALIGN_POSTFIX(16);  #endif diff --git a/indra/llmath/llmatrix3a.h b/indra/llmath/llmatrix3a.h index adb7e3389d..9916cfd2da 100644 --- a/indra/llmath/llmatrix3a.h +++ b/indra/llmath/llmatrix3a.h @@ -111,7 +111,7 @@ public:  protected: -	LLVector4a mColumns[3]; +	LL_ALIGN_16(LLVector4a mColumns[3]);  }; diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h index 27cf5b79f6..c4cefdb4fa 100644 --- a/indra/llmath/llmatrix4a.h +++ b/indra/llmath/llmatrix4a.h @@ -34,7 +34,7 @@  class LLMatrix4a  {  public: -	LLVector4a mMatrix[4]; +	LL_ALIGN_16(LLVector4a mMatrix[4]);  	inline void clear()  	{ diff --git a/indra/llmath/lloctree.h b/indra/llmath/lloctree.h index 1b11e83b4a..6c7744cdf1 100644 --- a/indra/llmath/lloctree.h +++ b/indra/llmath/lloctree.h @@ -88,7 +88,7 @@ public:  	typedef LLOctreeNode<T>		oct_node;  	typedef LLOctreeListener<T>	oct_listener; -	/*void* operator new(size_t size) +	void* operator new(size_t size)  	{  		return ll_aligned_malloc_16(size);  	} @@ -96,7 +96,7 @@ public:  	void operator delete(void* ptr)  	{  		ll_aligned_free_16(ptr); -	}*/ +	}  	LLOctreeNode(	const LLVector4a& center,   					const LLVector4a& size,  diff --git a/indra/llmath/llplane.h b/indra/llmath/llplane.h index a611894721..3c32441b11 100644 --- a/indra/llmath/llplane.h +++ b/indra/llmath/llplane.h @@ -36,6 +36,8 @@  // The plane normal = [A, B, C]  // The closest approach = D / sqrt(A*A + B*B + C*C) + +LL_ALIGN_PREFIX(16)  class LLPlane  {  public: @@ -94,7 +96,7 @@ public:  private:  	LLVector4a mV; -}; +} LL_ALIGN_POSTFIX(16); diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h index c7cdf7b32c..01458521ec 100644 --- a/indra/llmath/llsimdmath.h +++ b/indra/llmath/llsimdmath.h @@ -67,11 +67,10 @@ template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)  #define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16) - -  #include <xmmintrin.h>  #include <emmintrin.h> +#include "llmemory.h"  #include "llsimdtypes.h"  #include "llsimdtypes.inl" diff --git a/indra/llmath/llsimdtypes.inl b/indra/llmath/llsimdtypes.inl index 712239e425..e905c84954 100644 --- a/indra/llmath/llsimdtypes.inl +++ b/indra/llmath/llsimdtypes.inl @@ -62,6 +62,7 @@ inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b)  inline LLSimdScalar operator-(const LLSimdScalar& a)  {  	static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +	ll_assert_aligned(signMask,16);  	return _mm_xor_ps(*reinterpret_cast<const LLQuad*>(signMask), a);  } @@ -146,6 +147,7 @@ inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs)  inline LLSimdScalar LLSimdScalar::getAbs() const  {  	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; +	ll_assert_aligned(F_ABS_MASK_4A,16);  	return _mm_and_ps( mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));  } diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp index b66b7a7076..6edeb0fefe 100644 --- a/indra/llmath/llvector4a.cpp +++ b/indra/llmath/llvector4a.cpp @@ -24,6 +24,7 @@   * $/LicenseInfo$   */ +#include "llmemory.h"  #include "llmath.h"  #include "llquantize.h" @@ -44,7 +45,10 @@ extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F  	assert(dst != NULL);  	assert(bytes > 0);  	assert((bytes % sizeof(F32))== 0);  -	 +	ll_assert_aligned(src,16); +	ll_assert_aligned(dst,16); +	assert(bytes%16==0); +  	F32* end = dst + (bytes / sizeof(F32) );  	if (bytes > 64) @@ -189,6 +193,8 @@ void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )  		LLVector4a oneOverDelta;  		{  			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; +			ll_assert_aligned(F_TWO_4A,16); +			  			LLVector4a two; two.load4a( F_TWO_4A );  			// Here we use _mm_rcp_ps plus one round of newton-raphson diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h index 596082509d..0526793d3a 100644 --- a/indra/llmath/llvector4a.h +++ b/indra/llmath/llvector4a.h @@ -32,6 +32,7 @@ class LLRotation;  #include <assert.h>  #include "llpreprocessor.h" +#include "llmemory.h"  ///////////////////////////////////  // FIRST TIME USERS PLEASE READ @@ -46,6 +47,7 @@ class LLRotation;  // LLVector3/LLVector4.   ///////////////////////////////// +LL_ALIGN_PREFIX(16)  class LLVector4a  {  public: @@ -82,6 +84,7 @@ public:  	}  	// Copy words 16-byte blocks from src to dst. Source and destination must not overlap.  +	// Source and dest must be 16-byte aligned and size must be multiple of 16.  	static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes);  	//////////////////////////////////// @@ -90,6 +93,7 @@ public:  	LLVector4a()  	{ //DO NOT INITIALIZE -- The overhead is completely unnecessary +		ll_assert_aligned(this,16);  	}  	LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f) @@ -313,7 +317,7 @@ public:  private:  	LLQuad mQ; -}; +} LL_ALIGN_POSTFIX(16);  inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)  { diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl index 7ad22a5631..7c52ffef21 100644 --- a/indra/llmath/llvector4a.inl +++ b/indra/llmath/llvector4a.inl @@ -475,6 +475,7 @@ inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F3  inline LLBool32 LLVector4a::isFinite3() const  {  	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; +	ll_assert_aligned(nanOrInfMask,16);  	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);  	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );  	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV )); diff --git a/indra/llmath/llvector4logical.h b/indra/llmath/llvector4logical.h index dd66b09d43..c5698f7cea 100644 --- a/indra/llmath/llvector4logical.h +++ b/indra/llmath/llvector4logical.h @@ -27,6 +27,7 @@  #ifndef	LL_VECTOR4LOGICAL_H  #define	LL_VECTOR4LOGICAL_H +#include "llmemory.h"  ////////////////////////////  // LLVector4Logical @@ -77,6 +78,7 @@ public:  	inline LLVector4Logical& invert()  	{  		static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +		ll_assert_aligned(allOnes,16);  		mQ = _mm_andnot_ps( mQ, *(LLQuad*)(allOnes) );  		return *this;  	} diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp index cc9744756f..11fa7080ce 100644 --- a/indra/llmath/llvolume.cpp +++ b/indra/llmath/llvolume.cpp @@ -95,17 +95,6 @@ const S32 SCULPT_MIN_AREA_DETAIL = 1;  extern BOOL gDebugGL; -void assert_aligned(void* ptr, uintptr_t alignment) -{ -#if 0 -	uintptr_t t = (uintptr_t) ptr; -	if (t%alignment != 0) -	{ -		llerrs << "Alignment check failed." << llendl; -	} -#endif -} -  BOOL check_same_clock_dir( const LLVector3& pt1, const LLVector3& pt2, const LLVector3& pt3, const LLVector3& norm)  {      	LLVector3 test = (pt2-pt1)%(pt3-pt2); @@ -6962,14 +6951,14 @@ void LLVolumeFace::resizeVertices(S32 num_verts)  	if (num_verts)  	{  		mPositions = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts); -		assert_aligned(mPositions, 16); +		ll_assert_aligned(mPositions, 16);  		mNormals = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts); -		assert_aligned(mNormals, 16); +		ll_assert_aligned(mNormals, 16);  		//pad texture coordinate block end to allow for QWORD reads  		S32 size = ((num_verts*sizeof(LLVector2)) + 0xF) & ~0xF;  		mTexCoords = (LLVector2*) ll_aligned_malloc_16(size); -		assert_aligned(mTexCoords, 16); +		ll_assert_aligned(mTexCoords, 16);  	}  	else  	{ @@ -6993,14 +6982,17 @@ void LLVolumeFace::pushVertex(const LLVector4a& pos, const LLVector4a& norm, con  //	S32 old_size = mNumVertices*16;  	//positions -	mPositions = (LLVector4a*) realloc(mPositions, new_size); +	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_size); +	ll_assert_aligned(mPositions,16);  	//normals -	mNormals = (LLVector4a*) realloc(mNormals, new_size); -	 +	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_size); +	ll_assert_aligned(mNormals,16); +  	//tex coords  	new_size = ((new_verts*8)+0xF) & ~0xF; -	mTexCoords = (LLVector2*) realloc(mTexCoords, new_size); +	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, new_size); +	ll_assert_aligned(mTexCoords,16);  	//just clear binormals @@ -7053,7 +7045,8 @@ void LLVolumeFace::pushIndex(const U16& idx)  	S32 old_size = ((mNumIndices*2)+0xF) & ~0xF;  	if (new_size != old_size)  	{ -		mIndices = (U16*) realloc(mIndices, new_size); +		mIndices = (U16*) ll_aligned_realloc_16(mIndices, new_size); +		ll_assert_aligned(mIndices,16);  	}  	mIndices[mNumIndices++] = idx; @@ -7094,12 +7087,12 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat  	}  	//allocate new buffer space -	mPositions = (LLVector4a*) realloc(mPositions, new_count*sizeof(LLVector4a)); -	assert_aligned(mPositions, 16); -	mNormals = (LLVector4a*) realloc(mNormals, new_count*sizeof(LLVector4a)); -	assert_aligned(mNormals, 16); -	mTexCoords = (LLVector2*) realloc(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF); -	assert_aligned(mTexCoords, 16); +	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_count*sizeof(LLVector4a)); +	ll_assert_aligned(mPositions, 16); +	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_count*sizeof(LLVector4a)); +	ll_assert_aligned(mNormals, 16); +	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF); +	ll_assert_aligned(mTexCoords, 16);  	mNumVertices = new_count; @@ -7145,7 +7138,7 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat  	new_count = mNumIndices + face.mNumIndices;  	//allocate new index buffer -	mIndices = (U16*) realloc(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF); +	mIndices = (U16*) ll_aligned_realloc_16(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF);  	//get destination address into new index buffer  	U16* dst_idx = mIndices+mNumIndices; diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h index 688d91dc40..dac97b14b5 100644 --- a/indra/llmath/llvolumeoctree.h +++ b/indra/llmath/llvolumeoctree.h @@ -37,6 +37,16 @@  class LLVolumeTriangle : public LLRefCount  {  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	LLVolumeTriangle()  	{ @@ -58,7 +68,7 @@ public:  	} -	LLVector4a mPositionGroup; +	LL_ALIGN_16(LLVector4a mPositionGroup);  	const LLVector4a* mV[3];  	U16 mIndex[3]; @@ -73,6 +83,16 @@ class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>  {  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);  	~LLVolumeOctreeListener(); @@ -99,8 +119,8 @@ public:  public: -	LLVector4a mBounds[2]; // bounding box (center, size) of this node and all its children (tight fit to objects) -	LLVector4a mExtents[2]; // extents (min, max) of this node and all its children +	LL_ALIGN_16(LLVector4a mBounds[2]); // bounding box (center, size) of this node and all its children (tight fit to objects) +	LL_ALIGN_16(LLVector4a mExtents[2]); // extents (min, max) of this node and all its children  };  class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle> diff --git a/indra/llprimitive/llmodel.cpp b/indra/llprimitive/llmodel.cpp index cb32a510b8..28ed051c55 100644 --- a/indra/llprimitive/llmodel.cpp +++ b/indra/llprimitive/llmodel.cpp @@ -1026,7 +1026,8 @@ void LLModel::setVolumeFaceData(  	if (tc.get())  	{ -		LLVector4a::memcpyNonAliased16((F32*) face.mTexCoords, (F32*) tc.get(), num_verts*2*sizeof(F32)); +		U32 tex_size = (num_verts*2*sizeof(F32)+0xF)&~0xF; +		LLVector4a::memcpyNonAliased16((F32*) face.mTexCoords, (F32*) tc.get(), tex_size);  	}  	else  	{ diff --git a/indra/newview/CMakeLists.txt b/indra/newview/CMakeLists.txt index c0a252637f..2ae162a623 100644 --- a/indra/newview/CMakeLists.txt +++ b/indra/newview/CMakeLists.txt @@ -1565,8 +1565,7 @@ if (WINDOWS)      set_target_properties(${VIEWER_BINARY_NAME}          PROPERTIES          # *TODO -reenable this once we get server usage sorted out -        #LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS /INCLUDE:\"__tcmalloc\"" -        LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS /INCLUDE:__tcmalloc " +        LINK_FLAGS "/debug /NODEFAULTLIB:LIBCMT /SUBSYSTEM:WINDOWS "          LINK_FLAGS_DEBUG "/NODEFAULTLIB:\"LIBCMT;LIBCMTD;MSVCRT\" /INCREMENTAL:NO"          LINK_FLAGS_RELEASE "/FORCE:MULTIPLE /MAP\"secondlife-bin.MAP\" /OPT:REF"          ) @@ -1585,7 +1584,7 @@ if (WINDOWS)      # In the meantime, if you have any ideas on how to easily maintain one list, either here or in viewer_manifest.py      # and have the build deps get tracked *please* tell me about it. -    if(USE_GOOGLE_PERFTOOLS) +    if(USE_TCMALLOC)        # Configure a var for tcmalloc location, if used.        # Note the need to specify multiple names explicitly.        set(GOOGLE_PERF_TOOLS_SOURCE @@ -1593,7 +1592,7 @@ if (WINDOWS)          ${SHARED_LIB_STAGING_DIR}/RelWithDebInfo/libtcmalloc_minimal.dll          ${SHARED_LIB_STAGING_DIR}/Debug/libtcmalloc_minimal-debug.dll          ) -     endif(USE_GOOGLE_PERFTOOLS) +     endif(USE_TCMALLOC)      set(COPY_INPUT_DEPENDENCIES diff --git a/indra/newview/llappviewerwin32.cpp b/indra/newview/llappviewerwin32.cpp index bad60a9757..53c77fa22e 100644 --- a/indra/newview/llappviewerwin32.cpp +++ b/indra/newview/llappviewerwin32.cpp @@ -130,6 +130,8 @@ int APIENTRY WINMAIN(HINSTANCE hInstance,  	// This results in a 2-3x improvement in opening a new Inventory window (which uses a large numebr of allocations)  	// Note: This won't work when running from the debugger unless the _NO_DEBUG_HEAP environment variable is set to 1 +	// Enable to get mem debugging within visual studio. +	//_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);  	_CrtSetDbgFlag(0); // default, just making explicit  	ULONG ulEnableLFH = 2; diff --git a/indra/newview/lldrawable.h b/indra/newview/lldrawable.h index e2064b79f8..8c7db61502 100644 --- a/indra/newview/lldrawable.h +++ b/indra/newview/lldrawable.h @@ -59,6 +59,7 @@ class LLViewerTexture;  const U32 SILHOUETTE_HIGHLIGHT = 0;  // All data for new renderer goes into this class. +LL_ALIGN_PREFIX(16)  class LLDrawable : public LLRefCount  {  public: @@ -75,6 +76,16 @@ public:  	static void initClass(); +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	LLDrawable()				{ init(); }  	MEM_TYPE_NEW(LLMemType::MTYPE_DRAWABLE); @@ -281,8 +292,8 @@ public:  	} EDrawableFlags;  private: //aligned members -	LLVector4a		mExtents[2]; -	LLVector4a		mPositionGroup; +	LL_ALIGN_16(LLVector4a		mExtents[2]); +	LL_ALIGN_16(LLVector4a		mPositionGroup);  public:  	LLXformMatrix       mXform; @@ -323,7 +334,7 @@ private:  	static U32 sNumZombieDrawables;  	static LLDynamicArrayPtr<LLPointer<LLDrawable> > sDeadList; -}; +} LL_ALIGN_POSTFIX(16);  inline LLFace* LLDrawable::getFace(const S32 i) const diff --git a/indra/newview/lldynamictexture.h b/indra/newview/lldynamictexture.h index e18090545d..c51e7d1e1a 100644 --- a/indra/newview/lldynamictexture.h +++ b/indra/newview/lldynamictexture.h @@ -36,6 +36,16 @@  class LLViewerDynamicTexture : public LLViewerTexture  {  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	enum  	{  		LL_VIEWER_DYNAMIC_TEXTURE = LLViewerTexture::DYNAMIC_TEXTURE, @@ -85,7 +95,7 @@ protected:  protected:  	BOOL mClamp;  	LLCoordGL mOrigin; -	LLCamera mCamera; +	LL_ALIGN_16(LLCamera mCamera);  	typedef std::set<LLViewerDynamicTexture*> instance_list_t;  	static instance_list_t sInstances[ LLViewerDynamicTexture::ORDER_COUNT ]; diff --git a/indra/newview/llface.h b/indra/newview/llface.h index 76ea5c853a..5dca27487f 100644 --- a/indra/newview/llface.h +++ b/indra/newview/llface.h @@ -59,6 +59,17 @@ class LLFace  {  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} + +  	LLFace(const LLFace& rhs)  	{  		*this = rhs; diff --git a/indra/newview/llfloatermodelpreview.cpp b/indra/newview/llfloatermodelpreview.cpp index 3fe535cbe8..a071f338ba 100755 --- a/indra/newview/llfloatermodelpreview.cpp +++ b/indra/newview/llfloatermodelpreview.cpp @@ -4774,7 +4774,8 @@ void LLModelPreview::genBuffers(S32 lod, bool include_skin_weights)  			if (vf.mTexCoords)  			{  				vb->getTexCoord0Strider(tc_strider); -				LLVector4a::memcpyNonAliased16((F32*) tc_strider.get(), (F32*) vf.mTexCoords, num_vertices*2*sizeof(F32)); +				S32 tex_size = (num_vertices*2*sizeof(F32)+0xF) & ~0xF; +				LLVector4a::memcpyNonAliased16((F32*) tc_strider.get(), (F32*) vf.mTexCoords, tex_size);  			}  			if (vf.mNormals) diff --git a/indra/newview/llpolymesh.cpp b/indra/newview/llpolymesh.cpp index 450f9b2be7..0860506086 100644 --- a/indra/newview/llpolymesh.cpp +++ b/indra/newview/llpolymesh.cpp @@ -129,22 +129,22 @@ void LLPolyMeshSharedData::freeMeshData()          {                  mNumVertices = 0; -                delete [] mBaseCoords; +                ll_aligned_free_16(mBaseCoords);                  mBaseCoords = NULL; -                delete [] mBaseNormals; +                ll_aligned_free_16(mBaseNormals);                  mBaseNormals = NULL; -                delete [] mBaseBinormals; +                ll_aligned_free_16(mBaseBinormals);                  mBaseBinormals = NULL; -                delete [] mTexCoords; +                ll_aligned_free_16(mTexCoords);                  mTexCoords = NULL; -                delete [] mDetailTexCoords; +                ll_aligned_free_16(mDetailTexCoords);                  mDetailTexCoords = NULL; -                delete [] mWeights; +                ll_aligned_free_16(mWeights);                  mWeights = NULL;          } @@ -229,12 +229,12 @@ U32 LLPolyMeshSharedData::getNumKB()  BOOL LLPolyMeshSharedData::allocateVertexData( U32 numVertices )  {          U32 i; -        mBaseCoords = new LLVector3[ numVertices ]; -        mBaseNormals = new LLVector3[ numVertices ]; -        mBaseBinormals = new LLVector3[ numVertices ]; -        mTexCoords = new LLVector2[ numVertices ]; -        mDetailTexCoords = new LLVector2[ numVertices ]; -        mWeights = new F32[ numVertices ]; +        mBaseCoords = (LLVector3*) ll_aligned_malloc_16(numVertices*sizeof(LLVector3)); +        mBaseNormals = (LLVector3*) ll_aligned_malloc_16(numVertices*sizeof(LLVector3)); +        mBaseBinormals = (LLVector3*) ll_aligned_malloc_16(numVertices*sizeof(LLVector3)); +        mTexCoords = (LLVector2*) ll_aligned_malloc_16(numVertices*sizeof(LLVector2)); +        mDetailTexCoords = (LLVector2*) ll_aligned_malloc_16(numVertices*sizeof(LLVector2)); +        mWeights = (F32*) ll_aligned_malloc_16(numVertices*sizeof(F32));          for (i = 0; i < numVertices; i++)          {                  mWeights[i] = 0.f; diff --git a/indra/newview/llspatialpartition.cpp b/indra/newview/llspatialpartition.cpp index 325a2d3004..45ef8f1a6d 100644 --- a/indra/newview/llspatialpartition.cpp +++ b/indra/newview/llspatialpartition.cpp @@ -529,6 +529,7 @@ void LLSpatialGroup::setVisible()  void LLSpatialGroup::validate()  { +	ll_assert_aligned(this,64);  #if LL_OCTREE_PARANOIA_CHECK  	sg_assert(!isState(DIRTY)); @@ -1195,6 +1196,8 @@ LLSpatialGroup::LLSpatialGroup(OctreeNode* node, LLSpatialPartition* part) :  	mCurUpdatingSlotp(NULL),  	mCurUpdatingTexture (NULL)  { +	ll_assert_aligned(this,16); +	  	sNodeCount++;  	LLMemType mt(LLMemType::MTYPE_SPACE_PARTITION); diff --git a/indra/newview/llspatialpartition.h b/indra/newview/llspatialpartition.h index f0e4f15a83..7968c28900 100644 --- a/indra/newview/llspatialpartition.h +++ b/indra/newview/llspatialpartition.h @@ -68,6 +68,16 @@ protected:  	~LLDrawInfo();	  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	LLDrawInfo(const LLDrawInfo& rhs)  	{ @@ -106,7 +116,7 @@ public:  	F32 mPartSize;  	F32 mVSize;  	LLSpatialGroup* mGroup; -	LLFace* mFace; //associated face +	LL_ALIGN_16(LLFace* mFace); //associated face  	F32 mDistance;  	U32 mDrawMode; @@ -181,7 +191,7 @@ public:  	};  }; -LL_ALIGN_PREFIX(64) +LL_ALIGN_PREFIX(16)  class LLSpatialGroup : public LLOctreeListener<LLDrawable>  {  	friend class LLSpatialPartition; @@ -193,6 +203,16 @@ public:  		*this = rhs;  	} +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	const LLSpatialGroup& operator=(const LLSpatialGroup& rhs)  	{  		llerrs << "Illegal operation!" << llendl; @@ -370,12 +390,12 @@ public:  		V4_COUNT = 10  	} eV4Index; -	LLVector4a mBounds[2]; // bounding box (center, size) of this node and all its children (tight fit to objects) -	LLVector4a mExtents[2]; // extents (min, max) of this node and all its children -	LLVector4a mObjectExtents[2]; // extents (min, max) of objects in this node -	LLVector4a mObjectBounds[2]; // bounding box (center, size) of objects in this node -	LLVector4a mViewAngle; -	LLVector4a mLastUpdateViewAngle; +	LL_ALIGN_16(LLVector4a mBounds[2]); // bounding box (center, size) of this node and all its children (tight fit to objects) +	LL_ALIGN_16(LLVector4a mExtents[2]); // extents (min, max) of this node and all its children +	LL_ALIGN_16(LLVector4a mObjectExtents[2]); // extents (min, max) of objects in this node +	LL_ALIGN_16(LLVector4a mObjectBounds[2]); // bounding box (center, size) of objects in this node +	LL_ALIGN_16(LLVector4a mViewAngle); +	LL_ALIGN_16(LLVector4a mLastUpdateViewAngle);  	F32 mObjectBoxSize; //cached mObjectBounds[1].getLength3() diff --git a/indra/newview/llviewercamera.h b/indra/newview/llviewercamera.h index 184033de42..b857c7fe89 100644 --- a/indra/newview/llviewercamera.h +++ b/indra/newview/llviewercamera.h @@ -51,9 +51,19 @@ const BOOL NOT_FOR_SELECTION = FALSE;  extern template class LLViewerCamera* LLSingleton<class LLViewerCamera>::getInstance();  #endif +LL_ALIGN_PREFIX(16)  class LLViewerCamera : public LLCamera, public LLSingleton<LLViewerCamera>  {  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	}  	typedef enum  	{ @@ -137,6 +147,7 @@ protected:  	S16					mZoomSubregion;  public: -}; +} LL_ALIGN_POSTFIX(16); +  #endif // LL_LLVIEWERCAMERA_H diff --git a/indra/newview/llviewerjointmesh.cpp b/indra/newview/llviewerjointmesh.cpp index f029ae5302..5d1aa870a3 100644 --- a/indra/newview/llviewerjointmesh.cpp +++ b/indra/newview/llviewerjointmesh.cpp @@ -729,8 +729,10 @@ void LLViewerJointMesh::updateFaceData(LLFace *face, F32 pixel_area, BOOL damp_w  				F32* vw = (F32*) vertex_weightsp.get();  				F32* cw = (F32*) clothing_weightsp.get();	 -				LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), num_verts*2*sizeof(F32)); -				LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), num_verts*sizeof(F32));	 +				S32 tc_size = (num_verts*2*sizeof(F32)+0xF) & ~0xF; +				LLVector4a::memcpyNonAliased16(tc, (F32*) mMesh->getTexCoords(), tc_size); +				S32 vw_size = (num_verts*sizeof(F32)+0xF) & ~0xF;	 +				LLVector4a::memcpyNonAliased16(vw, (F32*) mMesh->getWeights(), vw_size);	  				LLVector4a::memcpyNonAliased16(cw, (F32*) mMesh->getClothingWeights(), num_verts*4*sizeof(F32));	  			} diff --git a/indra/newview/llvoavatar.cpp b/indra/newview/llvoavatar.cpp index 33dc12c473..1b7009a5c2 100644 --- a/indra/newview/llvoavatar.cpp +++ b/indra/newview/llvoavatar.cpp @@ -2692,7 +2692,7 @@ void LLVOAvatar::idleUpdateMisc(bool detailed_update)  	if (isImpostor() && !mNeedsImpostorUpdate)  	{ -		LLVector4a ext[2]; +		LL_ALIGN_16(LLVector4a ext[2]);  		F32 distance;  		LLVector3 angle; diff --git a/indra/newview/llvoavatar.h b/indra/newview/llvoavatar.h index 6fb56a4c0b..4081a1408d 100644 --- a/indra/newview/llvoavatar.h +++ b/indra/newview/llvoavatar.h @@ -93,6 +93,16 @@ protected:   **/  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	LLVOAvatar(const LLUUID &id, const LLPCode pcode, LLViewerRegion *regionp);  	virtual void		markDead();  	static void			initClass(); // Initialize data that's only init'd once per class. @@ -215,7 +225,7 @@ public:  	bool isBuilt() const { return mIsBuilt; }  private: //aligned members -	LLVector4a	mImpostorExtents[2]; +	LL_ALIGN_16(LLVector4a	mImpostorExtents[2]);  private:  	BOOL			mSupportsAlphaLayers; // For backwards compatibility, TRUE for 1.23+ clients diff --git a/indra/newview/llvoavatarself.h b/indra/newview/llvoavatarself.h index 543891ca63..2b273e616c 100644 --- a/indra/newview/llvoavatarself.h +++ b/indra/newview/llvoavatarself.h @@ -49,6 +49,16 @@ class LLVOAvatarSelf :   **/  public: +	void* operator new(size_t size) +	{ +		return ll_aligned_malloc_16(size); +	} + +	void operator delete(void* ptr) +	{ +		ll_aligned_free_16(ptr); +	} +  	LLVOAvatarSelf(const LLUUID &id, const LLPCode pcode, LLViewerRegion *regionp);  	virtual 				~LLVOAvatarSelf();  	virtual void			markDead(); diff --git a/indra/newview/viewer_manifest.py b/indra/newview/viewer_manifest.py index 7c6b5403e1..894d2f0925 100644 --- a/indra/newview/viewer_manifest.py +++ b/indra/newview/viewer_manifest.py @@ -1080,7 +1080,15 @@ class Linux_i686Manifest(LinuxManifest):              # previous call did, without having to explicitly state the              # version number.              self.path("libfontconfig.so.*.*") -            self.path("libtcmalloc.so*") #formerly called google perf tools +            try: +                self.path("libtcmalloc.so", "libtcmalloc.so") #formerly called google perf tools +                self.path("libtcmalloc.so.0", "libtcmalloc.so.0") #formerly called google perf tools +                self.path("libtcmalloc.so.0.1.0", "libtcmalloc.so.0.1.0") #formerly called google perf tools +                pass +            except: +                print "tcmalloc files not found, skipping" +                pass +              try:                      self.path("libfmod-3.75.so")                      pass | 
