16 files changed, 205 insertions, 47 deletions
diff --git a/indra/llmath/CMakeLists.txt b/indra/llmath/CMakeLists.txt
index b5e59c1ca3..5865ae030c 100644
--- a/indra/llmath/CMakeLists.txt
+++ b/indra/llmath/CMakeLists.txt
@@ -117,6 +117,7 @@ if (LL_TESTS)
   # INTEGRATION TESTS
   set(test_libs llmath llcommon ${LLCOMMON_LIBRARIES} ${WINDOWS_LIBRARIES})
   # TODO: Some of these need refactoring to be proper Unit tests rather than Integration tests.
+  LL_ADD_INTEGRATION_TEST(alignment "" "${test_libs}")
   LL_ADD_INTEGRATION_TEST(llbbox llbbox.cpp "${test_libs}")
   LL_ADD_INTEGRATION_TEST(llquaternion llquaternion.cpp "${test_libs}")
   LL_ADD_INTEGRATION_TEST(mathmisc "" "${test_libs}")
diff --git a/indra/llmath/llcamera.h b/indra/llmath/llcamera.h
index ec67b91d05..0b591be622 100644
--- a/indra/llmath/llcamera.h
+++ b/indra/llmath/llcamera.h
@@ -60,7 +60,7 @@ static const F32 MAX_FIELD_OF_VIEW = 175.f * DEG_TO_RAD;
 // roll(), pitch(), yaw()
 // etc...
 
-
+LL_ALIGN_PREFIX(16)
 class LLCamera
 : 	public LLCoordFrame
 {
@@ -108,7 +108,7 @@ public:
 	};
 
 private:
-	LLPlane mAgentPlanes[7];  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
+	LL_ALIGN_16(LLPlane mAgentPlanes[7]);  //frustum planes in agent space a la gluUnproject (I'm a bastard, I know) - DaveP
 	U8 mPlaneMask[8];         // 8 for alignment	
 	
 	F32 mView;					// angle between top and bottom frustum planes in radians.
@@ -116,13 +116,13 @@ private:
 	S32 mViewHeightInPixels;	// for ViewHeightInPixels() only
 	F32 mNearPlane;
 	F32 mFarPlane;
-	LLPlane mLocalPlanes[4];
+	LL_ALIGN_16(LLPlane mLocalPlanes[4]);
 	F32 mFixedDistance;			// Always return this distance, unless < 0
 	LLVector3 mFrustCenter;		// center of frustum and radius squared for ultra-quick exclusion test
 	F32 mFrustRadiusSquared;
 	
-	LLPlane mWorldPlanes[PLANE_NUM];
-	LLPlane mHorizPlanes[HORIZ_PLANE_NUM];
+	LL_ALIGN_16(LLPlane mWorldPlanes[PLANE_NUM]);
+	LL_ALIGN_16(LLPlane mHorizPlanes[HORIZ_PLANE_NUM]);
 
 	U32 mPlaneCount;  //defaults to 6, if setUserClipPlane is called, uses user supplied clip plane in
 
@@ -208,7 +208,7 @@ protected:
 	void calculateFrustumPlanes(F32 left, F32 right, F32 top, F32 bottom);
 	void calculateFrustumPlanesFromWindow(F32 x1, F32 y1, F32 x2, F32 y2);
 	void calculateWorldFrustumPlanes();
-};
+} LL_ALIGN_POSTFIX(16);
 
 
 #endif
diff --git a/indra/llmath/llmath.h b/indra/llmath/llmath.h
index 9297bcbac2..b93f89d674 100644
--- a/indra/llmath/llmath.h
+++ b/indra/llmath/llmath.h
@@ -85,7 +85,7 @@ const F32	F_ALMOST_ONE	= 1.0f - F_ALMOST_ZERO;
 const F32 FP_MAG_THRESHOLD = 0.0000001f;
 
 // TODO: Replace with logic like is_approx_equal
-inline BOOL is_approx_zero( F32 f ) { return (-F_APPROXIMATELY_ZERO < f) && (f < F_APPROXIMATELY_ZERO); }
+inline bool is_approx_zero( F32 f ) { return (-F_APPROXIMATELY_ZERO < f) && (f < F_APPROXIMATELY_ZERO); }
 
 // These functions work by interpreting sign+exp+mantissa as an unsigned
 // integer.
@@ -111,13 +111,13 @@ inline BOOL is_approx_zero( F32 f ) { return (-F_APPROXIMATELY_ZERO < f) && (f <
 // WARNING: Infinity is comparable with F32_MAX and negative 
 // infinity is comparable with F32_MIN
 
-inline BOOL is_approx_equal(F32 x, F32 y)
+inline bool is_approx_equal(F32 x, F32 y)
 {
 	const S32 COMPARE_MANTISSA_UP_TO_BIT = 0x02;
 	return (std::abs((S32) ((U32&)x - (U32&)y) ) < COMPARE_MANTISSA_UP_TO_BIT);
 }
 
-inline BOOL is_approx_equal(F64 x, F64 y)
+inline bool is_approx_equal(F64 x, F64 y)
 {
 	const S64 COMPARE_MANTISSA_UP_TO_BIT = 0x02;
 	return (std::abs((S32) ((U64&)x - (U64&)y) ) < COMPARE_MANTISSA_UP_TO_BIT);
diff --git a/indra/llmath/llmatrix3a.h b/indra/llmath/llmatrix3a.h
index adb7e3389d..9916cfd2da 100644
--- a/indra/llmath/llmatrix3a.h
+++ b/indra/llmath/llmatrix3a.h
@@ -111,7 +111,7 @@ public:
 
 protected:
 
-	LLVector4a mColumns[3];
+	LL_ALIGN_16(LLVector4a mColumns[3]);
 
 };
 
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
index 27cf5b79f6..c4cefdb4fa 100644
--- a/indra/llmath/llmatrix4a.h
+++ b/indra/llmath/llmatrix4a.h
@@ -34,7 +34,7 @@
 class LLMatrix4a
 {
 public:
-	LLVector4a mMatrix[4];
+	LL_ALIGN_16(LLVector4a mMatrix[4]);
 
 	inline void clear()
 	{
diff --git a/indra/llmath/lloctree.h b/indra/llmath/lloctree.h
index 6c768863a6..c3f6f7de2a 100644
--- a/indra/llmath/lloctree.h
+++ b/indra/llmath/lloctree.h
@@ -89,7 +89,7 @@ public:
 	typedef LLOctreeNode<T>		oct_node;
 	typedef LLOctreeListener<T>	oct_listener;
 
-	/*void* operator new(size_t size)
+	void* operator new(size_t size)
 	{
 		return ll_aligned_malloc_16(size);
 	}
@@ -97,7 +97,7 @@ public:
 	void operator delete(void* ptr)
 	{
 		ll_aligned_free_16(ptr);
-	}*/
+	}
 
 	LLOctreeNode(	const LLVector4a& center, 
 					const LLVector4a& size, 
diff --git a/indra/llmath/llplane.h b/indra/llmath/llplane.h
index a611894721..3c32441b11 100644
--- a/indra/llmath/llplane.h
+++ b/indra/llmath/llplane.h
@@ -36,6 +36,8 @@
 // The plane normal = [A, B, C]
 // The closest approach = D / sqrt(A*A + B*B + C*C)
 
+
+LL_ALIGN_PREFIX(16)
 class LLPlane
 {
 public:
@@ -94,7 +96,7 @@ public:
 		
 private:
 	LLVector4a mV;
-};
+} LL_ALIGN_POSTFIX(16);
 
 
 
diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h
index c7cdf7b32c..01458521ec 100644
--- a/indra/llmath/llsimdmath.h
+++ b/indra/llmath/llsimdmath.h
@@ -67,11 +67,10 @@ template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
 
 #define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
 
-
-
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
+#include "llmemory.h"
 #include "llsimdtypes.h"
 #include "llsimdtypes.inl"
 
diff --git a/indra/llmath/llsimdtypes.inl b/indra/llmath/llsimdtypes.inl
index 712239e425..e905c84954 100644
--- a/indra/llmath/llsimdtypes.inl
+++ b/indra/llmath/llsimdtypes.inl
@@ -62,6 +62,7 @@ inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b)
 inline LLSimdScalar operator-(const LLSimdScalar& a)
 {
 	static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+	ll_assert_aligned(signMask,16);
 	return _mm_xor_ps(*reinterpret_cast<const LLQuad*>(signMask), a);
 }
 
@@ -146,6 +147,7 @@ inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs)
 inline LLSimdScalar LLSimdScalar::getAbs() const
 {
 	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
+	ll_assert_aligned(F_ABS_MASK_4A,16);
 	return _mm_and_ps( mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
 }
 
diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp
index b66b7a7076..6edeb0fefe 100644
--- a/indra/llmath/llvector4a.cpp
+++ b/indra/llmath/llvector4a.cpp
@@ -24,6 +24,7 @@
  * $/LicenseInfo$
  */
 
+#include "llmemory.h"
 #include "llmath.h"
 #include "llquantize.h"
 
@@ -44,7 +45,10 @@ extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F
 	assert(dst != NULL);
 	assert(bytes > 0);
 	assert((bytes % sizeof(F32))== 0); 
-	
+	ll_assert_aligned(src,16);
+	ll_assert_aligned(dst,16);
+	assert(bytes%16==0);
+
 	F32* end = dst + (bytes / sizeof(F32) );
 
 	if (bytes > 64)
@@ -189,6 +193,8 @@ void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
 		LLVector4a oneOverDelta;
 		{
 			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
+			ll_assert_aligned(F_TWO_4A,16);
+			
 			LLVector4a two; two.load4a( F_TWO_4A );
 
 			// Here we use _mm_rcp_ps plus one round of newton-raphson
diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h
index 596082509d..0526793d3a 100644
--- a/indra/llmath/llvector4a.h
+++ b/indra/llmath/llvector4a.h
@@ -32,6 +32,7 @@ class LLRotation;
 
 #include <assert.h>
 #include "llpreprocessor.h"
+#include "llmemory.h"
 
 ///////////////////////////////////
 // FIRST TIME USERS PLEASE READ
@@ -46,6 +47,7 @@ class LLRotation;
 // LLVector3/LLVector4. 
 /////////////////////////////////
 
+LL_ALIGN_PREFIX(16)
 class LLVector4a
 {
 public:
@@ -82,6 +84,7 @@ public:
 	}
 
 	// Copy words 16-byte blocks from src to dst. Source and destination must not overlap. 
+	// Source and dest must be 16-byte aligned and size must be multiple of 16.
 	static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes);
 
 	////////////////////////////////////
@@ -90,6 +93,7 @@ public:
 	
 	LLVector4a()
 	{ //DO NOT INITIALIZE -- The overhead is completely unnecessary
+		ll_assert_aligned(this,16);
 	}
 	
 	LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
@@ -313,7 +317,7 @@ public:
 
 private:
 	LLQuad mQ;
-};
+} LL_ALIGN_POSTFIX(16);
 
 inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)
 {
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 7ad22a5631..7c52ffef21 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -475,6 +475,7 @@ inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F3
 inline LLBool32 LLVector4a::isFinite3() const
 {
 	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+	ll_assert_aligned(nanOrInfMask,16);
 	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
 	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
 	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
diff --git a/indra/llmath/llvector4logical.h b/indra/llmath/llvector4logical.h
index dd66b09d43..c5698f7cea 100644
--- a/indra/llmath/llvector4logical.h
+++ b/indra/llmath/llvector4logical.h
@@ -27,6 +27,7 @@
 #ifndef	LL_VECTOR4LOGICAL_H
 #define	LL_VECTOR4LOGICAL_H
 
+#include "llmemory.h"
 
 ////////////////////////////
 // LLVector4Logical
@@ -77,6 +78,7 @@ public:
 	inline LLVector4Logical& invert()
 	{
 		static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+		ll_assert_aligned(allOnes,16);
 		mQ = _mm_andnot_ps( mQ, *(LLQuad*)(allOnes) );
 		return *this;
 	}
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
index 06ac0aa1f6..53d56e96da 100644
--- a/indra/llmath/llvolume.cpp
+++ b/indra/llmath/llvolume.cpp
@@ -95,17 +95,6 @@ const S32 SCULPT_MIN_AREA_DETAIL = 1;
 
 extern BOOL gDebugGL;
 
-void assert_aligned(void* ptr, uintptr_t alignment)
-{
-#if 0
-	uintptr_t t = (uintptr_t) ptr;
-	if (t%alignment != 0)
-	{
-		llerrs << "Alignment check failed." << llendl;
-	}
-#endif
-}
-
 BOOL check_same_clock_dir( const LLVector3& pt1, const LLVector3& pt2, const LLVector3& pt3, const LLVector3& norm)
 {    
 	LLVector3 test = (pt2-pt1)%(pt3-pt2);
@@ -6962,14 +6951,14 @@ void LLVolumeFace::resizeVertices(S32 num_verts)
 	if (num_verts)
 	{
 		mPositions = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		assert_aligned(mPositions, 16);
+		ll_assert_aligned(mPositions, 16);
 		mNormals = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*num_verts);
-		assert_aligned(mNormals, 16);
+		ll_assert_aligned(mNormals, 16);
 
 		//pad texture coordinate block end to allow for QWORD reads
 		S32 size = ((num_verts*sizeof(LLVector2)) + 0xF) & ~0xF;
 		mTexCoords = (LLVector2*) ll_aligned_malloc_16(size);
-		assert_aligned(mTexCoords, 16);
+		ll_assert_aligned(mTexCoords, 16);
 	}
 	else
 	{
@@ -6993,14 +6982,17 @@ void LLVolumeFace::pushVertex(const LLVector4a& pos, const LLVector4a& norm, con
 //	S32 old_size = mNumVertices*16;
 
 	//positions
-	mPositions = (LLVector4a*) realloc(mPositions, new_size);
+	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_size);
+	ll_assert_aligned(mPositions,16);
 	
 	//normals
-	mNormals = (LLVector4a*) realloc(mNormals, new_size);
-	
+	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_size);
+	ll_assert_aligned(mNormals,16);
+
 	//tex coords
 	new_size = ((new_verts*8)+0xF) & ~0xF;
-	mTexCoords = (LLVector2*) realloc(mTexCoords, new_size);
+	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, new_size);
+	ll_assert_aligned(mTexCoords,16);
 	
 
 	//just clear binormals
@@ -7053,7 +7045,8 @@ void LLVolumeFace::pushIndex(const U16& idx)
 	S32 old_size = ((mNumIndices*2)+0xF) & ~0xF;
 	if (new_size != old_size)
 	{
-		mIndices = (U16*) realloc(mIndices, new_size);
+		mIndices = (U16*) ll_aligned_realloc_16(mIndices, new_size);
+		ll_assert_aligned(mIndices,16);
 	}
 	
 	mIndices[mNumIndices++] = idx;
@@ -7094,12 +7087,12 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat
 	}
 
 	//allocate new buffer space
-	mPositions = (LLVector4a*) realloc(mPositions, new_count*sizeof(LLVector4a));
-	assert_aligned(mPositions, 16);
-	mNormals = (LLVector4a*) realloc(mNormals, new_count*sizeof(LLVector4a));
-	assert_aligned(mNormals, 16);
-	mTexCoords = (LLVector2*) realloc(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF);
-	assert_aligned(mTexCoords, 16);
+	mPositions = (LLVector4a*) ll_aligned_realloc_16(mPositions, new_count*sizeof(LLVector4a));
+	ll_assert_aligned(mPositions, 16);
+	mNormals = (LLVector4a*) ll_aligned_realloc_16(mNormals, new_count*sizeof(LLVector4a));
+	ll_assert_aligned(mNormals, 16);
+	mTexCoords = (LLVector2*) ll_aligned_realloc_16(mTexCoords, (new_count*sizeof(LLVector2)+0xF) & ~0xF);
+	ll_assert_aligned(mTexCoords, 16);
 	
 	mNumVertices = new_count;
 
@@ -7145,7 +7138,7 @@ void LLVolumeFace::appendFace(const LLVolumeFace& face, LLMatrix4& mat_in, LLMat
 	new_count = mNumIndices + face.mNumIndices;
 
 	//allocate new index buffer
-	mIndices = (U16*) realloc(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF);
+	mIndices = (U16*) ll_aligned_realloc_16(mIndices, (new_count*sizeof(U16)+0xF) & ~0xF);
 	
 	//get destination address into new index buffer
 	U16* dst_idx = mIndices+mNumIndices;
diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h
index ed54fe0835..9ae34a0c4e 100644
--- a/indra/llmath/llvolumeoctree.h
+++ b/indra/llmath/llvolumeoctree.h
@@ -37,6 +37,16 @@
 class LLVolumeTriangle : public LLRefCount
 {
 public:
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVolumeTriangle()
 	{
 		mBinIndex = -1;	
@@ -58,7 +68,7 @@ public:
 	
 	}
 
-	LLVector4a mPositionGroup;
+	LL_ALIGN_16(LLVector4a mPositionGroup);
 
 	const LLVector4a* mV[3];
 	U16 mIndex[3];
@@ -80,6 +90,16 @@ class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
 {
 public:
 	
+	void* operator new(size_t size)
+	{
+		return ll_aligned_malloc_16(size);
+	}
+
+	void operator delete(void* ptr)
+	{
+		ll_aligned_free_16(ptr);
+	}
+
 	LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);
 	~LLVolumeOctreeListener();
 	
@@ -106,8 +126,8 @@ public:
 	
 
 public:
-	LLVector4a mBounds[2]; // bounding box (center, size) of this node and all its children (tight fit to objects)
-	LLVector4a mExtents[2]; // extents (min, max) of this node and all its children
+	LL_ALIGN_16(LLVector4a mBounds[2]); // bounding box (center, size) of this node and all its children (tight fit to objects)
+	LL_ALIGN_16(LLVector4a mExtents[2]); // extents (min, max) of this node and all its children
 };
 
 class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle>
diff --git a/indra/llmath/tests/alignment_test.cpp b/indra/llmath/tests/alignment_test.cpp
new file mode 100644
index 0000000000..ac0c45ae6f
--- /dev/null
+++ b/indra/llmath/tests/alignment_test.cpp
@@ -0,0 +1,128 @@
+/**
+ * @file v3dmath_test.cpp
+ * @author Vir
+ * @date 2011-12
+ * @brief v3dmath test cases.
+ *
+ * $LicenseInfo:firstyear=2011&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2011, Linden Research, Inc.
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * 
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
+ * $/LicenseInfo$
+ */
+
+// Tests related to allocating objects with alignment constraints, particularly for SSE support.
+
+#include "linden_common.h"
+#include "../test/lltut.h"
+#include "../llmath.h"
+#include "../llsimdmath.h"
+#include "../llvector4a.h"
+
+void* operator new(size_t size)
+{
+	return ll_aligned_malloc_16(size);
+}
+
+void operator delete(void *p)
+{
+	ll_aligned_free_16(p);
+}
+
+namespace tut
+{
+
+#define is_aligned(ptr,alignment) ((reinterpret_cast<uintptr_t>(ptr))%(alignment)==0)
+#define is_aligned_relative(ptr,base_ptr,alignment) ((reinterpret_cast<uintptr_t>(ptr)-reinterpret_cast<uintptr_t>(base_ptr))%(alignment)==0)
+
+struct alignment_test {};
+
+typedef test_group<alignment_test> alignment_test_t;
+typedef alignment_test_t::object alignment_test_object_t;
+tut::alignment_test_t tut_alignment_test("LLAlignment");
+
+LL_ALIGN_PREFIX(16)
+class MyVector4a
+{
+	LLQuad mQ;
+} LL_ALIGN_POSTFIX(16);
+
+
+// Verify that aligned allocators perform as advertised.
+template<> template<>
+void alignment_test_object_t::test<1>()
+{
+#   ifdef LL_DEBUG
+	skip("This test fails on Windows when compiled in debug mode.");
+#   endif
+	
+	const int num_tests = 7;
+	void *align_ptr;
+	for (int i=0; i<num_tests; i++)
+	{
+		align_ptr = ll_aligned_malloc_16(sizeof(MyVector4a));
+		ensure("ll_aligned_malloc_16 failed", is_aligned(align_ptr,16));
+
+		align_ptr = ll_aligned_realloc_16(align_ptr,2*sizeof(MyVector4a));
+		ensure("ll_aligned_realloc_16 failed", is_aligned(align_ptr,16));
+
+		ll_aligned_free_16(align_ptr);
+
+		align_ptr = ll_aligned_malloc_32(sizeof(MyVector4a));
+		ensure("ll_aligned_malloc_32 failed", is_aligned(align_ptr,32));
+		ll_aligned_free_32(align_ptr);
+	}
+}
+
+// In-place allocation of objects and arrays.
+template<> template<>
+void alignment_test_object_t::test<2>()
+{
+	MyVector4a vec1;
+	ensure("LLAlignment vec1 unaligned", is_aligned(&vec1,16));
+	
+	MyVector4a veca[12];
+	ensure("LLAlignment veca unaligned", is_aligned(veca,16));
+}
+
+// Heap allocation of objects and arrays.
+template<> template<>
+void alignment_test_object_t::test<3>()
+{
+#   ifdef LL_DEBUG
+	skip("This test fails on Windows when compiled in debug mode.");
+#   endif
+	
+	const int ARR_SIZE = 7;
+	for(int i=0; i<ARR_SIZE; i++)
+	{
+		MyVector4a *vecp = new MyVector4a;
+		ensure("LLAlignment vecp unaligned", is_aligned(vecp,16));
+		delete vecp;
+	}
+
+	MyVector4a *veca = new MyVector4a[ARR_SIZE];
+	ensure("LLAligment veca base", is_aligned(veca,16));
+	for(int i=0; i<ARR_SIZE; i++)
+	{
+		std::cout << "veca[" << i << "]" << std::endl;
+		ensure("LLAlignment veca member unaligned", is_aligned(&veca[i],16));
+	}
+}
+
+}