Diffstat (limited to 'indra/llmath')
-rw-r--r--   indra/llmath/llmatrix3a.cpp       140
-rw-r--r--   indra/llmath/llmatrix3a.h         134
-rw-r--r--   indra/llmath/llmatrix3a.inl       125
-rw-r--r--   indra/llmath/llmatrix4a.h         149
-rw-r--r--   indra/llmath/llquaternion2.h      111
-rw-r--r--   indra/llmath/llquaternion2.inl    108
-rw-r--r--   indra/llmath/llsimdmath.h          95
-rw-r--r--   indra/llmath/llsimdtypes.h        130
-rw-r--r--   indra/llmath/llsimdtypes.inl      163
-rw-r--r--   indra/llmath/llvector4a.cpp       228
-rw-r--r--   indra/llmath/llvector4a.h         331
-rw-r--r--   indra/llmath/llvector4a.inl       599
-rw-r--r--   indra/llmath/llvector4logical.h   130
-rw-r--r--   indra/llmath/llvolumeoctree.cpp   208
-rw-r--r--   indra/llmath/llvolumeoctree.h     138
15 files changed, 2789 insertions, 0 deletions
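Editorial note (not part of the commit): the classes added below share one basic pattern: 16-byte aligned storage, whole-register loads and stores, and element-wise SSE arithmetic. A minimal usage sketch, using only names declared in the headers this commit adds (LL_ALIGN_16, LLVector4a::load4a/add/store4a); the function name is illustrative:

#include "llmath.h"

void example_add()
{
	LL_ALIGN_16(F32 a[4]) = { 1.f, 2.f, 3.f, 0.f };
	LL_ALIGN_16(F32 b[4]) = { 4.f, 5.f, 6.f, 0.f };
	LL_ALIGN_16(F32 out[4]);

	LLVector4a va, vb;
	va.load4a(a);    // aligned load; preferred over the slower loadua()
	vb.load4a(b);
	va.add(vb);      // one SSE addps across all four lanes
	va.store4a(out); // aligned store; out = { 5.f, 7.f, 9.f, 0.f }
}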
diff --git a/indra/llmath/llmatrix3a.cpp b/indra/llmath/llmatrix3a.cpp new file mode 100644 index 0000000000..b7468f4914 --- /dev/null +++ b/indra/llmath/llmatrix3a.cpp @@ -0,0 +1,140 @@ +/**  + * @file llmatrix3a.cpp + * @brief LLMatrix3a implementation - memory aligned and vectorized 3x3 matrix + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmath.h" + +static LL_ALIGN_16(const F32 M_IDENT_3A[12]) =  +												{	1.f, 0.f, 0.f, 0.f, // Column 1 +													0.f, 1.f, 0.f, 0.f, // Column 2 +													0.f, 0.f, 1.f, 0.f }; // Column 3 + +extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*> (M_IDENT_3A); + +void LLMatrix3a::setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs ) +{ +	const LLVector4a col0 = lhs.getColumn(0); +	const LLVector4a col1 = lhs.getColumn(1); +	const LLVector4a col2 = lhs.getColumn(2); + +	for ( int i = 0; i < 3; i++ ) +	{ +		LLVector4a xxxx = _mm_load_ss( rhs.mColumns[i].getF32ptr() ); +		xxxx.splat<0>( xxxx ); +		xxxx.mul( col0 ); + +		{ +			LLVector4a yyyy = _mm_load_ss( rhs.mColumns[i].getF32ptr() +  1 ); +			yyyy.splat<0>( yyyy ); +			yyyy.mul( col1 );  +			xxxx.add( yyyy ); +		} + +		{ +			LLVector4a zzzz = _mm_load_ss( rhs.mColumns[i].getF32ptr() +  2 ); +			zzzz.splat<0>( zzzz ); +			zzzz.mul( col2 ); +			xxxx.add( zzzz ); +		} + +		xxxx.store4a( mColumns[i].getF32ptr() ); +	} +	 +} + +/*static */void LLMatrix3a::batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst ) +{ +	const LLVector4a col0 = xform.getColumn(0); +	const LLVector4a col1 = xform.getColumn(1); +	const LLVector4a col2 = xform.getColumn(2); +	const LLVector4a* maxAddr = src + numVectors; + +	if ( numVectors & 0x1 ) +	{ +		LLVector4a xxxx = _mm_load_ss( (const F32*)src ); +		LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 ); +		LLVector4a zzzz = _mm_load_ss( (const F32*)src + 2 ); +		xxxx.splat<0>( xxxx ); +		yyyy.splat<0>( yyyy ); +		zzzz.splat<0>( zzzz ); +		xxxx.mul( col0 ); +		yyyy.mul( col1 );  +		zzzz.mul( col2 ); +		xxxx.add( yyyy ); +		xxxx.add( zzzz ); +		xxxx.store4a( (F32*)dst ); +		src++; +		dst++; +	} + + +	numVectors >>= 1; +	while ( src < maxAddr ) +	{ +		_mm_prefetch( (const char*)(src + 32 ), _MM_HINT_NTA ); +		_mm_prefetch( (const char*)(dst + 32), _MM_HINT_NTA ); +		LLVector4a xxxx = 
_mm_load_ss( (const F32*)src ); +		LLVector4a xxxx1= _mm_load_ss( (const F32*)(src + 1) ); + +		xxxx.splat<0>( xxxx ); +		xxxx1.splat<0>( xxxx1 ); +		xxxx.mul( col0 ); +		xxxx1.mul( col0 ); + +		{ +			LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 ); +			LLVector4a yyyy1 = _mm_load_ss( (const F32*)(src + 1) + 1); +			yyyy.splat<0>( yyyy ); +			yyyy1.splat<0>( yyyy1 ); +			yyyy.mul( col1 ); +			yyyy1.mul( col1 ); +			xxxx.add( yyyy ); +			xxxx1.add( yyyy1 ); +		} + +		{ +			LLVector4a zzzz = _mm_load_ss( (const F32*)(src) + 2 ); +			LLVector4a zzzz1 = _mm_load_ss( (const F32*)(++src) + 2 ); +			zzzz.splat<0>( zzzz ); +			zzzz1.splat<0>( zzzz1 ); +			zzzz.mul( col2 ); +			zzzz1.mul( col2 ); +			xxxx.add( zzzz ); +			xxxx1.add( zzzz1 ); +		} + +		xxxx.store4a(dst->getF32ptr()); +		src++; +		dst++; + +		xxxx1.store4a((F32*)dst++); +	} +} diff --git a/indra/llmath/llmatrix3a.h b/indra/llmath/llmatrix3a.h new file mode 100644 index 0000000000..56327f9f6d --- /dev/null +++ b/indra/llmath/llmatrix3a.h @@ -0,0 +1,134 @@ +/**  + * @file llmatrix3a.h + * @brief LLMatrix3a class header file - memory aligned and vectorized 3x3 matrix + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef	LL_LLMATRIX3A_H +#define	LL_LLMATRIX3A_H + +///////////////////////////// +// LLMatrix3a, LLRotation +///////////////////////////// +// This class stores a 3x3 (technically 4x3) matrix in column-major order +///////////////////////////// +///////////////////////////// +// These classes are intentionally minimal right now. If you need additional +// functionality, please contact someone with SSE experience (e.g., Falcon or +// Huseby). +///////////////////////////// + +// LLMatrix3a is the base class for LLRotation, which should be used instead any time you're dealing with a  +// rotation matrix. 
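Editorial note (not part of the commit): setMul() and batchTransform() above compute out = x*col0 + y*col1 + z*col2, broadcasting each input component across a register with splat<0>() so the combination reduces to three mulps/addps pairs per vector. A scalar reference of the same column-major transform, for comparison (illustrative only):

// Scalar equivalent of the SSE column transform above.
void transform_scalar(const float col[3][4], const float v[3], float out[4])
{
	for (int i = 0; i < 4; ++i)
	{
		out[i] = v[0] * col[0][i] + v[1] * col[1][i] + v[2] * col[2][i];
	}
}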
+class LLMatrix3a +{ +public: + +	// Utility function for quickly transforming an array of LLVector4a's +	// For transforming a single LLVector4a, see LLVector4a::setRotated +	static void batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst ); + +	// Utility function to obtain the identity matrix +	static inline const LLMatrix3a& getIdentity(); + +	////////////////////////// +	// Ctors +	////////////////////////// +	 +	// Ctor +	LLMatrix3a() {} + +	// Ctor for setting by columns +	inline LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 ); + +	////////////////////////// +	// Get/Set +	////////////////////////// + +	// Loads from an LLMatrix3 +	inline void loadu(const LLMatrix3& src); +	 +	// Set rows +	inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2); +	 +	// Set columns +	inline void setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2); + +	// Get read-only access to a specified column. Valid columns are 0-2, but the  +	// function is unchecked. You've been warned. +	inline const LLVector4a& getColumn(const U32 column) const; + +	///////////////////////// +	// Matrix modification +	///////////////////////// +	 +	// Set this matrix to the product of lhs and rhs ( this = lhs * rhs ) +	void setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs ); + +	// Set this matrix to the transpose of src +	inline void setTranspose(const LLMatrix3a& src); + +	// Set this matrix to a*w + b*(1-w) +	inline void setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w); + +	///////////////////////// +	// Matrix inspection +	///////////////////////// + +	// Sets all 4 elements in 'dest' to the determinant of this matrix. +	// If you will be using the determinant in subsequent ops with LLVector4a, use this version +	inline void getDeterminant( LLVector4a& dest ) const; + +	// Returns the determinant as an LLSimdScalar. Use this if you will be using the determinant +	// primarily for scalar operations. +	inline LLSimdScalar getDeterminant() const; + +	// Returns nonzero if rows 0-2 and columns 0-2 contain no NaN or INF values. Row 3 is ignored +	inline LLBool32 isFinite() const; + +	// Returns true if this matrix is equal to 'rhs' up to 'tolerance' +	inline bool isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + +protected: + +	LLVector4a mColumns[3]; + +}; + +class LLRotation : public LLMatrix3a +{ +public: +	 +	LLRotation() {} +	 +	// Returns true if this rotation is orthonormal with det ~= 1 +	inline bool isOkRotation() const;		 +}; + +#endif diff --git a/indra/llmath/llmatrix3a.inl b/indra/llmath/llmatrix3a.inl new file mode 100644 index 0000000000..65fd949f78 --- /dev/null +++ b/indra/llmath/llmatrix3a.inl @@ -0,0 +1,125 @@ +/**  + * @file llmatrix3a.inl + * @brief LLMatrix3a inline definitions + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  
Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmatrix3a.h" +#include "m3math.h" + +inline LLMatrix3a::LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 ) +{ +	setColumns( c0, c1, c2 ); +} + +inline void LLMatrix3a::loadu(const LLMatrix3& src) +{ +	mColumns[0].load3(src.mMatrix[0]); +	mColumns[1].load3(src.mMatrix[1]); +	mColumns[2].load3(src.mMatrix[2]); +} + +inline void LLMatrix3a::setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2) +{ +	mColumns[0] = r0; +	mColumns[1] = r1; +	mColumns[2] = r2; +	setTranspose( *this ); +} + +inline void LLMatrix3a::setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2) +{ +	mColumns[0] = c0; +	mColumns[1] = c1; +	mColumns[2] = c2; +} + +inline void LLMatrix3a::setTranspose(const LLMatrix3a& src) +{ +	const LLQuad srcCol0 = src.mColumns[0]; +	const LLQuad srcCol1 = src.mColumns[1]; +	const LLQuad unpacklo = _mm_unpacklo_ps( srcCol0, srcCol1 ); +	mColumns[0] = _mm_movelh_ps( unpacklo, src.mColumns[2] ); +	mColumns[1] = _mm_shuffle_ps( _mm_movehl_ps( srcCol0, unpacklo ), src.mColumns[2], _MM_SHUFFLE(0, 1, 1, 0) ); +	mColumns[2] = _mm_shuffle_ps( _mm_unpackhi_ps( srcCol0, srcCol1 ), src.mColumns[2], _MM_SHUFFLE(0, 2, 1, 0) ); +} + +inline const LLVector4a& LLMatrix3a::getColumn(const U32 column) const +{ +	llassert( column < 3 ); +	return mColumns[column]; +} + +inline void LLMatrix3a::setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w) +{ +	mColumns[0].setLerp( a.mColumns[0], b.mColumns[0], w ); +	mColumns[1].setLerp( a.mColumns[1], b.mColumns[1], w ); +	mColumns[2].setLerp( a.mColumns[2], b.mColumns[2], w ); +} + +inline LLBool32 LLMatrix3a::isFinite() const +{ +	return mColumns[0].isFinite3() && mColumns[1].isFinite3() && mColumns[2].isFinite3(); +} + +inline void LLMatrix3a::getDeterminant( LLVector4a& dest ) const +{ +	LLVector4a col1xcol2; col1xcol2.setCross3( mColumns[1], mColumns[2] ); +	dest.setAllDot3( col1xcol2, mColumns[0] ); +} + +inline LLSimdScalar LLMatrix3a::getDeterminant() const +{ +	LLVector4a col1xcol2; col1xcol2.setCross3( mColumns[1], mColumns[2] ); +	return col1xcol2.dot3( mColumns[0] ); +} + +inline bool LLMatrix3a::isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance /*= F_APPROXIMATELY_ZERO*/ ) const +{ +	return rhs.getColumn(0).equals3(mColumns[0], tolerance)  +		&& rhs.getColumn(1).equals3(mColumns[1], tolerance)  +		&& rhs.getColumn(2).equals3(mColumns[2], tolerance);  +} + +inline const LLMatrix3a& LLMatrix3a::getIdentity() +{ +	extern const LLMatrix3a LL_M3A_IDENTITY; +	return LL_M3A_IDENTITY; +} + +inline bool LLRotation::isOkRotation() const +{ +	LLMatrix3a transpose; transpose.setTranspose( *this ); +	LLMatrix3a 
product; product.setMul( *this, transpose ); + +	LLSimdScalar detMinusOne = getDeterminant() - 1.f; + +	return product.isApproximatelyEqual( LLMatrix3a::getIdentity() ) && (detMinusOne.getAbs() < F_APPROXIMATELY_ZERO); +} + diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h new file mode 100644 index 0000000000..0ead045d04 --- /dev/null +++ b/indra/llmath/llmatrix4a.h @@ -0,0 +1,149 @@ +/**  + * @file llmatrix4a.h + * @brief LLMatrix4a class header file - memory aligned and vectorized 4x4 matrix + * + * $LicenseInfo:firstyear=2007&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. 
+ * $/LicenseInfo$ + */ + +#ifndef	LL_LLMATRIX4A_H +#define	LL_LLMATRIX4A_H + +#include "llvector4a.h" +#include "m4math.h" +#include "m3math.h" + +class LLMatrix4a +{ +public: +	LLVector4a mMatrix[4]; + +	inline void clear() +	{ +		mMatrix[0].clear(); +		mMatrix[1].clear(); +		mMatrix[2].clear(); +		mMatrix[3].clear(); +	} + +	inline void loadu(const LLMatrix4& src) +	{ +		mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]); +		mMatrix[1] = _mm_loadu_ps(src.mMatrix[1]); +		mMatrix[2] = _mm_loadu_ps(src.mMatrix[2]); +		mMatrix[3] = _mm_loadu_ps(src.mMatrix[3]); +		 +	} + +	inline void loadu(const LLMatrix3& src) +	{ +		mMatrix[0].load3(src.mMatrix[0]); +		mMatrix[1].load3(src.mMatrix[1]); +		mMatrix[2].load3(src.mMatrix[2]); +		mMatrix[3].set(0,0,0,1.f); +	} + +	inline void add(const LLMatrix4a& rhs) +	{ +		mMatrix[0].add(rhs.mMatrix[0]); +		mMatrix[1].add(rhs.mMatrix[1]); +		mMatrix[2].add(rhs.mMatrix[2]); +		mMatrix[3].add(rhs.mMatrix[3]); +	} + +	inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2) +	{ +		mMatrix[0] = r0; +		mMatrix[1] = r1; +		mMatrix[2] = r2; +	} + +	inline void setMul(const LLMatrix4a& m, const F32 s) +	{ +		mMatrix[0].setMul(m.mMatrix[0], s); +		mMatrix[1].setMul(m.mMatrix[1], s); +		mMatrix[2].setMul(m.mMatrix[2], s); +		mMatrix[3].setMul(m.mMatrix[3], s); +	} + +	inline void setLerp(const LLMatrix4a& a, const LLMatrix4a& b, F32 w) +	{ +		LLVector4a d0,d1,d2,d3; +		d0.setSub(b.mMatrix[0], a.mMatrix[0]); +		d1.setSub(b.mMatrix[1], a.mMatrix[1]); +		d2.setSub(b.mMatrix[2], a.mMatrix[2]); +		d3.setSub(b.mMatrix[3], a.mMatrix[3]); + +		// this = a + d*w +		 +		d0.mul(w); +		d1.mul(w); +		d2.mul(w); +		d3.mul(w); + +		mMatrix[0].setAdd(a.mMatrix[0],d0); +		mMatrix[1].setAdd(a.mMatrix[1],d1); +		mMatrix[2].setAdd(a.mMatrix[2],d2); +		mMatrix[3].setAdd(a.mMatrix[3],d3); +	} + +	inline void rotate(const LLVector4a& v, LLVector4a& res) +	{ +		res = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); +		res.mul(mMatrix[0]); +		 +		LLVector4a y; +		y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); +		y.mul(mMatrix[1]); + +		LLVector4a z; +		z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); +		z.mul(mMatrix[2]); + +		res.add(y); +		res.add(z); +	} + +	inline void affineTransform(const LLVector4a& v, LLVector4a& res) +	{ +		LLVector4a x,y,z; + +		x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); +		y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); +		z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); +		 +		x.mul(mMatrix[0]); +		y.mul(mMatrix[1]); +		z.mul(mMatrix[2]); + +		x.add(y); +		z.add(mMatrix[3]); +		res.setAdd(x,z); +	} +}; + +#endif diff --git a/indra/llmath/llquaternion2.h b/indra/llmath/llquaternion2.h new file mode 100644 index 0000000000..dbb4afe312 --- /dev/null +++ b/indra/llmath/llquaternion2.h @@ -0,0 +1,111 @@ +/**  + * @file llquaternion2.h + * @brief LLQuaternion2 class header file - SIMD-enabled quaternion class + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  
Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef	LL_QUATERNION2_H +#define	LL_QUATERNION2_H + +///////////////////////////// +// LLQuaternion2 +///////////////////////////// +// This class stores a quaternion x*i + y*j + z*k + w in <x, y, z, w> order +// (i.e., w in high order element of vector) +///////////////////////////// +///////////////////////////// +// These classes are intentionally minimal right now. If you need additional +// functionality, please contact someone with SSE experience (e.g., Falcon or +// Huseby). +///////////////////////////// +#include "llquaternion.h" + +class LLQuaternion2 +{ +public: + +	////////////////////////// +	// Ctors +	////////////////////////// +	 +	// Ctor +	LLQuaternion2() {} + +	// Ctor from LLQuaternion +	explicit LLQuaternion2( const class LLQuaternion& quat ); + +	////////////////////////// +	// Get/Set +	////////////////////////// + +	// Load from an LLQuaternion +	inline void operator=( const LLQuaternion& quat ) +	{ +		mQ.loadua( quat.mQ ); +	} + +	// Return the internal LLVector4a representation of the quaternion +	inline const LLVector4a& getVector4a() const; +	inline LLVector4a& getVector4aRw(); + +	///////////////////////// +	// Quaternion modification +	///////////////////////// +	 +	// Set this quaternion to the conjugate of src +	inline void setConjugate(const LLQuaternion2& src); + +	// Renormalizes the quaternion. Assumes it has nonzero length. +	inline void normalize(); + +	// Quantize this quaternion to 8 bit precision +	inline void quantize8(); + +	// Quantize this quaternion to 16 bit precision +	inline void quantize16(); + +	///////////////////////// +	// Quaternion inspection +	///////////////////////// + +	// Return true if this quaternion is equal to 'rhs'.  +	// Note! Quaternions exhibit "double-cover", so any rotation has two equally valid +	// quaternion representations and they will NOT compare equal. +	inline bool equals(const LLQuaternion2& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + +	// Return true if all components are finite and the quaternion is normalized +	inline bool isOkRotation() const; + +protected: + +	LLVector4a mQ; + +}; + +#endif diff --git a/indra/llmath/llquaternion2.inl b/indra/llmath/llquaternion2.inl new file mode 100644 index 0000000000..9a4274d6a4 --- /dev/null +++ b/indra/llmath/llquaternion2.inl @@ -0,0 +1,108 @@ +/**  + * @file llquaternion2.inl + * @brief LLQuaternion2 inline definitions + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2010, Linden Research, Inc. 
+ *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llquaternion2.h" + +static const LLQuad LL_V4A_PLUS_ONE = {1.f, 1.f, 1.f, 1.f}; +static const LLQuad LL_V4A_MINUS_ONE = {-1.f, -1.f, -1.f, -1.f}; + +// Ctor from LLQuaternion +inline LLQuaternion2::LLQuaternion2( const LLQuaternion& quat ) +{ +	mQ.set(quat.mQ[VX], quat.mQ[VY], quat.mQ[VZ], quat.mQ[VW]); +} + +////////////////////////// +// Get/Set +////////////////////////// + +// Return the internal LLVector4a representation of the quaternion +inline const LLVector4a& LLQuaternion2::getVector4a() const +{ +	return mQ; +} + +inline LLVector4a& LLQuaternion2::getVector4aRw() +{ +	return mQ; +} + +///////////////////////// +// Quaternion modification +///////////////////////// + +// Set this quaternion to the conjugate of src +inline void LLQuaternion2::setConjugate(const LLQuaternion2& src) +{ +	static LL_ALIGN_16( const U32 F_QUAT_INV_MASK_4A[4] ) = { 0x80000000, 0x80000000, 0x80000000, 0x00000000 }; +	mQ = _mm_xor_ps(src.mQ, *reinterpret_cast<const LLQuad*>(&F_QUAT_INV_MASK_4A));	 +} + +// Renormalizes the quaternion. Assumes it has nonzero length. +inline void LLQuaternion2::normalize() +{ +	mQ.normalize4(); +} + +// Quantize this quaternion to 8 bit precision +inline void LLQuaternion2::quantize8() +{ +	mQ.quantize8( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE ); +	normalize(); +} + +// Quantize this quaternion to 16 bit precision +inline void LLQuaternion2::quantize16() +{ +	mQ.quantize16( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE ); +	normalize(); +} + + +///////////////////////// +// Quaternion inspection +///////////////////////// + +// Return true if this quaternion is equal to 'rhs'.  +// Note! Quaternions exhibit "double-cover", so any rotation has two equally valid +// quaternion representations and they will NOT compare equal. 
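Editorial note (not part of the commit): an illustration of the double-cover caveat documented above, assuming LLQuaternion's (x, y, z, w) constructor. q and -q encode the same rotation, but equals() compares raw components, so they do not compare equal:

LLQuaternion2 q( LLQuaternion(0.f, 0.f, 0.f, 1.f) ); // identity rotation
LLQuaternion2 negQ;
negQ.getVector4aRw().set(0.f, 0.f, 0.f, -1.f);       // same rotation, negated components
bool same = q.equals(negQ);                          // false, despite identical rotations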
+inline bool LLQuaternion2::equals(const LLQuaternion2 &rhs, F32 tolerance/* = F_APPROXIMATELY_ZERO*/) const +{ +	return mQ.equals4(rhs.mQ, tolerance); +} + +// Return true if all components are finite and the quaternion is normalized +inline bool LLQuaternion2::isOkRotation() const +{ +	return mQ.isFinite4() && mQ.isNormalized4(); +} + diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h new file mode 100644 index 0000000000..9377bfdb53 --- /dev/null +++ b/indra/llmath/llsimdmath.h @@ -0,0 +1,95 @@ +/**  + * @file llsimdmath.h + * @brief Common header for SIMD-based math library (llvector4a, llmatrix3a, etc.) + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef	LL_SIMD_MATH_H +#define	LL_SIMD_MATH_H + +#ifndef LLMATH_H +#error "Please include llmath.h before this file." +#endif + +#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 ) ) +#error SSE2 not enabled. LLVector4a and related class will not compile. 
+#endif + +template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)  +{  +	return reinterpret_cast<T*>( +		(reinterpret_cast<U32>(address) + 0xF) & ~0xF); +} + +template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)  +{  +	return reinterpret_cast<T*>( +		(reinterpret_cast<U32>(address) + 0x3F) & ~0x3F); +} + +#if LL_LINUX || LL_DARWIN + +#define			LL_ALIGN_PREFIX(x) +#define			LL_ALIGN_POSTFIX(x)		__attribute__((aligned(x))) + +#elif LL_WINDOWS + +#define			LL_ALIGN_PREFIX(x)		__declspec(align(x)) +#define			LL_ALIGN_POSTFIX(x) + +#else +#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined" +#endif + +#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16) + + + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "llsimdtypes.h" +#include "llsimdtypes.inl" + +class LLMatrix3a; +class LLRotation; +class LLMatrix3; + +#include "llquaternion.h" + +#include "llvector4logical.h" +#include "llvector4a.h" +#include "llmatrix3a.h" +#include "llquaternion2.h" +#include "llvector4a.inl" +#include "llmatrix3a.inl" +#include "llquaternion2.inl" + + +#endif //LL_SIMD_MATH_H diff --git a/indra/llmath/llsimdtypes.h b/indra/llmath/llsimdtypes.h new file mode 100644 index 0000000000..82e318c8bf --- /dev/null +++ b/indra/llmath/llsimdtypes.h @@ -0,0 +1,130 @@ +/**  + * @file llsimdtypes.h + * @brief Declaration of basic SIMD math related types + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_SIMD_TYPES_H +#define LL_SIMD_TYPES_H + +#ifndef LL_SIMD_MATH_H +#error "Please include llmath.h before this file." +#endif + +typedef __m128	LLQuad; + + +#if LL_WINDOWS +#pragma warning(push) +#pragma warning( disable : 4800 3 ) // Disable warning about casting int to bool for this class. +#if defined(_MSC_VER) && (_MSC_VER < 1500) +// VC++ 2005 is missing these intrinsics +// __forceinline is MSVC specific and attempts to override compiler inlining judgment. This is so +// even in debug builds this call is a NOP. 
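Editorial note (not part of the commit): the two casts defined next only relabel the 128-bit register between integer and float views; they compile to no instructions. The sign- and abs-mask tricks used throughout this commit rely on exactly that, as in this sketch of a four-lane fabs (the function name is illustrative):

static inline __m128 abs_ps(__m128 v)
{
	const __m128i maskInt = _mm_set1_epi32(0x7FFFFFFF); // clear-sign-bit mask as integers
	return _mm_and_ps(v, _mm_castsi128_ps(maskInt));    // same bits viewed as floats; the cast is free
}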
+__forceinline const __m128 _mm_castsi128_ps( const __m128i a ) { return reinterpret_cast<const __m128&>(a); } +__forceinline const __m128i _mm_castps_si128( const __m128 a ) { return reinterpret_cast<const __m128i&>(a); } +#endif // _MSC_VER + +#endif // LL_WINDOWS + +class LLBool32 +{ +public: +	inline LLBool32() {} +	inline LLBool32(int rhs) : m_bool(rhs) {} +	inline LLBool32(unsigned int rhs) : m_bool(rhs) {} +	inline LLBool32(bool rhs) { m_bool = static_cast<const int>(rhs); } +	inline LLBool32& operator= (bool rhs) { m_bool = (int)rhs; return *this; } +	inline bool operator== (bool rhs) const { return static_cast<const bool&>(m_bool) == rhs; } +	inline bool operator!= (bool rhs) const { return !operator==(rhs); } +	inline operator bool() const { return static_cast<const bool&>(m_bool); } + +private: +	int m_bool; +}; + +#if LL_WINDOWS +#pragma warning(pop) +#endif + +class LLSimdScalar +{ +public: +	inline LLSimdScalar() {} +	inline LLSimdScalar(LLQuad q)  +	{  +		mQ = q;  +	} + +	inline LLSimdScalar(F32 f)  +	{  +		mQ = _mm_set_ss(f);  +	} + +	static inline const LLSimdScalar& getZero() +	{ +		extern const LLQuad F_ZERO_4A; +		return reinterpret_cast<const LLSimdScalar&>(F_ZERO_4A); +	} + +	inline F32 getF32() const; + +	inline LLBool32 isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance = F_APPROXIMATELY_ZERO) const; + +	inline LLSimdScalar getAbs() const; + +	inline void setMax( const LLSimdScalar& a, const LLSimdScalar& b ); +	 +	inline void setMin( const LLSimdScalar& a, const LLSimdScalar& b ); + +	inline LLSimdScalar& operator=(F32 rhs); + +	inline LLSimdScalar& operator+=(const LLSimdScalar& rhs); + +	inline LLSimdScalar& operator-=(const LLSimdScalar& rhs); + +	inline LLSimdScalar& operator*=(const LLSimdScalar& rhs); + +	inline LLSimdScalar& operator/=(const LLSimdScalar& rhs); + +	inline operator LLQuad() const +	{  +		return mQ;  +	} +	 +	inline const LLQuad& getQuad() const  +	{  +		return mQ;  +	} + +private: +	LLQuad mQ; +}; + +#endif //LL_SIMD_TYPES_H diff --git a/indra/llmath/llsimdtypes.inl b/indra/llmath/llsimdtypes.inl new file mode 100644 index 0000000000..69c858e310 --- /dev/null +++ b/indra/llmath/llsimdtypes.inl @@ -0,0 +1,163 @@ +/**  + * @file llsimdtypes.inl + * @brief Inlined definitions of basic SIMD math related types + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." 
LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + + + + +////////////////// +// LLSimdScalar +////////////////// + +inline LLSimdScalar operator+(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	LLSimdScalar t(a); +	t += b; +	return t; +} + +inline LLSimdScalar operator-(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	LLSimdScalar t(a); +	t -= b; +	return t; +} + +inline LLSimdScalar operator*(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	LLSimdScalar t(a); +	t *= b; +	return t; +} + +inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	LLSimdScalar t(a); +	t /= b; +	return t; +} + +inline LLSimdScalar operator-(const LLSimdScalar& a) +{ +	static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +	return _mm_xor_ps(*reinterpret_cast<const LLQuad*>(signMask), a); +} + +inline LLBool32 operator==(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	return _mm_comieq_ss(a, b); +} + +inline LLBool32 operator!=(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	return _mm_comineq_ss(a, b); +} + +inline LLBool32 operator<(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	return _mm_comilt_ss(a, b); +} + +inline LLBool32 operator<=(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	return _mm_comile_ss(a, b); +} + +inline LLBool32 operator>(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	return _mm_comigt_ss(a, b); +} + +inline LLBool32 operator>=(const LLSimdScalar& a, const LLSimdScalar& b) +{ +	return _mm_comige_ss(a, b); +} + +inline LLBool32 LLSimdScalar::isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance /* = F_APPROXIMATELY_ZERO */) const +{ +	const LLSimdScalar tol( tolerance ); +	const LLSimdScalar diff = _mm_sub_ss( mQ, rhs.mQ ); +	const LLSimdScalar absDiff = diff.getAbs(); +	return absDiff <= tol; +} + +inline void LLSimdScalar::setMax( const LLSimdScalar& a, const LLSimdScalar& b ) +{ +	mQ = _mm_max_ss( a, b ); +} + +inline void LLSimdScalar::setMin( const LLSimdScalar& a, const LLSimdScalar& b ) +{ +	mQ = _mm_min_ss( a, b ); +} + +inline LLSimdScalar& LLSimdScalar::operator=(F32 rhs)  +{  +	mQ = _mm_set_ss(rhs);  +	return *this;  +} + +inline LLSimdScalar& LLSimdScalar::operator+=(const LLSimdScalar& rhs)  +{ +	mQ = _mm_add_ss( mQ, rhs ); +	return *this; +} + +inline LLSimdScalar& LLSimdScalar::operator-=(const LLSimdScalar& rhs) +{ +	mQ = _mm_sub_ss( mQ, rhs ); +	return *this; +} + +inline LLSimdScalar& LLSimdScalar::operator*=(const LLSimdScalar& rhs) +{ +	mQ = _mm_mul_ss( mQ, rhs ); +	return *this; +} + +inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs) +{ +	mQ = _mm_div_ss( mQ, rhs ); +	return *this; +} + +inline LLSimdScalar LLSimdScalar::getAbs() const +{ +	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; +	return _mm_and_ps( mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A)); +} + +inline F32 LLSimdScalar::getF32() const +{  +	F32 ret;  +	_mm_store_ss(&ret, mQ);  +	return ret;  +} diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp new file mode 100644 index 0000000000..b62c17302f --- /dev/null +++ b/indra/llmath/llvector4a.cpp @@ -0,0 +1,228 @@ +/**  + * @file llvector4a.cpp + * @brief SIMD vector implementation + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. 
+ *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmath.h" +#include "llquantize.h" + +extern const LLQuad F_ZERO_4A		= { 0, 0, 0, 0 }; +extern const LLQuad F_APPROXIMATELY_ZERO_4A = {  +	F_APPROXIMATELY_ZERO, +	F_APPROXIMATELY_ZERO, +	F_APPROXIMATELY_ZERO, +	F_APPROXIMATELY_ZERO +}; + +extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A ); +extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A ); + +/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes) +{ +	assert(src != NULL); +	assert(dst != NULL); +	assert(bytes > 0); +	assert((bytes % sizeof(F32))== 0);  +	 +	F32* end = dst + (bytes / sizeof(F32) ); + +	if (bytes > 64) +	{ +		F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst); +		 +		//at least 64 (16*4) bytes before the end of the destination, switch to 16 byte copies +		F32* end_64 = end-16; +		 +		_mm_prefetch((char*)begin_64, _MM_HINT_NTA); +		_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA); +		_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA); +		_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA); +		 +		while (dst < begin_64) +		{ +			copy4a(dst, src); +			dst += 4; +			src += 4; +		} +		 +		while (dst < end_64) +		{ +			_mm_prefetch((char*)src + 512, _MM_HINT_NTA); +			_mm_prefetch((char*)dst + 512, _MM_HINT_NTA); +			copy4a(dst, src); +			copy4a(dst+4, src+4); +			copy4a(dst+8, src+8); +			copy4a(dst+12, src+12); +			 +			dst += 16; +			src += 16; +		} +	} + +	while (dst < end) +	{ +		copy4a(dst, src); +		dst += 4; +		src += 4; +	} +} + +void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec ) +{ +	const LLVector4a col0 = rot.getColumn(0); +	const LLVector4a col1 = rot.getColumn(1); +	const LLVector4a col2 = rot.getColumn(2); + +	LLVector4a result = _mm_load_ss( vec.getF32ptr() ); +	result.splat<0>( result ); +	result.mul( col0 ); + +	{ +		LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() +  1 ); +		yyyy.splat<0>( yyyy ); +		yyyy.mul( col1 );  +		result.add( yyyy ); +	} + +	{ +		LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() +  2 ); +		zzzz.splat<0>( zzzz ); +		zzzz.mul( col2 ); +		result.add( zzzz ); +	} + +	*this = result; +} + +void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec ) +{ +	const LLVector4a& quatVec = quat.getVector4a(); +	LLVector4a temp; 
temp.setCross3(quatVec, vec); +	temp.add( temp ); +	 +	const LLVector4a realPart( quatVec.getScalarAt<3>() ); +	LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart ); + +	mQ = vec; +	add( tempTimesReal ); +	 +	LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp ); +	add(imagCrossTemp); +} + +void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high ) +{ +	LLVector4a val(mQ); +	LLVector4a delta; delta.setSub( high, low ); + +	{ +		val.clamp(low, high); +		val.sub(low); + +		// 8-bit quantization means we can do with just 12 bits of reciprocal accuracy +		const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ); +// 		{ +// 			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; +// 			LLVector4a two; two.load4a( F_TWO_4A ); +//  +// 			// Here we use _mm_rcp_ps plus one round of newton-raphson +// 			// We wish to find 'x' such that x = 1/delta +// 			// As a first approximation, we take x0 = _mm_rcp_ps(delta) +// 			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) +// 			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf +// 			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); +// 			oneOverDelta.setMul( delta, recipApprox ); +// 			oneOverDelta.setSub( two, oneOverDelta ); +// 			oneOverDelta.mul( recipApprox ); +// 		} + +		val.mul(oneOverDelta); +		val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A)); +	} + +	val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + +	{ +		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); +		val.mul(delta); +		val.add(low); +	} + +	{ +		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); +		LLVector4a absVal; absVal.setAbs( val ); +		setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); +	}	 +} + +void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high ) +{ +	LLVector4a val(mQ); +	LLVector4a delta; delta.setSub( high, low ); + +	{ +		val.clamp(low, high); +		val.sub(low); + +		// 16-bit quantization means we need a round of Newton-Raphson +		LLVector4a oneOverDelta; +		{ +			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; +			LLVector4a two; two.load4a( F_TWO_4A ); + +			// Here we use _mm_rcp_ps plus one round of newton-raphson +			// We wish to find 'x' such that x = 1/delta +			// As a first approximation, we take x0 = _mm_rcp_ps(delta) +			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) +			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf +			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); +			oneOverDelta.setMul( delta, recipApprox ); +			oneOverDelta.setSub( two, oneOverDelta ); +			oneOverDelta.mul( recipApprox ); +		} + +		val.mul(oneOverDelta); +		val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A)); +	} + +	val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + +	{ +		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); +		val.mul(delta); +		val.add(low); +	} + +	{ +		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); +		LLVector4a absVal; absVal.setAbs( val ); +		setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); +	}	 +} diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h new file mode 100644 index 0000000000..76a3e999ce --- /dev/null +++ b/indra/llmath/llvector4a.h @@ -0,0 +1,331 @@ +/**  + * @file llvector4a.h + * @brief LLVector4a class header file - memory aligned and vectorized 4 component vector + * + * 
$LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef	LL_LLVECTOR4A_H +#define	LL_LLVECTOR4A_H + + +class LLRotation; + +#include <assert.h> +#include "llpreprocessor.h" + +/////////////////////////////////// +// FIRST TIME USERS PLEASE READ +////////////////////////////////// +// This is just the beginning of LLVector4a. There are many more useful functions +// yet to be implemented. For example, setNeg to negate a vector, rotate() to apply +// a matrix rotation, various functions to manipulate only the X, Y, and Z elements +// and many others (including a whole variety of accessors). So if you don't see a  +// function here that you need, please contact Falcon or someone else with SSE  +// experience (Richard, I think, has some and davep has a little as of the time  +// of this writing, July 08, 2010) about getting it implemented before you resort to +// LLVector3/LLVector4.  +///////////////////////////////// + +class LLVector4a +{ +public: + +	/////////////////////////////////// +	// STATIC METHODS +	/////////////////////////////////// +	 +	// Call initClass() at startup to avoid 15,000+ cycle penalties from denormalized numbers +	static void initClass() +	{ +		_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); +		_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); +	} + +	// Return a vector of all zeros +	static inline const LLVector4a& getZero() +	{ +		extern const LLVector4a LL_V4A_ZERO; +		return LL_V4A_ZERO; +	} + +	// Return a vector of all epsilon, where epsilon is a small float suitable for approximate equality checks +	static inline const LLVector4a& getEpsilon() +	{ +		extern const LLVector4a LL_V4A_EPSILON; +		return LL_V4A_EPSILON; +	} + +	// Copy 16 bytes from src to dst. Source and destination must be 16-byte aligned +	static inline void copy4a(F32* dst, const F32* src) +	{ +		_mm_store_ps(dst, _mm_load_ps(src)); +	} + +	// Copy 16-byte blocks from src to dst. Source and destination must not overlap.  
+	static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes); + +	//////////////////////////////////// +	// CONSTRUCTORS  +	//////////////////////////////////// +	 +	LLVector4a() +	{ //DO NOT INITIALIZE -- The overhead is completely unnecessary +	} +	 +	LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f) +	{ +		set(x,y,z,w); +	} +	 +	LLVector4a(F32 x) +	{ +		splat(x); +	} +	 +	LLVector4a(const LLSimdScalar& x) +	{ +		splat(x); +	} + +	LLVector4a(LLQuad q) +	{ +		mQ = q; +	} + +	//////////////////////////////////// +	// LOAD/STORE +	//////////////////////////////////// +	 +	// Load from 16-byte aligned src array (preferred method of loading) +	inline void load4a(const F32* src); +	 +	// Load from unaligned src array (NB: Significantly slower than load4a) +	inline void loadua(const F32* src); +	 +	// Load only three floats beginning at address 'src'. Slowest method. +	inline void load3(const F32* src); +	 +	// Store to a 16-byte aligned memory address +	inline void store4a(F32* dst) const; +	 +	//////////////////////////////////// +	// BASIC GET/SET  +	//////////////////////////////////// +	 +	// Return a "this" as an F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon) +	inline F32* getF32ptr(); +	 +	// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon) +	inline const F32* const getF32ptr() const; +	 +	// Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates +	// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead +	inline F32 operator[](const S32 idx) const; + +	// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. +	inline LLSimdScalar getScalarAt(const S32 idx) const; + +	// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. +	template <int N> LL_FORCE_INLINE LLSimdScalar getScalarAt() const; +	template <> LL_FORCE_INLINE LLSimdScalar getScalarAt<0>() const; + +	// Set to an x, y, z and optional w provided +	inline void set(F32 x, F32 y, F32 z, F32 w = 0.f); +	 +	// Set to all zeros. 
This is preferred to using ::getZero() +	inline void clear(); +	 +	// Set all elements to 'x' +	inline void splat(const F32 x); + +	// Set all elements to 'x' +	inline void splat(const LLSimdScalar& x); +	 +	// Set all 4 elements to element N of src, with N known at compile time +	template <int N> void splat(const LLVector4a& src); +	 +	// Set all 4 elements to element i of v, with i NOT known at compile time +	inline void splat(const LLVector4a& v, U32 i); +	 +	// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask +	inline void setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse ); +	 +	//////////////////////////////////// +	// ALGEBRAIC +	//////////////////////////////////// +	 +	// Set this to the element-wise (a + b) +	inline void setAdd(const LLVector4a& a, const LLVector4a& b); +	 +	// Set this to element-wise (a - b) +	inline void setSub(const LLVector4a& a, const LLVector4a& b); +	 +	// Set this to element-wise multiply (a * b) +	inline void setMul(const LLVector4a& a, const LLVector4a& b); +	 +	// Set this to element-wise quotient (a / b) +	inline void setDiv(const LLVector4a& a, const LLVector4a& b); +	 +	// Set this to the element-wise absolute value of src +	inline void setAbs(const LLVector4a& src); +	 +	// Add to each component in this vector the corresponding component in rhs +	inline void add(const LLVector4a& rhs); +	 +	// Subtract from each component in this vector the corresponding component in rhs +	inline void sub(const LLVector4a& rhs); +	 +	// Multiply each component in this vector by the corresponding component in rhs +	inline void mul(const LLVector4a& rhs); +	 +	// Divide each component in this vector by the corresponding component in rhs +	inline void div(const LLVector4a& rhs); +	 +	// Multiply this vector by x in a scalar fashion +	inline void mul(const F32 x); + +	// Set this to (a x b) (geometric cross-product) +	inline void setCross3(const LLVector4a& a, const LLVector4a& b); +	 +	// Set all elements to the dot product of the x, y, and z elements in a and b +	inline void setAllDot3(const LLVector4a& a, const LLVector4a& b); + +	// Set all elements to the dot product of the x, y, z, and w elements in a and b +	inline void setAllDot4(const LLVector4a& a, const LLVector4a& b); + +	// Return the 3D dot product of this vector and b +	inline LLSimdScalar dot3(const LLVector4a& b) const; + +	// Return the 4D dot product of this vector and b +	inline LLSimdScalar dot4(const LLVector4a& b) const; + +	// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed +	// Note that this does not consider zero length vectors! +	inline void normalize3(); + +	// Same as normalize3() but with respect to all 4 components +	inline void normalize4(); + +	// Same as normalize3(), but returns length as a SIMD scalar +	inline LLSimdScalar normalize3withLength(); + +	// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed +	// Note that this does not consider zero length vectors! 
+	inline void normalize3fast(); + +	// Return true if this vector is normalized with respect to x,y,z up to tolerance +	inline LLBool32 isNormalized3( F32 tolerance = 1e-3 ) const; + +	// Return true if this vector is normalized with respect to all components up to tolerance +	inline LLBool32 isNormalized4( F32 tolerance = 1e-3 ) const; + +	// Set all elements to the length of vector 'v'  +	inline void setAllLength3( const LLVector4a& v ); + +	// Get this vector's length +	inline LLSimdScalar getLength3() const; +	 +	// Set the components of this vector to the minimum of the corresponding components of lhs and rhs +	inline void setMin(const LLVector4a& lhs, const LLVector4a& rhs); +	 +	// Set the components of this vector to the maximum of the corresponding components of lhs and rhs +	inline void setMax(const LLVector4a& lhs, const LLVector4a& rhs); +	 +	// Clamps this vector to be within the component-wise range low to high (inclusive) +	inline void clamp( const LLVector4a& low, const LLVector4a& high ); + +	// Set this to  (c * lhs) + rhs * ( 1 - c) +	inline void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c); +	 +	// Return true (nonzero) if x, y, z (and w for Finite4) are all finite floats +	inline LLBool32 isFinite3() const;	 +	inline LLBool32 isFinite4() const; + +	// Set this vector to 'vec' rotated by the LLRotation or LLQuaternion2 provided +	void setRotated( const LLRotation& rot, const LLVector4a& vec ); +	void setRotated( const class LLQuaternion2& quat, const LLVector4a& vec ); + +	// Set this vector to 'vec' rotated by the INVERSE of the LLRotation or LLQuaternion2 provided +	inline void setRotatedInv( const LLRotation& rot, const LLVector4a& vec ); +	inline void setRotatedInv( const class LLQuaternion2& quat, const LLVector4a& vec ); + +	// Quantize this vector to 8 or 16 bit precision +	void quantize8( const LLVector4a& low, const LLVector4a& high ); +	void quantize16( const LLVector4a& low, const LLVector4a& high ); + +	//////////////////////////////////// +	// LOGICAL +	////////////////////////////////////	 +	// The functions in this section will compare the elements in this vector +	// to those in rhs and return an LLVector4Logical with all bits set in elements +	// where the comparison was true and all bits unset in elements where the comparison +	// was false. See llvector4logical.h +	//////////////////////////////////// +	// WARNING: Other than equals3 and equals4, these functions do NOT account +	// for floating point tolerance. You should include the appropriate tolerance +	// in the inputs. 
+	//////////////////////////////////// +	 +	inline LLVector4Logical greaterThan(const LLVector4a& rhs) const; + +	inline LLVector4Logical lessThan(const LLVector4a& rhs) const; +	 +	inline LLVector4Logical greaterEqual(const LLVector4a& rhs) const; + +	inline LLVector4Logical lessEqual(const LLVector4a& rhs) const; +	 +	inline LLVector4Logical equal(const LLVector4a& rhs) const; + +	// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance +	inline bool equals4(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + +	inline bool equals3(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + +	//////////////////////////////////// +	// OPERATORS +	////////////////////////////////////	 +	 +	// Do NOT add additional operators without consulting someone with SSE experience +	inline const LLVector4a& operator= ( const LLVector4a& rhs ); +	 +	inline const LLVector4a& operator= ( const LLQuad& rhs ); + +	inline operator LLQuad() const;	 + +private: +	LLQuad mQ; +}; + +inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p) +{ +	min.setMin(min, p); +	max.setMax(max, p); +} + +#endif diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl new file mode 100644 index 0000000000..e52b550883 --- /dev/null +++ b/indra/llmath/llvector4a.inl @@ -0,0 +1,599 @@ +/**  + * @file llvector4a.inl + * @brief LLVector4a inline function implementations + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +//////////////////////////////////// +// LOAD/STORE +//////////////////////////////////// + +// Load from 16-byte aligned src array (preferred method of loading) +inline void LLVector4a::load4a(const F32* src) +{ +	mQ = _mm_load_ps(src); +} + +// Load from unaligned src array (NB: Significantly slower than load4a) +inline void LLVector4a::loadua(const F32* src) +{ +	mQ = _mm_loadu_ps(src); +} + +// Load only three floats beginning at address 'src'. Slowest method. 
+
+// Load only three floats beginning at address 'src'. Slowest method.
+inline void LLVector4a::load3(const F32* src)
+{
+	// mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
+	// NB: This differs from the convention of { Z, Y, X, W }
+	mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
+}	
+
+// Store to a 16-byte aligned memory address
+inline void LLVector4a::store4a(F32* dst) const
+{
+	_mm_store_ps(dst, mQ);
+}
+
+////////////////////////////////////
+// BASIC GET/SET 
+////////////////////////////////////
+
+// Return this vector as an F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+F32* LLVector4a::getF32ptr()
+{
+	return (F32*) &mQ;
+}
+
+// Return this vector as a const F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
+const F32* const LLVector4a::getF32ptr() const
+{
+	return (const F32* const) &mQ;
+}
+
+// Read-only access to a single float in this vector. Do not use in proximity to any function call that manipulates
+// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
+inline F32 LLVector4a::operator[](const S32 idx) const
+{
+	return ((F32*)&mQ)[idx];
+}	
+
+// Prefer this method for read-only access to a single element. Prefer the templated version if the element is known at compile time.
+inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const
+{
+	// Return the appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
+	switch (idx)
+	{
+		case 0:
+			return mQ;
+		case 1:
+			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
+		case 2:
+			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
+		case 3:
+		default:
+			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
+	}
+}
+
+// Prefer this templated method for read-only access to a single element whose index is known at compile time.
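+// Illustrative sketch of the two access paths:
+//   LLSimdScalar y = v.getScalarAt<1>(); // index known at compile time: one shuffle
+//   LLSimdScalar z = v.getScalarAt(i);   // runtime index: switch over shuffles above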
+template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const +{ +	return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N)); +} + +template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const +{ +	return mQ; +} + +// Set to an x, y, z and optional w provided +inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w) +{ +	mQ = _mm_set_ps(w, z, y, x); +} + +// Set to all zeros +inline void LLVector4a::clear() +{ +	mQ = LLVector4a::getZero().mQ; +} + +inline void LLVector4a::splat(const F32 x) +{ +	mQ = _mm_set1_ps(x);	 +} + +inline void LLVector4a::splat(const LLSimdScalar& x) +{ +	mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) ); +} + +// Set all 4 elements to element N of src, with N known at compile time +template <int N> void LLVector4a::splat(const LLVector4a& src) +{ +	mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) ); +} + +// Set all 4 elements to element i of v, with i NOT known at compile time +inline void LLVector4a::splat(const LLVector4a& v, U32 i) +{ +	switch (i) +	{ +		case 0: +			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0)); +			break; +		case 1: +			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1)); +			break; +		case 2: +			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2)); +			break; +		case 3: +			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3)); +			break; +	} +} + +// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask +inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse ) +{ +	// ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse ) +	// E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b +	// (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b,  +	// as expected (01 from sourceIfTrue, 10 from sourceIfFalse) +	// Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/ +	mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) ); +} + +//////////////////////////////////// +// ALGEBRAIC +//////////////////////////////////// + +// Set this to the element-wise (a + b) +inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b) +{ +	mQ = _mm_add_ps(a.mQ, b.mQ); +} + +// Set this to element-wise (a - b) +inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b) +{ +	mQ = _mm_sub_ps(a.mQ, b.mQ); +} + +// Set this to element-wise multiply (a * b) +inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b) +{ +	mQ = _mm_mul_ps(a.mQ, b.mQ); +} + +// Set this to element-wise quotient (a / b) +inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b) +{ +	mQ = _mm_div_ps( a.mQ, b.mQ ); +} + +// Set this to the element-wise absolute value of src +inline void LLVector4a::setAbs(const LLVector4a& src) +{ +	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; +	mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A)); +} + +// Add to each component in this vector the corresponding component in rhs +inline void LLVector4a::add(const LLVector4a& rhs) +{ +	mQ = _mm_add_ps(mQ, rhs.mQ);	 +} + +// Subtract from each component in this vector the corresponding component in rhs +inline void LLVector4a::sub(const LLVector4a& rhs) +{ +	mQ = _mm_sub_ps(mQ, rhs.mQ); +} + +// Multiply each component in this vector by the corresponding component in rhs 
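+// Illustrative sketch of the convention used throughout this file: in-place
+// forms modify 'this', while set* forms write a result without touching the inputs.
+//   a.mul(b);       // a = a * b, component-wise
+//   c.setMul(a, b); // c = a * b; a and b are unchanged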
+inline void LLVector4a::mul(const LLVector4a& rhs)
+{
+	mQ = _mm_mul_ps(mQ, rhs.mQ);	
+}
+
+// Divide each component in this vector by the corresponding component in rhs
+inline void LLVector4a::div(const LLVector4a& rhs)
+{
+	// TODO: Check accuracy, maybe add divFast
+	mQ = _mm_div_ps(mQ, rhs.mQ);
+}
+
+// Multiply each component of this vector by the scalar x
+inline void LLVector4a::mul(const F32 x) 
+{
+	LLVector4a t;
+	t.splat(x);
+	
+	mQ = _mm_mul_ps(mQ, t.mQ);
+}
+
+// Set this to (a x b) (geometric cross-product)
+inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
+{
+	// Vectors are stored in memory in w, z, y, x order from high to low
+	// Set vector1 = { a[W], a[X], a[Z], a[Y] }
+	const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+	// Set vector2 = { b[W], b[Y], b[X], b[Z] }
+	const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+	// mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
+	mQ = _mm_mul_ps( vector1, vector2 );
+	// vector3 = { a[W], a[Y], a[X], a[Z] }
+	const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+	// vector4 = { b[W], b[X], b[Z], b[Y] }
+	const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+	// mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
+	mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
+}
+
+/* This function works, but may be slightly slower than the one below on older machines
+ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
+ {
+ // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+ const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+ // wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
+ const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
+ // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+ const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
+ // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
+ const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
+ // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
+ const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
+ // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+ mQ = _mm_add_ps(zSplat, xPlusYSplat);
+ }*/
+
+// Set all elements to the dot product of the x, y, and z elements in a and b
+inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
+{
+	// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+	const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+	// wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
+	const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
+	// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+	const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
+	// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
+	const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
+	// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
+	const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
+	// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+	mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
+}
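+
+// Illustrative sketch: setAllDot3 broadcasts the x,y,z dot product into every
+// lane, which is exactly the form the normalize*() routines below consume.
+//   LLVector4a lenSqrd;
+//   lenSqrd.setAllDot3(v, v); // every lane now holds x*x + y*y + z*z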
+
+// Set all elements to the dot product of the x, y, z, and w elements in a and b
+inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
+{
+	// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+	const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+	// zwxy = { a[Z]*b[Z], a[W]*b[W], a[X]*b[X], a[Y]*b[Y] }
+	const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
+	// zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+	const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
+	// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
+	const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
+	const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
+
+	// mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+	mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
+}
+
+// Return the 3D dot product of this vector and b
+inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
+{
+	const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
+	const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
+	const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
+	const LLQuad xPlusY = _mm_add_ps( ab, splatY );
+	return _mm_add_ps( xPlusY, splatZ );	
+}
+
+// Return the 4D dot product of this vector and b
+inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
+{
+	// ab = { w, z, y, x }
+	const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
+	// upperProdsInLowerElems = { w, z, w, z }
+	const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
+	// sumOfPairs = { 2w, 2z, w+y, z+x }
+	const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
+	// shuffled = { w+y, w+y, w+y, w+y }
+	const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
+	// low element = (z+x) + (w+y), the full 4D dot product
+	return _mm_add_ss( sumOfPairs, shuffled );
+}
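+
+// Illustrative sketch: dot3/dot4 return an LLSimdScalar with the result in the
+// low lane, so it can feed further scalar SSE work without a round trip through
+// memory. Extracting an F32 via getQuad() (an accessor used earlier in this file):
+//   F32 d;
+//   _mm_store_ss(&d, a.dot3(b).getQuad());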
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize3()
+{
+	// lenSqrd = a dot a
+	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+	// rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+	// Now we do one round of Newton-Raphson approximation to get full accuracy
+	// According to the Newton-Raphson method, given a first approximation 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)),
+	// the next better approximation is w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+	// = 0.5 * w * (3 - a*w^2)
+	// Our first approx is w = rsqrt. We need out = a * w[i+1] (here 'a' is the input vector, not the 'a' from the above formula,
+	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+	mQ = _mm_mul_ps( mQ, nrApprox );
+}
+
+// Normalize this vector with respect to all components. Accurate to 22 bits of precision.
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize4()
+{
+	// lenSqrd = a dot a
+	LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
+	// rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+	// Now we do one round of Newton-Raphson approximation to get full accuracy
+	// According to the Newton-Raphson method, given a first approximation 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)),
+	// the next better approximation is w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+	// = 0.5 * w * (3 - a*w^2)
+	// Our first approx is w = rsqrt. We need out = a * w[i+1] (here 'a' is the input vector, not the 'a' from the above formula,
+	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+	mQ = _mm_mul_ps( mQ, nrApprox );
+}
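+
+// Illustrative sketch of choosing among the normalize variants:
+//   v.normalize3();     // ~22 bits of accuracy; w is destroyed
+//   v.normalize3fast(); // ~10-12 bits of accuracy (defined below); w is destroyed
+//   v.normalize4();     // all four components participate
+// None of these guard against zero-length input; callers must check first.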
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline LLSimdScalar LLVector4a::normalize3withLength()
+{
+	// lenSqrd = a dot a
+	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+	// rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+	// Now we do one round of Newton-Raphson approximation to get full accuracy
+	// According to the Newton-Raphson method, given a first approximation 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)),
+	// the next better approximation is w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+	// = 0.5 * w * (3 - a*w^2)
+	// Our first approx is w = rsqrt. We need out = a * w[i+1] (here 'a' is the input vector, not the 'a' from the above formula,
+	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+	mQ = _mm_mul_ps( mQ, nrApprox );
+	return _mm_sqrt_ss(lenSqrd);
+}
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize3fast()
+{
+	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+	const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+	mQ = _mm_mul_ps( mQ, approxRsqrt );
+}
+
+// Return true if this vector is normalized with respect to x,y,z up to tolerance
+inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
+{
+	static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
+	LLSimdScalar tol = _mm_load_ss( &tolerance );
+	tol = _mm_mul_ss( tol, tol );
+	LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
+	lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
+	lenSquared.setAbs(lenSquared);
+	return _mm_comile_ss( lenSquared, tol );		
+}
+
+// Return true if this vector is normalized with respect to all components up to tolerance
+inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
+{
+	static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
+	LLSimdScalar tol = _mm_load_ss( &tolerance );
+	tol = _mm_mul_ss( tol, tol );
+	LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
+	lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
+	lenSquared.setAbs(lenSquared);
+	return _mm_comile_ss( lenSquared, tol );		
+}
+
+// Set all elements to the length of vector 'v' 
+inline void LLVector4a::setAllLength3( const LLVector4a& v )
+{
+	LLVector4a lenSqrd;
+	lenSqrd.setAllDot3(v, v);
+	
+	mQ = _mm_sqrt_ps(lenSqrd.mQ);
+}
+
+// Get this vector's length
+inline LLSimdScalar LLVector4a::getLength3() const
+{
+	return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) );
+}
+
+// Set the components of this vector to the minimum of the corresponding components of lhs and rhs
+inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs)
+{
+	mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
+}
+
+// Set the components of this vector to the maximum of the corresponding components of lhs and rhs
+inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
+{
+	mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
+}
+
+// Set this to (c * lhs) + ((1 - c) * rhs)
+inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
+{
+	LLVector4a a = lhs;
+	a.mul(c);
+	
+	LLVector4a b = rhs;
+	b.mul(1.f-c);
+	
+	setAdd(a, b);
+}
+
+inline LLBool32 LLVector4a::isFinite3() const
+{
+	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
+	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
+	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
+	return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ );
+}
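+
+// Illustrative sketch: the isFinite tests exploit the IEEE-754 layout --
+// a float is NaN or +/-Inf exactly when its exponent bits are all ones (0x7f800000).
+//   if (!v.isFinite3())
+//   {
+//       // at least one of x, y, z is NaN or infinite
+//   }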
+	
+inline LLBool32 LLVector4a::isFinite4() const
+{
+	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
+	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
+	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
+	return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW );
+}
+
+inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec )
+{
+	LLRotation inv; inv.setTranspose( rot );
+	setRotated( inv, vec );
+}
+
+inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec )
+{
+	LLQuaternion2 invRot; invRot.setConjugate( quat );
+	setRotated(invRot, vec);
+}
+
+inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
+{
+	const LLVector4Logical highMask = greaterThan( high );
+	const LLVector4Logical lowMask = lessThan( low );
+
+	setSelectWithMask( highMask, high, *this );
+	setSelectWithMask( lowMask, low, *this );
+}
+
+
+////////////////////////////////////
+// LOGICAL
+////////////////////////////////////	
+// The functions in this section compare the elements in this vector
+// to those in rhs and return an LLVector4Logical with all bits set in elements
+// where the comparison was true and all bits unset in elements where the comparison
+// was false. See llvector4logical.h
+////////////////////////////////////
+// WARNING: Other than equals3 and equals4, these functions do NOT account
+// for floating point tolerance. You should include the appropriate tolerance
+// in the inputs.
+////////////////////////////////////
+
+inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const
+{	
+	return _mm_cmpgt_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const
+{
+	return _mm_cmplt_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const
+{
+	return _mm_cmpge_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const
+{
+	return _mm_cmple_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const
+{
+	return _mm_cmpeq_ps(mQ, rhs.mQ);
+}
+
+// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
+inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const
+{
+	LLVector4a diff; diff.setSub( *this, rhs );
+	diff.setAbs( diff );
+	const LLQuad tol = _mm_set1_ps( tolerance );
+	const LLQuad cmp = _mm_cmplt_ps( diff, tol );
+	return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
+}
+
+inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
+{
+	LLVector4a diff; diff.setSub( *this, rhs );
+	diff.setAbs( diff );
+	const LLQuad tol = _mm_set1_ps( tolerance );
+	const LLQuad t = _mm_cmplt_ps( diff, tol ); 
+	return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
+}
+
+////////////////////////////////////
+// OPERATORS
+////////////////////////////////////	
+
+// Do NOT add additional operators without consulting someone with SSE experience
+inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
+{
+	mQ = rhs.mQ;
+	return *this;
+}
+
+inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
+{
+	mQ = rhs;
+	return *this;
+}
+
+inline LLVector4a::operator LLQuad() const
+{
+	return mQ;
+}
diff --git a/indra/llmath/llvector4logical.h 
b/indra/llmath/llvector4logical.h new file mode 100644 index 0000000000..1c7ee1d79f --- /dev/null +++ b/indra/llmath/llvector4logical.h @@ -0,0 +1,130 @@ +/**  + * @file llvector4logical.h + * @brief LLVector4Logical class header file - Companion class to LLVector4a for logical and bit-twiddling operations + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + *  + * Copyright (c) 2007-2010, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef	LL_VECTOR4LOGICAL_H +#define	LL_VECTOR4LOGICAL_H + + +//////////////////////////// +// LLVector4Logical +//////////////////////////// +// This class is incomplete. If you need additional functionality, +// for example setting/unsetting particular elements or performing +// other boolean operations, feel free to implement. If you need +// assistance in determining the most optimal implementation, +// contact someone with SSE experience (Falcon, Richard, Davep, e.g.) 
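+//
+// Illustrative usage sketch (variable names are hypothetical):
+//   LLVector4Logical mask = a.greaterThan(b); // a, b are LLVector4a
+//   if (mask.areAnySet(LLVector4Logical::MASK_XYZ)) { /* some x/y/z compared true */ }
+//   U32 bits = mask.getGatheredBits();        // one bit per element, x in bit 0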
+////////////////////////////
+
+static LL_ALIGN_16(const U32 S_V4LOGICAL_MASK_TABLE[4*4]) =
+{
+	0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,
+	0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000,
+	0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000,
+	0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
+};
+
+class LLVector4Logical
+{
+public:
+	
+	enum {
+		MASK_X = 1,
+		MASK_Y = 1 << 1,
+		MASK_Z = 1 << 2,
+		MASK_W = 1 << 3,
+		MASK_XYZ = MASK_X | MASK_Y | MASK_Z,
+		MASK_XYZW = MASK_XYZ | MASK_W
+	};
+	
+	// Empty default ctor
+	LLVector4Logical() {}
+	
+	LLVector4Logical( const LLQuad& quad )
+	{
+		mQ = quad;
+	}
+	
+	// Create and return a mask consisting of the lowest order bit of each element
+	inline U32 getGatheredBits() const
+	{
+		return _mm_movemask_ps(mQ);
+	}
+	
+	// Invert this mask
+	inline LLVector4Logical& invert()
+	{
+		static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+		mQ = _mm_andnot_ps( mQ, *(LLQuad*)(allOnes) );
+		return *this;
+	}
+	
+	inline LLBool32 areAllSet( U32 mask ) const
+	{
+		return ( getGatheredBits() & mask) == mask;
+	}
+	
+	inline LLBool32 areAllSet() const
+	{
+		return areAllSet( MASK_XYZW );
+	}
+		
+	inline LLBool32 areAnySet( U32 mask ) const
+	{
+		return getGatheredBits() & mask;
+	}
+	
+	inline LLBool32 areAnySet() const
+	{
+		return areAnySet( MASK_XYZW );
+	}
+	
+	inline operator LLQuad() const
+	{
+		return mQ;
+	}
+
+	inline void clear() 
+	{
+		mQ = _mm_setzero_ps();
+	}
+
+	template<int N> void setElement()
+	{
+		mQ = _mm_or_ps( mQ, *reinterpret_cast<const LLQuad*>(S_V4LOGICAL_MASK_TABLE + 4*N) );
+	}
+	
+private:
+	
+	LLQuad mQ;
+};
+
+#endif //LL_VECTOR4LOGICAL_H
diff --git a/indra/llmath/llvolumeoctree.cpp b/indra/llmath/llvolumeoctree.cpp
new file mode 100644
index 0000000000..194b1faf81
--- /dev/null
+++ b/indra/llmath/llvolumeoctree.cpp
@@ -0,0 +1,208 @@
+/** 
+ * @file llvolumeoctree.cpp
+ *
+ * $LicenseInfo:firstyear=2002&license=viewergpl$
+ * 
+ * Copyright (c) 2002-2009, Linden Research, Inc.
+ * 
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab.  Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ * 
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ * 
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ * 
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$ + */ + +#include "llvolumeoctree.h" +#include "llvector4a.h" + +BOOL LLLineSegmentBoxIntersect(const LLVector4a& start, const LLVector4a& end, const LLVector4a& center, const LLVector4a& size) +{ +	LLVector4a fAWdU; +	LLVector4a dir; +	LLVector4a diff; + +	dir.setSub(end, start); +	dir.mul(0.5f); + +	diff.setAdd(end,start); +	diff.mul(0.5f); +	diff.sub(center); +	fAWdU.setAbs(dir);  + +	LLVector4a rhs; +	rhs.setAdd(size, fAWdU); + +	LLVector4a lhs; +	lhs.setAbs(diff); + +	U32 grt = lhs.greaterThan(rhs).getGatheredBits(); + +	if (grt & 0x7) +	{ +		return false; +	} +	 +	LLVector4a f; +	f.setCross3(dir, diff); +	f.setAbs(f); + +	LLVector4a v0, v1; + +	v0 = _mm_shuffle_ps(size, size,_MM_SHUFFLE(3,0,0,1)); +	v1 = _mm_shuffle_ps(fAWdU, fAWdU, _MM_SHUFFLE(3,1,2,2)); +	lhs.setMul(v0, v1); + +	v0 = _mm_shuffle_ps(size, size, _MM_SHUFFLE(3,1,2,2)); +	v1 = _mm_shuffle_ps(fAWdU, fAWdU, _MM_SHUFFLE(3,0,0,1)); +	rhs.setMul(v0, v1); +	rhs.add(lhs); +	 +	grt = f.greaterThan(rhs).getGatheredBits(); + +	return (grt & 0x7) ? false : true; +} + + +LLVolumeOctreeListener::LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node) +{ +	node->addListener(this); + +	mBounds = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*4); +	mExtents = mBounds+2; +} + +LLVolumeOctreeListener::~LLVolumeOctreeListener() +{ +	ll_aligned_free_16(mBounds); +} +	 +void LLVolumeOctreeListener::handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent,  +	LLOctreeNode<LLVolumeTriangle>* child) +{ +	new LLVolumeOctreeListener(child); +} + + +LLOctreeTriangleRayIntersect::LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir,  +							   const LLVolumeFace* face, F32* closest_t, +							   LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal) +   : mFace(face), +     mStart(start), +	 mDir(dir), +	 mIntersection(intersection), +	 mTexCoord(tex_coord), +	 mNormal(normal), +	 mBinormal(bi_normal), +	 mClosestT(closest_t), +	 mHitFace(false) +{ +	mEnd.setAdd(mStart, mDir); +} + +void LLOctreeTriangleRayIntersect::traverse(const LLOctreeNode<LLVolumeTriangle>* node) +{ +	LLVolumeOctreeListener* vl = (LLVolumeOctreeListener*) node->getListener(0); + +	/*const F32* start = mStart.getF32(); +	const F32* end = mEnd.getF32(); +	const F32* center = vl->mBounds[0].getF32(); +	const F32* size = vl->mBounds[1].getF32();*/ + +	//if (LLLineSegmentBoxIntersect(mStart.getF32(), mEnd.getF32(), vl->mBounds[0].getF32(), vl->mBounds[1].getF32())) +	if (LLLineSegmentBoxIntersect(mStart, mEnd, vl->mBounds[0], vl->mBounds[1])) +	{ +		node->accept(this); +		for (S32 i = 0; i < node->getChildCount(); ++i) +		{ +			traverse(node->getChild(i)); +		} +	} +} + +void LLOctreeTriangleRayIntersect::visit(const LLOctreeNode<LLVolumeTriangle>* node) +{ +	for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter =  +			node->getData().begin(); iter != node->getData().end(); ++iter) +	{ +		const LLVolumeTriangle* tri = *iter; + +		F32 a, b, t; +		 +		if (LLTriangleRayIntersect(*tri->mV[0], *tri->mV[1], *tri->mV[2], +				mStart, mDir, a, b, t)) +		{ +			if ((t >= 0.f) &&      // if hit is after start +				(t <= 1.f) &&      // and before end +				(t < *mClosestT))   // and this hit is closer +			{ +				*mClosestT = t; +				mHitFace = true; + +				if (mIntersection != NULL) +				{ +					LLVector4a intersect = mDir; +					intersect.mul(*mClosestT); +					intersect.add(mStart); +					mIntersection->set(intersect.getF32ptr()); +				} + + +				if (mTexCoord != NULL) +				{ +					
LLVector2* tc = (LLVector2*) mFace->mTexCoords; +					*mTexCoord = ((1.f - a - b)  * tc[tri->mIndex[0]] + +						a              * tc[tri->mIndex[1]] + +						b              * tc[tri->mIndex[2]]); + +				} + +				if (mNormal != NULL) +				{ +					LLVector4* norm = (LLVector4*) mFace->mNormals; + +					*mNormal    = ((1.f - a - b)  * LLVector3(norm[tri->mIndex[0]]) +  +						a              * LLVector3(norm[tri->mIndex[1]]) + +						b              * LLVector3(norm[tri->mIndex[2]])); +				} + +				if (mBinormal != NULL) +				{ +					LLVector4* binormal = (LLVector4*) mFace->mBinormals; +					*mBinormal = ((1.f - a - b)  * LLVector3(binormal[tri->mIndex[0]]) +  +							a              * LLVector3(binormal[tri->mIndex[1]]) + +							b              * LLVector3(binormal[tri->mIndex[2]])); +				} +			} +		} +	} +} + +const LLVector4a& LLVolumeTriangle::getPositionGroup() const +{ +	return *mPositionGroup; +} + +const F32& LLVolumeTriangle::getBinRadius() const +{ +	return mRadius; +} + + diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h new file mode 100644 index 0000000000..0031626498 --- /dev/null +++ b/indra/llmath/llvolumeoctree.h @@ -0,0 +1,138 @@ +/**  + * @file llvolumeoctree.h + * @brief LLVolume octree classes. + * + * $LicenseInfo:firstyear=2002&license=viewergpl$ + *  + * Copyright (c) 2002-2009, Linden Research, Inc. + *  + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.  Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + *  + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + *  + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + *  + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_LLVOLUME_OCTREE_H +#define LL_LLVOLUME_OCTREE_H + +#include "linden_common.h" +#include "llmemory.h" + +#include "lloctree.h" +#include "llvolume.h" +#include "llvector4a.h" + +class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle> +{ +public: +	 +	LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node); +	~LLVolumeOctreeListener(); +	 +	LLVolumeOctreeListener(const LLVolumeOctreeListener& rhs) +	{ +		*this = rhs; +	} + +	const LLVolumeOctreeListener& operator=(const LLVolumeOctreeListener& rhs) +	{ +		llerrs << "Illegal operation!" 
<< llendl;
+		return *this;
+	}
+
+	// LISTENER FUNCTIONS
+	virtual void handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent, 
+		LLOctreeNode<LLVolumeTriangle>* child);
+	virtual void handleStateChange(const LLTreeNode<LLVolumeTriangle>* node) { }
+	virtual void handleChildRemoval(const LLOctreeNode<LLVolumeTriangle>* parent, 
+			const LLOctreeNode<LLVolumeTriangle>* child) { }
+	virtual void handleInsertion(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
+	virtual void handleRemoval(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
+	virtual void handleDestruction(const LLTreeNode<LLVolumeTriangle>* node) { }
+	
+
+public:
+	LLVector4a* mBounds; // bounding box (center, size) of this node and all its children (tight fit to objects)
+	LLVector4a* mExtents; // extents (min, max) of this node and all its children
+};
+
+class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle>
+{
+public:
+	const LLVolumeFace* mFace;
+	LLVector4a mStart;
+	LLVector4a mDir;
+	LLVector4a mEnd;
+	LLVector3* mIntersection;
+	LLVector2* mTexCoord;
+	LLVector3* mNormal;
+	LLVector3* mBinormal;
+	F32* mClosestT;
+	bool mHitFace;
+
+	LLOctreeTriangleRayIntersect() { }
+
+	LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir, 
+								   const LLVolumeFace* face, F32* closest_t,
+								   LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal);
+
+	void traverse(const LLOctreeNode<LLVolumeTriangle>* node);
+
+	virtual void visit(const LLOctreeNode<LLVolumeTriangle>* node);
+};
+
+class LLVolumeTriangle : public LLRefCount
+{
+public:
+	LLVolumeTriangle()
+	{
+		mPositionGroup = (LLVector4a*) ll_aligned_malloc_16(16);
+	}
+
+	LLVolumeTriangle(const LLVolumeTriangle& rhs)
+	{
+		*this = rhs;
+	}
+
+	const LLVolumeTriangle& operator=(const LLVolumeTriangle& rhs)
+	{
+		llerrs << "Illegal operation!" << llendl;
+		return *this;
+	}
+
+	~LLVolumeTriangle()
+	{
+		ll_aligned_free_16(mPositionGroup);
+	}
+
+	const LLVector4a* mV[3];
+	U16 mIndex[3];
+
+	LLVector4a* mPositionGroup;
+
+	F32 mRadius;
+
+	virtual const LLVector4a& getPositionGroup() const;
+	virtual const F32& getBinRadius() const;
+};
+
+
+#endif
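+
+// Illustrative usage sketch (the 'face' and 'octree_root' variables are
+// hypothetical; an octree over LLVolumeTriangle is built elsewhere in the viewer):
+//   F32 closest_t = 1.f; // t is parametric over the segment [start, start + dir]
+//   LLVector3 hit;
+//   LLOctreeTriangleRayIntersect query(start, dir, face, &closest_t,
+//                                      &hit, NULL, NULL, NULL);
+//   query.traverse(octree_root);
+//   if (query.mHitFace) { /* 'hit' holds the nearest intersection */ }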
