Diffstat (limited to 'indra/llmath')
-rw-r--r-- | indra/llmath/llmatrix3a.cpp | 140
-rw-r--r-- | indra/llmath/llmatrix3a.h | 134
-rw-r--r-- | indra/llmath/llmatrix3a.inl | 125
-rw-r--r-- | indra/llmath/llmatrix4a.h | 149
-rw-r--r-- | indra/llmath/llquaternion2.h | 111
-rw-r--r-- | indra/llmath/llquaternion2.inl | 108
-rw-r--r-- | indra/llmath/llsimdmath.h | 95
-rw-r--r-- | indra/llmath/llsimdtypes.h | 130
-rw-r--r-- | indra/llmath/llsimdtypes.inl | 163
-rw-r--r-- | indra/llmath/llvector4a.cpp | 228
-rw-r--r-- | indra/llmath/llvector4a.h | 331
-rw-r--r-- | indra/llmath/llvector4a.inl | 599
-rw-r--r-- | indra/llmath/llvector4logical.h | 130
-rw-r--r-- | indra/llmath/llvolumeoctree.cpp | 208
-rw-r--r-- | indra/llmath/llvolumeoctree.h | 138
15 files changed, 2789 insertions, 0 deletions
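The diff below introduces the viewer's SSE math layer. For orientation, a minimal standalone sketch (not part of this commit) of the __m128 quad-float type that llsimdtypes.h below typedefs as LLQuad; everything in these files is composed from operations like these:

    // Not part of this commit: a primer on the __m128 ("LLQuad") type.
    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
        __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // <1,2,3,4>: w argument comes first
        __m128 b = _mm_set1_ps(10.f);               // <10,10,10,10>, i.e. a "splat"
        float out[4];
        _mm_storeu_ps(out, _mm_add_ps(a, b));       // element-wise add of all four lanes
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 11 12 13 14
        return 0;
    }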
diff --git a/indra/llmath/llmatrix3a.cpp b/indra/llmath/llmatrix3a.cpp new file mode 100644 index 0000000000..b7468f4914 --- /dev/null +++ b/indra/llmath/llmatrix3a.cpp @@ -0,0 +1,140 @@ +/** + * @file llmatrix3a.cpp + * @brief LLMatrix3a implementation + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmath.h" + +static LL_ALIGN_16(const F32 M_IDENT_3A[12]) = + { 1.f, 0.f, 0.f, 0.f, // Column 1 + 0.f, 1.f, 0.f, 0.f, // Column 2 + 0.f, 0.f, 1.f, 0.f }; // Column 3 + +extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*> (M_IDENT_3A); + +void LLMatrix3a::setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs ) +{ + const LLVector4a col0 = lhs.getColumn(0); + const LLVector4a col1 = lhs.getColumn(1); + const LLVector4a col2 = lhs.getColumn(2); + + for ( int i = 0; i < 3; i++ ) + { + LLVector4a xxxx = _mm_load_ss( rhs.mColumns[i].getF32ptr() ); + xxxx.splat<0>( xxxx ); + xxxx.mul( col0 ); + + { + LLVector4a yyyy = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 1 ); + yyyy.splat<0>( yyyy ); + yyyy.mul( col1 ); + xxxx.add( yyyy ); + } + + { + LLVector4a zzzz = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 2 ); + zzzz.splat<0>( zzzz ); + zzzz.mul( col2 ); + xxxx.add( zzzz ); + } + + xxxx.store4a( mColumns[i].getF32ptr() ); + } + +} + +/*static */void LLMatrix3a::batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst ) +{ + const LLVector4a col0 = xform.getColumn(0); + const LLVector4a col1 = xform.getColumn(1); + const LLVector4a col2 = xform.getColumn(2); + const LLVector4a* maxAddr = src + numVectors; + + if ( numVectors & 0x1 ) + { + LLVector4a xxxx = _mm_load_ss( (const F32*)src ); + LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 ); + LLVector4a zzzz = _mm_load_ss( (const F32*)src + 2 ); + xxxx.splat<0>( xxxx ); + yyyy.splat<0>( yyyy ); + zzzz.splat<0>( zzzz ); + xxxx.mul( col0 ); + yyyy.mul( col1 ); + zzzz.mul( col2 ); + xxxx.add( yyyy ); + xxxx.add( zzzz ); + xxxx.store4a( (F32*)dst ); + src++; + dst++; + } + + + numVectors >>= 1; + while ( src < maxAddr ) + { + _mm_prefetch( (const char*)(src + 32 ), _MM_HINT_NTA ); + _mm_prefetch( (const char*)(dst + 32), _MM_HINT_NTA ); + LLVector4a xxxx = _mm_load_ss( (const F32*)src ); + LLVector4a xxxx1= _mm_load_ss( (const F32*)(src + 1) ); +
+ xxxx.splat<0>( xxxx ); + xxxx1.splat<0>( xxxx1 ); + xxxx.mul( col0 ); + xxxx1.mul( col0 ); + + { + LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 ); + LLVector4a yyyy1 = _mm_load_ss( (const F32*)(src + 1) + 1); + yyyy.splat<0>( yyyy ); + yyyy1.splat<0>( yyyy1 ); + yyyy.mul( col1 ); + yyyy1.mul( col1 ); + xxxx.add( yyyy ); + xxxx1.add( yyyy1 ); + } + + { + LLVector4a zzzz = _mm_load_ss( (const F32*)(src) + 2 ); + LLVector4a zzzz1 = _mm_load_ss( (const F32*)(++src) + 2 ); + zzzz.splat<0>( zzzz ); + zzzz1.splat<0>( zzzz1 ); + zzzz.mul( col2 ); + zzzz1.mul( col2 ); + xxxx.add( zzzz ); + xxxx1.add( zzzz1 ); + } + + xxxx.store4a(dst->getF32ptr()); + src++; + dst++; + + xxxx1.store4a((F32*)dst++); + } +} diff --git a/indra/llmath/llmatrix3a.h b/indra/llmath/llmatrix3a.h new file mode 100644 index 0000000000..56327f9f6d --- /dev/null +++ b/indra/llmath/llmatrix3a.h @@ -0,0 +1,134 @@ +/** + * @file llmatrix3a.h + * @brief LLMatrix3a class header file - memory aligned and vectorized 3x3 matrix + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_LLMATRIX3A_H +#define LL_LLMATRIX3A_H + +///////////////////////////// +// LLMatrix3a, LLRotation +///////////////////////////// +// This class stores a 3x3 (technically 4x3) matrix in column-major order +///////////////////////////// +///////////////////////////// +// These classes are intentionally minimal right now. If you need additional +// functionality, please contact someone with SSE experience (e.g., Falcon or +// Huseby). +///////////////////////////// + +// LLMatrix3a is the base class for LLRotation, which should be used instead any time you're dealing with a +// rotation matrix. 
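Before the class declaration, a plain scalar sketch (not part of this commit; no SSE, helper name hypothetical) of what the SIMD LLMatrix3a::setMul in llmatrix3a.cpp above computes. Matrices here are column-major, so result column i is lhs multiplied by column i of rhs:

    // Scalar reference for LLMatrix3a::setMul: m[c][r] is row r of column c.
    #include <cstdio>

    typedef float F32;

    static void setMulScalar(F32 out[3][3], const F32 lhs[3][3], const F32 rhs[3][3])
    {
        for (int i = 0; i < 3; ++i)         // each column of rhs
            for (int r = 0; r < 3; ++r)     // each row of the result
                out[i][r] = lhs[0][r] * rhs[i][0]   // the xxxx.mul( col0 ) term
                          + lhs[1][r] * rhs[i][1]   // the yyyy.mul( col1 ) term
                          + lhs[2][r] * rhs[i][2];  // the zzzz.mul( col2 ) term
    }

    int main()
    {
        const F32 ident[3][3] = { {1,0,0}, {0,1,0}, {0,0,1} };
        const F32 m[3][3]     = { {1,2,3}, {4,5,6}, {7,8,9} };
        F32 out[3][3];
        setMulScalar(out, ident, m);  // identity * m == m
        printf("%g %g %g\n", out[0][0], out[1][1], out[2][2]);  // 1 5 9
        return 0;
    }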
+class LLMatrix3a +{ +public: + + // Utility function for quickly transforming an array of LLVector4a's + // For transforming a single LLVector4a, see LLVector4a::setRotated + static void batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst ); + + // Utility function to obtain the identity matrix + static inline const LLMatrix3a& getIdentity(); + + ////////////////////////// + // Ctors + ////////////////////////// + + // Ctor + LLMatrix3a() {} + + // Ctor for setting by columns + inline LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 ); + + ////////////////////////// + // Get/Set + ////////////////////////// + + // Loads from an LLMatrix3 + inline void loadu(const LLMatrix3& src); + + // Set rows + inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2); + + // Set columns + inline void setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2); + + // Get read-only access to a specified column. Valid columns are 0-2, but the + // function is unchecked. You've been warned. + inline const LLVector4a& getColumn(const U32 column) const; + + ///////////////////////// + // Matrix modification + ///////////////////////// + + // Set this matrix to the product of lhs and rhs ( this = lhs * rhs ) + void setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs ); + + // Set this matrix to the transpose of src + inline void setTranspose(const LLMatrix3a& src); + + // Set this matrix to a*(1-w) + b*w + inline void setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w); + + ///////////////////////// + // Matrix inspection + ///////////////////////// + + // Sets all 4 elements in 'dest' to the determinant of this matrix. + // If you will be using the determinant in subsequent ops with LLVector4a, use this version + inline void getDeterminant( LLVector4a& dest ) const; + + // Returns the determinant as an LLSimdScalar. Use this if you will be using the determinant + // primarily for scalar operations. + inline LLSimdScalar getDeterminant() const; + + // Returns nonzero if rows 0-2 and columns 0-2 contain no NaN or INF values. Row 3 is ignored + inline LLBool32 isFinite() const; + + // Returns true if this matrix is equal to 'rhs' up to 'tolerance' + inline bool isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + +protected: + + LLVector4a mColumns[3]; + +}; + +class LLRotation : public LLMatrix3a +{ +public: + + LLRotation() {} + + // Returns true if this rotation is orthonormal with det ~= 1 + inline bool isOkRotation() const; +}; + +#endif diff --git a/indra/llmath/llmatrix3a.inl b/indra/llmath/llmatrix3a.inl new file mode 100644 index 0000000000..65fd949f78 --- /dev/null +++ b/indra/llmath/llmatrix3a.inl @@ -0,0 +1,125 @@ +/** + * @file llmatrix3a.inl + * @brief LLMatrix3a inline definitions + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab.
Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmatrix3a.h" +#include "m3math.h" + +inline LLMatrix3a::LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 ) +{ + setColumns( c0, c1, c2 ); +} + +inline void LLMatrix3a::loadu(const LLMatrix3& src) +{ + mColumns[0].load3(src.mMatrix[0]); + mColumns[1].load3(src.mMatrix[1]); + mColumns[2].load3(src.mMatrix[2]); +} + +inline void LLMatrix3a::setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2) +{ + mColumns[0] = r0; + mColumns[1] = r1; + mColumns[2] = r2; + setTranspose( *this ); +} + +inline void LLMatrix3a::setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2) +{ + mColumns[0] = c0; + mColumns[1] = c1; + mColumns[2] = c2; +} + +inline void LLMatrix3a::setTranspose(const LLMatrix3a& src) +{ + const LLQuad srcCol0 = src.mColumns[0]; + const LLQuad srcCol1 = src.mColumns[1]; + const LLQuad unpacklo = _mm_unpacklo_ps( srcCol0, srcCol1 ); + mColumns[0] = _mm_movelh_ps( unpacklo, src.mColumns[2] ); + mColumns[1] = _mm_shuffle_ps( _mm_movehl_ps( srcCol0, unpacklo ), src.mColumns[2], _MM_SHUFFLE(0, 1, 1, 0) ); + mColumns[2] = _mm_shuffle_ps( _mm_unpackhi_ps( srcCol0, srcCol1 ), src.mColumns[2], _MM_SHUFFLE(0, 2, 1, 0) ); +} + +inline const LLVector4a& LLMatrix3a::getColumn(const U32 column) const +{ + llassert( column < 3 ); + return mColumns[column]; +} + +inline void LLMatrix3a::setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w) +{ + mColumns[0].setLerp( a.mColumns[0], b.mColumns[0], w ); + mColumns[1].setLerp( a.mColumns[1], b.mColumns[1], w ); + mColumns[2].setLerp( a.mColumns[2], b.mColumns[2], w ); +} + +inline LLBool32 LLMatrix3a::isFinite() const +{ + return mColumns[0].isFinite3() && mColumns[1].isFinite3() && mColumns[2].isFinite3(); +} + +inline void LLMatrix3a::getDeterminant( LLVector4a& dest ) const +{ + LLVector4a col1xcol2; col1xcol2.setCross3( mColumns[1], mColumns[2] ); + dest.setAllDot3( col1xcol2, mColumns[0] ); +} + +inline LLSimdScalar LLMatrix3a::getDeterminant() const +{ + LLVector4a col1xcol2; col1xcol2.setCross3( mColumns[1], mColumns[2] ); + return col1xcol2.dot3( mColumns[0] ); +} + +inline bool LLMatrix3a::isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance /*= F_APPROXIMATELY_ZERO*/ ) const +{ + return rhs.getColumn(0).equals3(mColumns[0], tolerance) + && rhs.getColumn(1).equals3(mColumns[1], tolerance) + && rhs.getColumn(2).equals3(mColumns[2], tolerance); +} + +inline const LLMatrix3a& LLMatrix3a::getIdentity() +{ + extern const LLMatrix3a LL_M3A_IDENTITY; + return LL_M3A_IDENTITY; +} + +inline bool LLRotation::isOkRotation() const +{ + LLMatrix3a transpose; transpose.setTranspose( *this ); + LLMatrix3a product; 
product.setMul( *this, transpose ); + + LLSimdScalar detMinusOne = getDeterminant() - 1.f; + + return product.isApproximatelyEqual( LLMatrix3a::getIdentity() ) && (detMinusOne.getAbs() < F_APPROXIMATELY_ZERO); +} + diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h new file mode 100644 index 0000000000..0ead045d04 --- /dev/null +++ b/indra/llmath/llmatrix4a.h @@ -0,0 +1,149 @@ +/** + * @file llmatrix4a.h + * @brief LLMatrix4a class header file - memory aligned and vectorized 4x4 matrix + * + * $LicenseInfo:firstyear=2007&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. 
+ * $/LicenseInfo$ + */ + +#ifndef LL_LLMATRIX4A_H +#define LL_LLMATRIX4A_H + +#include "llvector4a.h" +#include "m4math.h" +#include "m3math.h" + +class LLMatrix4a +{ +public: + LLVector4a mMatrix[4]; + + inline void clear() + { + mMatrix[0].clear(); + mMatrix[1].clear(); + mMatrix[2].clear(); + mMatrix[3].clear(); + } + + inline void loadu(const LLMatrix4& src) + { + mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]); + mMatrix[1] = _mm_loadu_ps(src.mMatrix[1]); + mMatrix[2] = _mm_loadu_ps(src.mMatrix[2]); + mMatrix[3] = _mm_loadu_ps(src.mMatrix[3]); + + } + + inline void loadu(const LLMatrix3& src) + { + mMatrix[0].load3(src.mMatrix[0]); + mMatrix[1].load3(src.mMatrix[1]); + mMatrix[2].load3(src.mMatrix[2]); + mMatrix[3].set(0,0,0,1.f); + } + + inline void add(const LLMatrix4a& rhs) + { + mMatrix[0].add(rhs.mMatrix[0]); + mMatrix[1].add(rhs.mMatrix[1]); + mMatrix[2].add(rhs.mMatrix[2]); + mMatrix[3].add(rhs.mMatrix[3]); + } + + inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2) + { + mMatrix[0] = r0; + mMatrix[1] = r1; + mMatrix[2] = r2; + } + + inline void setMul(const LLMatrix4a& m, const F32 s) + { + mMatrix[0].setMul(m.mMatrix[0], s); + mMatrix[1].setMul(m.mMatrix[1], s); + mMatrix[2].setMul(m.mMatrix[2], s); + mMatrix[3].setMul(m.mMatrix[3], s); + } + + inline void setLerp(const LLMatrix4a& a, const LLMatrix4a& b, F32 w) + { + LLVector4a d0,d1,d2,d3; + d0.setSub(b.mMatrix[0], a.mMatrix[0]); + d1.setSub(b.mMatrix[1], a.mMatrix[1]); + d2.setSub(b.mMatrix[2], a.mMatrix[2]); + d3.setSub(b.mMatrix[3], a.mMatrix[3]); + + // this = a + d*w + + d0.mul(w); + d1.mul(w); + d2.mul(w); + d3.mul(w); + + mMatrix[0].setAdd(a.mMatrix[0],d0); + mMatrix[1].setAdd(a.mMatrix[1],d1); + mMatrix[2].setAdd(a.mMatrix[2],d2); + mMatrix[3].setAdd(a.mMatrix[3],d3); + } + + inline void rotate(const LLVector4a& v, LLVector4a& res) + { + res = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); + res.mul(mMatrix[0]); + + LLVector4a y; + y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); + y.mul(mMatrix[1]); + + LLVector4a z; + z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); + z.mul(mMatrix[2]); + + res.add(y); + res.add(z); + } + + inline void affineTransform(const LLVector4a& v, LLVector4a& res) + { + LLVector4a x,y,z; + + x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); + y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); + z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); + + x.mul(mMatrix[0]); + y.mul(mMatrix[1]); + z.mul(mMatrix[2]); + + x.add(y); + z.add(mMatrix[3]); + res.setAdd(x,z); + } +}; + +#endif diff --git a/indra/llmath/llquaternion2.h b/indra/llmath/llquaternion2.h new file mode 100644 index 0000000000..dbb4afe312 --- /dev/null +++ b/indra/llmath/llquaternion2.h @@ -0,0 +1,111 @@ +/** + * @file llquaternion2.h + * @brief LLQuaternion2 class header file - SIMD-enabled quaternion class + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. 
Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_QUATERNION2_H +#define LL_QUATERNION2_H + +///////////////////////////// +// LLQuaternion2 +///////////////////////////// +// This class stores a quaternion x*i + y*j + z*k + w in <x, y, z, w> order +// (i.e., w in high order element of vector) +///////////////////////////// +///////////////////////////// +// These classes are intentionally minimal right now. If you need additional +// functionality, please contact someone with SSE experience (e.g., Falcon or +// Huseby). +///////////////////////////// +#include "llquaternion.h" + +class LLQuaternion2 +{ +public: + + ////////////////////////// + // Ctors + ////////////////////////// + + // Ctor + LLQuaternion2() {} + + // Ctor from LLQuaternion + explicit LLQuaternion2( const class LLQuaternion& quat ); + + ////////////////////////// + // Get/Set + ////////////////////////// + + // Load from an LLQuaternion + inline void operator=( const LLQuaternion& quat ) + { + mQ.loadua( quat.mQ ); + } + + // Return the internal LLVector4a representation of the quaternion + inline const LLVector4a& getVector4a() const; + inline LLVector4a& getVector4aRw(); + + ///////////////////////// + // Quaternion modification + ///////////////////////// + + // Set this quaternion to the conjugate of src + inline void setConjugate(const LLQuaternion2& src); + + // Renormalizes the quaternion. Assumes it has nonzero length. + inline void normalize(); + + // Quantize this quaternion to 8 bit precision + inline void quantize8(); + + // Quantize this quaternion to 16 bit precision + inline void quantize16(); + + ///////////////////////// + // Quaternion inspection + ///////////////////////// + + // Return true if this quaternion is equal to 'rhs'. + // Note! Quaternions exhibit "double-cover", so any rotation has two equally valid + // quaternion representations and they will NOT compare equal. + inline bool equals(const LLQuaternion2& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + + // Return true if all components are finite and the quaternion is normalized + inline bool isOkRotation() const; + +protected: + + LLVector4a mQ; + +}; + +#endif diff --git a/indra/llmath/llquaternion2.inl b/indra/llmath/llquaternion2.inl new file mode 100644 index 0000000000..9a4274d6a4 --- /dev/null +++ b/indra/llmath/llquaternion2.inl @@ -0,0 +1,108 @@ +/** + * @file llquaternion2.inl + * @brief LLQuaternion2 inline definitions + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2010, Linden Research, Inc. 
+ * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llquaternion2.h" + +static const LLQuad LL_V4A_PLUS_ONE = {1.f, 1.f, 1.f, 1.f}; +static const LLQuad LL_V4A_MINUS_ONE = {-1.f, -1.f, -1.f, -1.f}; + +// Ctor from LLQuaternion +inline LLQuaternion2::LLQuaternion2( const LLQuaternion& quat ) +{ + mQ.set(quat.mQ[VX], quat.mQ[VY], quat.mQ[VZ], quat.mQ[VW]); +} + +////////////////////////// +// Get/Set +////////////////////////// + +// Return the internal LLVector4a representation of the quaternion +inline const LLVector4a& LLQuaternion2::getVector4a() const +{ + return mQ; +} + +inline LLVector4a& LLQuaternion2::getVector4aRw() +{ + return mQ; +} + +///////////////////////// +// Quaternion modification +///////////////////////// + +// Set this quaternion to the conjugate of src +inline void LLQuaternion2::setConjugate(const LLQuaternion2& src) +{ + static LL_ALIGN_16( const U32 F_QUAT_INV_MASK_4A[4] ) = { 0x80000000, 0x80000000, 0x80000000, 0x00000000 }; + mQ = _mm_xor_ps(src.mQ, *reinterpret_cast<const LLQuad*>(&F_QUAT_INV_MASK_4A)); +} + +// Renormalizes the quaternion. Assumes it has nonzero length. +inline void LLQuaternion2::normalize() +{ + mQ.normalize4(); +} + +// Quantize this quaternion to 8 bit precision +inline void LLQuaternion2::quantize8() +{ + mQ.quantize8( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE ); + normalize(); +} + +// Quantize this quaternion to 16 bit precision +inline void LLQuaternion2::quantize16() +{ + mQ.quantize16( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE ); + normalize(); +} + + +///////////////////////// +// Quaternion inspection +///////////////////////// + +// Return true if this quaternion is equal to 'rhs'. +// Note! Quaternions exhibit "double-cover", so any rotation has two equally valid +// quaternion representations and they will NOT compare equal. 
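The equals() body that the note above documents follows just below. First, a standalone illustration (not part of this commit) of the sign-mask trick setConjugate uses: XORing the sign bit of x, y, and z, but not w, maps <x,y,z,w> to <-x,-y,-z,w>:

    // The F_QUAT_INV_MASK_4A trick from setConjugate, spelled out with raw SSE2.
    #include <emmintrin.h>
    #include <cstdio>

    int main()
    {
        const __m128  q     = _mm_set_ps(0.8f, 0.3f, -0.2f, 0.5f);  // args w,z,y,x: q = <0.5,-0.2,0.3,0.8>
        const __m128i maskI = _mm_set_epi32(0, (int)0x80000000, (int)0x80000000, (int)0x80000000);
        const __m128  conj  = _mm_xor_ps(q, _mm_castsi128_ps(maskI));
        float out[4];
        _mm_storeu_ps(out, conj);
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // -0.5 0.2 -0.3 0.8
        return 0;
    }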
+inline bool LLQuaternion2::equals(const LLQuaternion2 &rhs, F32 tolerance/* = F_APPROXIMATELY_ZERO*/) const +{ + return mQ.equals4(rhs.mQ, tolerance); +} + +// Return true if all components are finite and the quaternion is normalized +inline bool LLQuaternion2::isOkRotation() const +{ + return mQ.isFinite4() && mQ.isNormalized4(); +} + diff --git a/indra/llmath/llsimdmath.h b/indra/llmath/llsimdmath.h new file mode 100644 index 0000000000..9377bfdb53 --- /dev/null +++ b/indra/llmath/llsimdmath.h @@ -0,0 +1,95 @@ +/** + * @file llsimdmath.h + * @brief Common header for SIMD-based math library (llvector4a, llmatrix3a, etc.) + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_SIMD_MATH_H +#define LL_SIMD_MATH_H + +#ifndef LLMATH_H +#error "Please include llmath.h before this file." +#endif + +#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && ( _M_IX86_FP < 2 ) ) +#error SSE2 not enabled. LLVector4a and related classes will not compile.
+#endif + +template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address) +{ + return reinterpret_cast<T*>( + (reinterpret_cast<U32>(address) + 0xF) & ~0xF); +} + +template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address) +{ + return reinterpret_cast<T*>( + (reinterpret_cast<U32>(address) + 0x3F) & ~0x3F); +} + +#if LL_LINUX || LL_DARWIN + +#define LL_ALIGN_PREFIX(x) +#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x))) + +#elif LL_WINDOWS + +#define LL_ALIGN_PREFIX(x) __declspec(align(x)) +#define LL_ALIGN_POSTFIX(x) + +#else +#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined" +#endif + +#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16) + + + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "llsimdtypes.h" +#include "llsimdtypes.inl" + +class LLMatrix3a; +class LLRotation; +class LLMatrix3; + +#include "llquaternion.h" + +#include "llvector4logical.h" +#include "llvector4a.h" +#include "llmatrix3a.h" +#include "llquaternion2.h" +#include "llvector4a.inl" +#include "llmatrix3a.inl" +#include "llquaternion2.inl" + + +#endif //LL_SIMD_MATH_H diff --git a/indra/llmath/llsimdtypes.h b/indra/llmath/llsimdtypes.h new file mode 100644 index 0000000000..82e318c8bf --- /dev/null +++ b/indra/llmath/llsimdtypes.h @@ -0,0 +1,130 @@ +/** + * @file llsimdtypes.h + * @brief Declaration of basic SIMD math related types + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_SIMD_TYPES_H +#define LL_SIMD_TYPES_H + +#ifndef LL_SIMD_MATH_H +#error "Please include llmath.h before this file." +#endif + +typedef __m128 LLQuad; + + +#if LL_WINDOWS +#pragma warning(push) +#pragma warning( disable : 4800 3 ) // Disable warning about casting int to bool for this class. +#if defined(_MSC_VER) && (_MSC_VER < 1500) +// VC++ 2005 is missing these intrinsics +// __forceinline is MSVC specific and attempts to override compiler inlining judgment. This is so +// even in debug builds this call is a NOP. 
+__forceinline const __m128 _mm_castsi128_ps( const __m128i a ) { return reinterpret_cast<const __m128&>(a); } +__forceinline const __m128i _mm_castps_si128( const __m128 a ) { return reinterpret_cast<const __m128i&>(a); } +#endif // _MSC_VER + +#endif // LL_WINDOWS + +class LLBool32 +{ +public: + inline LLBool32() {} + inline LLBool32(int rhs) : m_bool(rhs) {} + inline LLBool32(unsigned int rhs) : m_bool(rhs) {} + inline LLBool32(bool rhs) { m_bool = static_cast<const int>(rhs); } + inline LLBool32& operator= (bool rhs) { m_bool = (int)rhs; return *this; } + inline bool operator== (bool rhs) const { return static_cast<const bool&>(m_bool) == rhs; } + inline bool operator!= (bool rhs) const { return !operator==(rhs); } + inline operator bool() const { return static_cast<const bool&>(m_bool); } + +private: + int m_bool; +}; + +#if LL_WINDOWS +#pragma warning(pop) +#endif + +class LLSimdScalar +{ +public: + inline LLSimdScalar() {} + inline LLSimdScalar(LLQuad q) + { + mQ = q; + } + + inline LLSimdScalar(F32 f) + { + mQ = _mm_set_ss(f); + } + + static inline const LLSimdScalar& getZero() + { + extern const LLQuad F_ZERO_4A; + return reinterpret_cast<const LLSimdScalar&>(F_ZERO_4A); + } + + inline F32 getF32() const; + + inline LLBool32 isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance = F_APPROXIMATELY_ZERO) const; + + inline LLSimdScalar getAbs() const; + + inline void setMax( const LLSimdScalar& a, const LLSimdScalar& b ); + + inline void setMin( const LLSimdScalar& a, const LLSimdScalar& b ); + + inline LLSimdScalar& operator=(F32 rhs); + + inline LLSimdScalar& operator+=(const LLSimdScalar& rhs); + + inline LLSimdScalar& operator-=(const LLSimdScalar& rhs); + + inline LLSimdScalar& operator*=(const LLSimdScalar& rhs); + + inline LLSimdScalar& operator/=(const LLSimdScalar& rhs); + + inline operator LLQuad() const + { + return mQ; + } + + inline const LLQuad& getQuad() const + { + return mQ; + } + +private: + LLQuad mQ; +}; + +#endif //LL_SIMD_TYPES_H diff --git a/indra/llmath/llsimdtypes.inl b/indra/llmath/llsimdtypes.inl new file mode 100644 index 0000000000..69c858e310 --- /dev/null +++ b/indra/llmath/llsimdtypes.inl @@ -0,0 +1,163 @@ +/** + * @file llsimdtypes.inl + * @brief Inlined definitions of basic SIMD math related types + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." 
LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + + + + +////////////////// +// LLSimdScalar +////////////////// + +inline LLSimdScalar operator+(const LLSimdScalar& a, const LLSimdScalar& b) +{ + LLSimdScalar t(a); + t += b; + return t; +} + +inline LLSimdScalar operator-(const LLSimdScalar& a, const LLSimdScalar& b) +{ + LLSimdScalar t(a); + t -= b; + return t; +} + +inline LLSimdScalar operator*(const LLSimdScalar& a, const LLSimdScalar& b) +{ + LLSimdScalar t(a); + t *= b; + return t; +} + +inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b) +{ + LLSimdScalar t(a); + t /= b; + return t; +} + +inline LLSimdScalar operator-(const LLSimdScalar& a) +{ + static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + return _mm_xor_ps(*reinterpret_cast<const LLQuad*>(signMask), a); +} + +inline LLBool32 operator==(const LLSimdScalar& a, const LLSimdScalar& b) +{ + return _mm_comieq_ss(a, b); +} + +inline LLBool32 operator!=(const LLSimdScalar& a, const LLSimdScalar& b) +{ + return _mm_comineq_ss(a, b); +} + +inline LLBool32 operator<(const LLSimdScalar& a, const LLSimdScalar& b) +{ + return _mm_comilt_ss(a, b); +} + +inline LLBool32 operator<=(const LLSimdScalar& a, const LLSimdScalar& b) +{ + return _mm_comile_ss(a, b); +} + +inline LLBool32 operator>(const LLSimdScalar& a, const LLSimdScalar& b) +{ + return _mm_comigt_ss(a, b); +} + +inline LLBool32 operator>=(const LLSimdScalar& a, const LLSimdScalar& b) +{ + return _mm_comige_ss(a, b); +} + +inline LLBool32 LLSimdScalar::isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance /* = F_APPROXIMATELY_ZERO */) const +{ + const LLSimdScalar tol( tolerance ); + const LLSimdScalar diff = _mm_sub_ss( mQ, rhs.mQ ); + const LLSimdScalar absDiff = diff.getAbs(); + return absDiff <= tol; +} + +inline void LLSimdScalar::setMax( const LLSimdScalar& a, const LLSimdScalar& b ) +{ + mQ = _mm_max_ss( a, b ); +} + +inline void LLSimdScalar::setMin( const LLSimdScalar& a, const LLSimdScalar& b ) +{ + mQ = _mm_min_ss( a, b ); +} + +inline LLSimdScalar& LLSimdScalar::operator=(F32 rhs) +{ + mQ = _mm_set_ss(rhs); + return *this; +} + +inline LLSimdScalar& LLSimdScalar::operator+=(const LLSimdScalar& rhs) +{ + mQ = _mm_add_ss( mQ, rhs ); + return *this; +} + +inline LLSimdScalar& LLSimdScalar::operator-=(const LLSimdScalar& rhs) +{ + mQ = _mm_sub_ss( mQ, rhs ); + return *this; +} + +inline LLSimdScalar& LLSimdScalar::operator*=(const LLSimdScalar& rhs) +{ + mQ = _mm_mul_ss( mQ, rhs ); + return *this; +} + +inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs) +{ + mQ = _mm_div_ss( mQ, rhs ); + return *this; +} + +inline LLSimdScalar LLSimdScalar::getAbs() const +{ + static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; + return _mm_and_ps( mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A)); +} + +inline F32 LLSimdScalar::getF32() const +{ + F32 ret; + _mm_store_ss(&ret, mQ); + return ret; +} diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp new file mode 100644 index 0000000000..b62c17302f --- /dev/null +++ b/indra/llmath/llvector4a.cpp @@ -0,0 +1,228 @@ +/** + * @file llvector4a.cpp + * @brief SIMD vector implementation + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. 
+ * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmath.h" +#include "llquantize.h" + +extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 }; +extern const LLQuad F_APPROXIMATELY_ZERO_4A = { + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO +}; + +extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A ); +extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A ); + +/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes) +{ + assert(src != NULL); + assert(dst != NULL); + assert(bytes > 0); + assert((bytes % sizeof(F32))== 0); + + F32* end = dst + (bytes / sizeof(F32) ); + + if (bytes > 64) + { + F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst); + + //at least 64 (16*4) bytes before the end of the destination, switch to 16 byte copies + F32* end_64 = end-16; + + _mm_prefetch((char*)begin_64, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA); + + while (dst < begin_64) + { + copy4a(dst, src); + dst += 4; + src += 4; + } + + while (dst < end_64) + { + _mm_prefetch((char*)src + 512, _MM_HINT_NTA); + _mm_prefetch((char*)dst + 512, _MM_HINT_NTA); + copy4a(dst, src); + copy4a(dst+4, src+4); + copy4a(dst+8, src+8); + copy4a(dst+12, src+12); + + dst += 16; + src += 16; + } + } + + while (dst < end) + { + copy4a(dst, src); + dst += 4; + src += 4; + } +} + +void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec ) +{ + const LLVector4a col0 = rot.getColumn(0); + const LLVector4a col1 = rot.getColumn(1); + const LLVector4a col2 = rot.getColumn(2); + + LLVector4a result = _mm_load_ss( vec.getF32ptr() ); + result.splat<0>( result ); + result.mul( col0 ); + + { + LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 ); + yyyy.splat<0>( yyyy ); + yyyy.mul( col1 ); + result.add( yyyy ); + } + + { + LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 ); + zzzz.splat<0>( zzzz ); + zzzz.mul( col2 ); + result.add( zzzz ); + } + + *this = result; +} + +void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec ) +{ + const LLVector4a& quatVec = quat.getVector4a(); + LLVector4a temp; temp.setCross3(quatVec, vec); + temp.add( temp ); + + const LLVector4a 
realPart( quatVec.getScalarAt<3>() ); + LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart ); + + mQ = vec; + add( tempTimesReal ); + + LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp ); + add(imagCrossTemp); +} + +void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high ) +{ + LLVector4a val(mQ); + LLVector4a delta; delta.setSub( high, low ); + + { + val.clamp(low, high); + val.sub(low); + + // 8-bit quantization means we can do with just 12 bits of reciprocal accuracy + const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ); +// { +// static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; +// LLVector4a two; two.load4a( F_TWO_4A ); +// +// // Here we use _mm_rcp_ps plus one round of newton-raphson +// // We wish to find 'x' such that x = 1/delta +// // As a first approximation, we take x0 = _mm_rcp_ps(delta) +// // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) +// // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf +// const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); +// oneOverDelta.setMul( delta, recipApprox ); +// oneOverDelta.setSub( two, oneOverDelta ); +// oneOverDelta.mul( recipApprox ); +// } + + val.mul(oneOverDelta); + val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A)); + } + + val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + + { + val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); + val.mul(delta); + val.add(low); + } + + { + LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); + LLVector4a absVal; absVal.setAbs( val ); + setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); + } +} + +void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high ) +{ + LLVector4a val(mQ); + LLVector4a delta; delta.setSub( high, low ); + + { + val.clamp(low, high); + val.sub(low); + + // 16-bit quantization means we need a round of Newton-Raphson + LLVector4a oneOverDelta; + { + static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; + LLVector4a two; two.load4a( F_TWO_4A ); + + // Here we use _mm_rcp_ps plus one round of newton-raphson + // We wish to find 'x' such that x = 1/delta + // As a first approximation, we take x0 = _mm_rcp_ps(delta) + // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) + // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf + const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); + oneOverDelta.setMul( delta, recipApprox ); + oneOverDelta.setSub( two, oneOverDelta ); + oneOverDelta.mul( recipApprox ); + } + + val.mul(oneOverDelta); + val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A)); + } + + val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + + { + val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); + val.mul(delta); + val.add(low); + } + + { + LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); + LLVector4a absVal; absVal.setAbs( val ); + setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); + } +} diff --git a/indra/llmath/llvector4a.h b/indra/llmath/llvector4a.h new file mode 100644 index 0000000000..76a3e999ce --- /dev/null +++ b/indra/llmath/llvector4a.h @@ -0,0 +1,331 @@ +/** + * @file llvector4a.h + * @brief LLVector4a class header file - memory aligned and vectorized 4 component vector + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. 
+ * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_LLVECTOR4A_H +#define LL_LLVECTOR4A_H + + +class LLRotation; + +#include <assert.h> +#include "llpreprocessor.h" + +/////////////////////////////////// +// FIRST TIME USERS PLEASE READ +////////////////////////////////// +// This is just the beginning of LLVector4a. There are many more useful functions +// yet to be implemented. For example, setNeg to negate a vector, rotate() to apply +// a matrix rotation, various functions to manipulate only the X, Y, and Z elements +// and many others (including a whole variety of accessors). So if you don't see a +// function here that you need, please contact Falcon or someone else with SSE +// experience (Richard, I think, has some and davep has a little as of the time +// of this writing, July 08, 2010) about getting it implemented before you resort to +// LLVector3/LLVector4. +///////////////////////////////// + +class LLVector4a +{ +public: + + /////////////////////////////////// + // STATIC METHODS + /////////////////////////////////// + + // Call initClass() at startup to avoid 15,000+ cycle penalties from denormalized numbers + static void initClass() + { + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + } + + // Return a vector of all zeros + static inline const LLVector4a& getZero() + { + extern const LLVector4a LL_V4A_ZERO; + return LL_V4A_ZERO; + } + + // Return a vector of all epsilon, where epsilon is a small float suitable for approximate equality checks + static inline const LLVector4a& getEpsilon() + { + extern const LLVector4a LL_V4A_EPSILON; + return LL_V4A_EPSILON; + } + + // Copy 16 bytes from src to dst. Source and destination must be 16-byte aligned + static inline void copy4a(F32* dst, const F32* src) + { + _mm_store_ps(dst, _mm_load_ps(src)); + } +
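A usage sketch (not part of this commit; alignas(16) stands in for the viewer's LL_ALIGN_16 macro) of what copy4a above expands to, and the 16-byte alignment it requires:

    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
        alignas(16) float src[4] = { 1.f, 2.f, 3.f, 4.f };  // load4a/store4a/copy4a all
        alignas(16) float dst[4];                           // require 16-byte alignment

        _mm_store_ps(dst, _mm_load_ps(src));                // copy4a(dst, src)

        printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);  // 1 2 3 4
        return 0;
    }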
+ // Copy 'bytes' bytes from src to dst in 16-byte blocks. Source and destination must not overlap. + static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes); + + //////////////////////////////////// + // CONSTRUCTORS + //////////////////////////////////// + + LLVector4a() + { //DO NOT INITIALIZE -- The overhead is completely unnecessary + } + + LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f) + { + set(x,y,z,w); + } + + LLVector4a(F32 x) + { + splat(x); + } + + LLVector4a(const LLSimdScalar& x) + { + splat(x); + } + + LLVector4a(LLQuad q) + { + mQ = q; + } + + //////////////////////////////////// + // LOAD/STORE + //////////////////////////////////// + + // Load from 16-byte aligned src array (preferred method of loading) + inline void load4a(const F32* src); + + // Load from unaligned src array (NB: Significantly slower than load4a) + inline void loadua(const F32* src); + + // Load only three floats beginning at address 'src'. Slowest method. + inline void load3(const F32* src); + + // Store to a 16-byte aligned memory address + inline void store4a(F32* dst) const; + + //////////////////////////////////// + // BASIC GET/SET + //////////////////////////////////// + + // Return a "this" as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon) + inline F32* getF32ptr(); + + // Return a "this" as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon) + inline const F32* const getF32ptr() const; + + // Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates + // the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead + inline F32 operator[](const S32 idx) const; + + // Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. + inline LLSimdScalar getScalarAt(const S32 idx) const; + + // Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. + template <int N> LL_FORCE_INLINE LLSimdScalar getScalarAt() const; + template <> LL_FORCE_INLINE LLSimdScalar getScalarAt<0>() const; + + // Set to an x, y, z and optional w provided + inline void set(F32 x, F32 y, F32 z, F32 w = 0.f); + + // Set to all zeros.
This is preferred to using ::getZero() + inline void clear(); + + // Set all elements to 'x' + inline void splat(const F32 x); + + // Set all elements to 'x' + inline void splat(const LLSimdScalar& x); + + // Set all 4 elements to element N of src, with N known at compile time + template <int N> void splat(const LLVector4a& src); + + // Set all 4 elements to element i of v, with i NOT known at compile time + inline void splat(const LLVector4a& v, U32 i); + + // Select bits from sourceIfTrue and sourceIfFalse according to bits in mask + inline void setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse ); + + //////////////////////////////////// + // ALGEBRAIC + //////////////////////////////////// + + // Set this to the element-wise (a + b) + inline void setAdd(const LLVector4a& a, const LLVector4a& b); + + // Set this to element-wise (a - b) + inline void setSub(const LLVector4a& a, const LLVector4a& b); + + // Set this to element-wise multiply (a * b) + inline void setMul(const LLVector4a& a, const LLVector4a& b); + + // Set this to element-wise quotient (a / b) + inline void setDiv(const LLVector4a& a, const LLVector4a& b); + + // Set this to the element-wise absolute value of src + inline void setAbs(const LLVector4a& src); + + // Add to each component in this vector the corresponding component in rhs + inline void add(const LLVector4a& rhs); + + // Subtract from each component in this vector the corresponding component in rhs + inline void sub(const LLVector4a& rhs); + + // Multiply each component in this vector by the corresponding component in rhs + inline void mul(const LLVector4a& rhs); + + // Divide each component in this vector by the corresponding component in rhs + inline void div(const LLVector4a& rhs); + + // Multiply this vector by x in a scalar fashion + inline void mul(const F32 x); + + // Set this to (a x b) (geometric cross-product) + inline void setCross3(const LLVector4a& a, const LLVector4a& b); + + // Set all elements to the dot product of the x, y, and z elements in a and b + inline void setAllDot3(const LLVector4a& a, const LLVector4a& b); + + // Set all elements to the dot product of the x, y, z, and w elements in a and b + inline void setAllDot4(const LLVector4a& a, const LLVector4a& b); + + // Return the 3D dot product of this vector and b + inline LLSimdScalar dot3(const LLVector4a& b) const; + + // Return the 4D dot product of this vector and b + inline LLSimdScalar dot4(const LLVector4a& b) const; + + // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed + // Note that this does not consider zero length vectors! + inline void normalize3(); + + // Same as normalize3() but with respect to all 4 components + inline void normalize4(); + + // Same as normalize3(), but returns length as a SIMD scalar + inline LLSimdScalar normalize3withLength(); +
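The precision notes here (22 bits for normalize3, 10-12 bits for normalize3fast below) reflect refined versus raw SSE reciprocal estimates. A standalone sketch (not part of this commit) of the x1 = x0*(2 - a*x0) Newton-Raphson step that the quantize16 comment in llvector4a.cpp above describes for computing 1/delta:

    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
        const __m128 a   = _mm_set1_ps(3.f);
        const __m128 x0  = _mm_rcp_ps(a);   // ~12-bit estimate of 1/a
        const __m128 two = _mm_set1_ps(2.f);
        const __m128 x1  = _mm_mul_ps(x0, _mm_sub_ps(two, _mm_mul_ps(a, x0)));  // one refinement
        float r0, r1;
        _mm_store_ss(&r0, x0);
        _mm_store_ss(&r1, x1);
        printf("estimate %.9f refined %.9f exact %.9f\n", r0, r1, 1.f / 3.f);
        return 0;
    }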
+ // Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed + // Note that this does not consider zero length vectors! + inline void normalize3fast(); + + // Return true if this vector is normalized with respect to x,y,z up to tolerance + inline LLBool32 isNormalized3( F32 tolerance = 1e-3 ) const; + + // Return true if this vector is normalized with respect to all components up to tolerance + inline LLBool32 isNormalized4( F32 tolerance = 1e-3 ) const; + + // Set all elements to the length of vector 'v' + inline void setAllLength3( const LLVector4a& v ); + + // Get this vector's length + inline LLSimdScalar getLength3() const; + + // Set the components of this vector to the minimum of the corresponding components of lhs and rhs + inline void setMin(const LLVector4a& lhs, const LLVector4a& rhs); + + // Set the components of this vector to the maximum of the corresponding components of lhs and rhs + inline void setMax(const LLVector4a& lhs, const LLVector4a& rhs); + + // Clamps this vector to be within the component-wise range low to high (inclusive) + inline void clamp( const LLVector4a& low, const LLVector4a& high ); + + // Set this to lhs*(1-c) + rhs*c + inline void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c); + + // Return true (nonzero) if x, y, z (and w for Finite4) are all finite floats + inline LLBool32 isFinite3() const; + inline LLBool32 isFinite4() const; + + // Set this vector to 'vec' rotated by the LLRotation or LLQuaternion2 provided + void setRotated( const LLRotation& rot, const LLVector4a& vec ); + void setRotated( const class LLQuaternion2& quat, const LLVector4a& vec ); + + // Set this vector to 'vec' rotated by the INVERSE of the LLRotation or LLQuaternion2 provided + inline void setRotatedInv( const LLRotation& rot, const LLVector4a& vec ); + inline void setRotatedInv( const class LLQuaternion2& quat, const LLVector4a& vec ); + + // Quantize this vector to 8 or 16 bit precision + void quantize8( const LLVector4a& low, const LLVector4a& high ); + void quantize16( const LLVector4a& low, const LLVector4a& high ); + + //////////////////////////////////// + // LOGICAL + //////////////////////////////////// + // The functions in this section will compare the elements in this vector + // to those in rhs and return an LLVector4Logical with all bits set in elements + // where the comparison was true and all bits unset in elements where the comparison + // was false. See llvector4logical.h + //////////////////////////////////// + // WARNING: Other than equals3 and equals4, these functions do NOT account + // for floating point tolerance. You should include the appropriate tolerance + // in the inputs.
+ //////////////////////////////////// + + inline LLVector4Logical greaterThan(const LLVector4a& rhs) const; + + inline LLVector4Logical lessThan(const LLVector4a& rhs) const; + + inline LLVector4Logical greaterEqual(const LLVector4a& rhs) const; + + inline LLVector4Logical lessEqual(const LLVector4a& rhs) const; + + inline LLVector4Logical equal(const LLVector4a& rhs) const; + + // Returns true if this and rhs are componentwise equal up to the specified absolute tolerance + inline bool equals4(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + + inline bool equals3(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const; + + //////////////////////////////////// + // OPERATORS + //////////////////////////////////// + + // Do NOT add additional operators without consulting someone with SSE experience + inline const LLVector4a& operator= ( const LLVector4a& rhs ); + + inline const LLVector4a& operator= ( const LLQuad& rhs ); + + inline operator LLQuad() const; + +private: + LLQuad mQ; +}; + +inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p) +{ + min.setMin(min, p); + max.setMax(max, p); +} + +#endif diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl new file mode 100644 index 0000000000..e52b550883 --- /dev/null +++ b/indra/llmath/llvector4a.inl @@ -0,0 +1,599 @@ +/** + * @file llvector4a.inl + * @brief LLVector4a inline function implementations + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +//////////////////////////////////// +// LOAD/STORE +//////////////////////////////////// + +// Load from 16-byte aligned src array (preferred method of loading) +inline void LLVector4a::load4a(const F32* src) +{ + mQ = _mm_load_ps(src); +} + +// Load from unaligned src array (NB: Significantly slower than load4a) +inline void LLVector4a::loadua(const F32* src) +{ + mQ = _mm_loadu_ps(src); +} +
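A quick sketch (not part of this commit) of the alignment contract separating the two load paths just defined; load3 follows below:

    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
        alignas(16) float aligned[4] = { 1.f, 2.f, 3.f, 4.f };
        float plain[5] = { 0.f, 1.f, 2.f, 3.f, 4.f };

        __m128 a = _mm_load_ps(aligned);     // load4a path: requires 16-byte alignment
        __m128 b = _mm_loadu_ps(plain + 1);  // loadua path: any address, but slower
        // _mm_load_ps(plain + 1) would fault whenever plain+1 is not 16-byte aligned

        float out[4];
        _mm_storeu_ps(out, _mm_add_ps(a, b));
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 4 6 8
        return 0;
    }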
+inline void LLVector4a::load3(const F32* src)
+{
+    // NB: _mm_set_ps takes its arguments in { W, Z, Y, X } order, so the vector
+    // ends up as { X, Y, Z, W } in memory, with W = 0
+    mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
+}
+
+// Store to a 16-byte aligned memory address
+inline void LLVector4a::store4a(F32* dst) const
+{
+    _mm_store_ps(dst, mQ);
+}
+
+////////////////////////////////////
+// BASIC GET/SET
+////////////////////////////////////
+
+// Return this vector's data as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
+F32* LLVector4a::getF32ptr()
+{
+    return (F32*) &mQ;
+}
+
+// Return this vector's data as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
+const F32* const LLVector4a::getF32ptr() const
+{
+    return (const F32* const) &mQ;
+}
+
+// Read-only access to a single float in this vector. Do not use in proximity to any function call that manipulates
+// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
+inline F32 LLVector4a::operator[](const S32 idx) const
+{
+    return ((F32*)&mQ)[idx];
+}
+
+// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
+inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const
+{
+    // Return the appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
+    switch (idx)
+    {
+        case 0:
+            return mQ;
+        case 1:
+            return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
+        case 2:
+            return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
+        case 3:
+        default:
+            return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
+    }
+}
+
+// Prefer this templated version when the element index is known at compile time.
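+// Illustrative sketch of both accessors (assumed caller code, not from this file):
+//   LLVector4a v; v.set(1.f, 2.f, 3.f, 4.f);
+//   LLSimdScalar z = v.getScalarAt<2>();   // compile-time index: a single shuffle
+//   LLSimdScalar y = v.getScalarAt(idx);   // runtime index: switch on idx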
+template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const
+{
+    return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
+}
+
+template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const
+{
+    return mQ;
+}
+
+// Set to the x, y, z and optional w provided
+inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
+{
+    mQ = _mm_set_ps(w, z, y, x);
+}
+
+// Set to all zeros
+inline void LLVector4a::clear()
+{
+    mQ = LLVector4a::getZero().mQ;
+}
+
+inline void LLVector4a::splat(const F32 x)
+{
+    mQ = _mm_set1_ps(x);
+}
+
+inline void LLVector4a::splat(const LLSimdScalar& x)
+{
+    mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) );
+}
+
+// Set all 4 elements to element N of src, with N known at compile time
+template <int N> void LLVector4a::splat(const LLVector4a& src)
+{
+    mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) );
+}
+
+// Set all 4 elements to element i of v, with i NOT known at compile time
+inline void LLVector4a::splat(const LLVector4a& v, U32 i)
+{
+    switch (i)
+    {
+        case 0:
+            mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
+            break;
+        case 1:
+            mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
+            break;
+        case 2:
+            mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
+            break;
+        case 3:
+            mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
+            break;
+    }
+}
+
+// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask
+inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse )
+{
+    // (((sourceIfTrue ^ sourceIfFalse) & mask) ^ sourceIfFalse)
+    // E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b
+    // (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b,
+    // as expected (01 from sourceIfTrue, 10 from sourceIfFalse)
+    // Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
+    mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) );
+}
+
+////////////////////////////////////
+// ALGEBRAIC
+////////////////////////////////////
+
+// Set this to the element-wise sum (a + b)
+inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b)
+{
+    mQ = _mm_add_ps(a.mQ, b.mQ);
+}
+
+// Set this to the element-wise difference (a - b)
+inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b)
+{
+    mQ = _mm_sub_ps(a.mQ, b.mQ);
+}
+
+// Set this to the element-wise product (a * b)
+inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b)
+{
+    mQ = _mm_mul_ps(a.mQ, b.mQ);
+}
+
+// Set this to the element-wise quotient (a / b)
+inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b)
+{
+    mQ = _mm_div_ps( a.mQ, b.mQ );
+}
+
+// Set this to the element-wise absolute value of src
+inline void LLVector4a::setAbs(const LLVector4a& src)
+{
+    static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
+    mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
+}
+
+// Add to each component in this vector the corresponding component in rhs
+inline void LLVector4a::add(const LLVector4a& rhs)
+{
+    mQ = _mm_add_ps(mQ, rhs.mQ);
+}
+
+// Subtract from each component in this vector the corresponding component in rhs
+inline void LLVector4a::sub(const LLVector4a& rhs)
+{
+    mQ = _mm_sub_ps(mQ, rhs.mQ);
+}
+
+// Multiply each component in this vector by the corresponding component in rhs
+inline void LLVector4a::mul(const LLVector4a& rhs)
+{
+    mQ = _mm_mul_ps(mQ, rhs.mQ);
+}
+
+// Divide each component in this vector by the corresponding component in rhs
+inline void LLVector4a::div(const LLVector4a& rhs)
+{
+    // TODO: Check accuracy, maybe add divFast
+    mQ = _mm_div_ps(mQ, rhs.mQ);
+}
+
+// Multiply each component of this vector by the scalar x
+inline void LLVector4a::mul(const F32 x)
+{
+    LLVector4a t;
+    t.splat(x);
+
+    mQ = _mm_mul_ps(mQ, t.mQ);
+}
+
+// Set this to (a x b) (geometric cross product)
+inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
+{
+    // Vectors are stored in memory in w, z, y, x order from high to low
+    // Set vector1 = { a[W], a[X], a[Z], a[Y] }
+    const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+    // Set vector2 = { b[W], b[Y], b[X], b[Z] }
+    const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+    // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
+    mQ = _mm_mul_ps( vector1, vector2 );
+    // vector3 = { a[W], a[Y], a[X], a[Z] }
+    const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+    // vector4 = { b[W], b[X], b[Z], b[Y] }
+    const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+    // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
+    mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
+}
+
+/* This function works, but may be slightly slower than the one below on older machines
+inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
+{
+    // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+    const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+    // wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
+    const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
+    // xPlusY = { 2*a[W]*b[W], 2*a[Z]*b[Z], a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y] }
+    const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
+    // xPlusYSplat = { a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y], a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y] }
+    const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
+    // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
+    const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
+    // mQ = { a[Z]*b[Z] + a[Y]*b[Y] + a[X]*b[X], same, same, same }
+    mQ = _mm_add_ps(zSplat, xPlusYSplat);
+}*/
+
+// Set all elements to the dot product of the x, y, and z elements in a and b
+inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
+{
+    // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+    const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+    // wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
+    const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
+    // xPlusY = { 2*a[W]*b[W], 2*a[Z]*b[Z], a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y] }
+    const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
+    // xPlusYSplat = { a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y], a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y] }
+    const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
+    // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
+    const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
+    // mQ = { a[Z]*b[Z] + a[Y]*b[Y] + a[X]*b[X], same, same, same }
+    mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
+}
+
+// Set all elements to the dot product of the x, y, z, and w elements in a and b
+inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
+{
+    // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+    const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+    // zwxy = { a[Z]*b[Z], a[W]*b[W], a[X]*b[X], a[Y]*b[Y] }
+    const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
+    // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z]*b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y] }
+    const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
+    // xPlusYSplat = { a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y], a[Y]*b[Y] + a[X]*b[X], a[X]*b[X] + a[Y]*b[Y] }
+    const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
+    const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
+
+    // mQ = { a[W]*b[W] + a[Z]*b[Z] + a[Y]*b[Y] + a[X]*b[X], same, same, same }
+    mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
+}
+
+// Return the 3D dot product of this vector and b
+inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
+{
+    const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
+    const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
+    const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
+    const LLQuad xPlusY = _mm_add_ps( ab, splatY );
+    return _mm_add_ps( xPlusY, splatZ );
+}
+
+// Return the 4D dot product of this vector and b
+inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
+{
+    // ab = { w, z, y, x } (high to low)
+    const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
+    // upperProdsInLowerElems = { w, z, w, z }
+    const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
+    // sumOfPairs = { 2w, 2z, w+y, z+x }
+    const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
+    // shuffled = { w+y, w+y, w+y, w+y }
+    const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
+    // lowest element = (z+x) + (w+y)
+    return _mm_add_ss( sumOfPairs, shuffled );
+}
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize3()
+{
+    // lenSqrd = a dot a
+    LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+    // rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+    const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+    static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+    static const LLQuad three = { 3.f, 3.f, 3.f, 3.f };
+    // Now we do one round of Newton-Raphson approximation to get full accuracy.
+    // According to the Newton-Raphson method, given a first approximation 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)),
+    // the next better approximation is w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+    //     w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+    //            = 0.5 * w * (3 - a*w^2)
+    // Our first approximation is w = rsqrt. We want out = a * w[i+1] (here 'a' is the input vector, not the 'a' in the
+    // formula above, which is actually lenSqrd).
+    // So out = a * [0.5 * rsqrt * (3 - lenSqrd * rsqrt * rsqrt)]
+    const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+    const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+    const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps( three, AtimesRsqrtTimesRsqrt );
+    const LLQuad nrApprox = _mm_mul_ps( half, _mm_mul_ps( rsqrt, threeMinusAtimesRsqrtTimesRsqrt ) );
+    mQ = _mm_mul_ps( mQ, nrApprox );
+}
+
+// Normalize this vector with respect to all components. Accurate to 22 bits of precision.
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize4()
+{
+    // lenSqrd = a dot a
+    LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
+    // rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+    const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+    static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+    static const LLQuad three = { 3.f, 3.f, 3.f, 3.f };
+    // One round of Newton-Raphson refinement, exactly as derived in normalize3() above:
+    // w[i+1] = 0.5 * w * (3 - a*w^2), with w = rsqrt and a = lenSqrd.
+    const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+    const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+    const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps( three, AtimesRsqrtTimesRsqrt );
+    const LLQuad nrApprox = _mm_mul_ps( half, _mm_mul_ps( rsqrt, threeMinusAtimesRsqrtTimesRsqrt ) );
+    mQ = _mm_mul_ps( mQ, nrApprox );
+}
+
+// Normalize this vector with respect to the x, y, and z components only, and return the vector's original length.
+// Accurate to 22 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline LLSimdScalar LLVector4a::normalize3withLength()
+{
+    // lenSqrd = a dot a
+    LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+    // rsqrt = approximate reciprocal square root (i.e., { ~1/len(a), ~1/len(a), ~1/len(a), ~1/len(a) })
+    const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+    static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+    static const LLQuad three = { 3.f, 3.f, 3.f, 3.f };
+    // One round of Newton-Raphson refinement, exactly as derived in normalize3() above:
+    // w[i+1] = 0.5 * w * (3 - a*w^2), with w = rsqrt and a = lenSqrd.
+    const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+    const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+    const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps( three, AtimesRsqrtTimesRsqrt );
+    const LLQuad nrApprox = _mm_mul_ps( half, _mm_mul_ps( rsqrt, threeMinusAtimesRsqrtTimesRsqrt ) );
+    mQ = _mm_mul_ps( mQ, nrApprox );
+    // Return the pre-normalization length: sqrt(lenSqrd) in the lowest element
+    return _mm_sqrt_ss(lenSqrd);
+}
+
+// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
+// Note that this does not consider zero length vectors!
+inline void LLVector4a::normalize3fast()
+{
+    LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+    const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+    mQ = _mm_mul_ps( mQ, approxRsqrt );
+}
+
+// Return true if this vector is normalized with respect to x,y,z up to tolerance
+inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
+{
+    static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
+    LLSimdScalar tol = _mm_load_ss( &tolerance );
+    tol = _mm_mul_ss( tol, tol );
+    LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
+    lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
+    lenSquared.setAbs(lenSquared);
+    return _mm_comile_ss( lenSquared, tol );
+}
+
+// Return true if this vector is normalized with respect to all components up to tolerance
+inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
+{
+    static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
+    LLSimdScalar tol = _mm_load_ss( &tolerance );
+    tol = _mm_mul_ss( tol, tol );
+    LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
+    lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
+    lenSquared.setAbs(lenSquared);
+    return _mm_comile_ss( lenSquared, tol );
+}
+
+// Set all elements to the length of vector 'v'
+inline void LLVector4a::setAllLength3( const LLVector4a& v )
+{
+    LLVector4a lenSqrd;
+    lenSqrd.setAllDot3(v, v);
+
+    mQ = _mm_sqrt_ps(lenSqrd.mQ);
+}
+
+// Get this vector's length
+inline LLSimdScalar LLVector4a::getLength3() const
+{
+    return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) );
+}
+
+// Set the components of this vector to the minimum of the corresponding components of lhs and rhs
+inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs)
+{
+    mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
+}
+
+// Set the components of this vector to the maximum of the corresponding components of lhs and rhs
+inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
+{
+    mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
+}
+
+// Set this to (c * lhs) + ((1 - c) * rhs)
+inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
+{
+    LLVector4a a = lhs;
+    a.mul(c);
+
+    LLVector4a b = rhs;
+    b.mul(1.f-c);
+
+    setAdd(a, b);
+}
+
+inline LLBool32 LLVector4a::isFinite3() const
+{
+    static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+    const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
+    const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
+    const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
+    return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ );
+}
+
+inline LLBool32 LLVector4a::isFinite4() const
+{
+    static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+    const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
+    const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
+    const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
+    return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW );
+}
+
+inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec )
+{
+    LLRotation inv; inv.setTranspose( rot );
+    setRotated( inv, vec );
+}
+
+inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec )
+{
+    LLQuaternion2 invRot; invRot.setConjugate( quat );
+    setRotated(invRot, vec);
+}
+
+inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
+{
+    const LLVector4Logical highMask = greaterThan( high );
+    const LLVector4Logical lowMask = lessThan( low );
+
+    setSelectWithMask( highMask, high, *this );
+    setSelectWithMask( lowMask, low, *this );
+}
+
+
+////////////////////////////////////
+// LOGICAL
+////////////////////////////////////
+// The functions in this section compare the elements in this vector
+// to those in rhs and return an LLVector4Logical with all bits set in elements
+// where the comparison was true and all bits unset in elements where the comparison
+// was false. See llvector4logical.h
+////////////////////////////////////
+// WARNING: Other than equals3 and equals4, these functions do NOT account
+// for floating point tolerance. You should include the appropriate tolerance
+// in the inputs.
+////////////////////////////////////
+
+inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const
+{
+    return _mm_cmpgt_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const
+{
+    return _mm_cmplt_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const
+{
+    return _mm_cmpge_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const
+{
+    return _mm_cmple_ps(mQ, rhs.mQ);
+}
+
+inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const
+{
+    return _mm_cmpeq_ps(mQ, rhs.mQ);
+}
+
+// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
+inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const
+{
+    LLVector4a diff; diff.setSub( *this, rhs );
+    diff.setAbs( diff );
+    const LLQuad tol = _mm_set1_ps( tolerance );
+    const LLQuad cmp = _mm_cmplt_ps( diff, tol );
+    return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
+}
+
+inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
+{
+    LLVector4a diff; diff.setSub( *this, rhs );
+    diff.setAbs( diff );
+    const LLQuad tol = _mm_set1_ps( tolerance );
+    const LLQuad t = _mm_cmplt_ps( diff, tol );
+    return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
+}
+
+////////////////////////////////////
+// OPERATORS
+////////////////////////////////////
+
+// Do NOT add additional operators without consulting someone with SSE experience
+inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
+{
+    mQ = rhs.mQ;
+    return *this;
+}
+
+inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
+{
+    mQ = rhs;
+    return *this;
+}
+
+inline LLVector4a::operator LLQuad() const
+{
+    return mQ;
+}
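+
+// End-to-end sketch (assumed caller code, not part of this file): computing a
+// unit face normal from three LLVector4a points p0, p1, p2:
+//   LLVector4a e1; e1.setSub(p1, p0);
+//   LLVector4a e2; e2.setSub(p2, p0);
+//   LLVector4a n;  n.setCross3(e1, e2);
+//   n.normalize3();               // or normalize3fast() where 10-12 bits suffice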
diff --git a/indra/llmath/llvector4logical.h b/indra/llmath/llvector4logical.h
new file mode 100644
index 0000000000..1c7ee1d79f
--- /dev/null
+++ b/indra/llmath/llvector4logical.h
@@ -0,0 +1,130 @@
+/**
+ * @file llvector4logical.h
+ * @brief LLVector4Logical class header file - Companion class to LLVector4a for logical and bit-twiddling operations
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ *
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ *
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab. Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ *
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ *
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ *
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_VECTOR4LOGICAL_H
+#define LL_VECTOR4LOGICAL_H
+
+
+////////////////////////////
+// LLVector4Logical
+////////////////////////////
+// This class is incomplete. If you need additional functionality,
+// for example setting/unsetting particular elements or performing
+// other boolean operations, feel free to implement. If you need
+// assistance in determining the most optimal implementation,
+// contact someone with SSE experience (e.g., Falcon, Richard, Davep)
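+// A typical use (an illustrative sketch, assuming the LLVector4a API above):
+//   LLVector4Logical mask = a.greaterThan(b);
+//   if (mask.areAnySet(LLVector4Logical::MASK_XYZ))
+//   {
+//       // at least one of a's x, y, or z exceeds the corresponding element of b
+//   }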
+////////////////////////////
+
+static LL_ALIGN_16(const U32 S_V4LOGICAL_MASK_TABLE[4*4]) =
+{
+    0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
+};
+
+class LLVector4Logical
+{
+public:
+
+    enum {
+        MASK_X = 1,
+        MASK_Y = 1 << 1,
+        MASK_Z = 1 << 2,
+        MASK_W = 1 << 3,
+        MASK_XYZ = MASK_X | MASK_Y | MASK_Z,
+        MASK_XYZW = MASK_XYZ | MASK_W
+    };
+
+    // Empty default ctor
+    LLVector4Logical() {}
+
+    LLVector4Logical( const LLQuad& quad )
+    {
+        mQ = quad;
+    }
+
+    // Create and return a mask consisting of the lowest order bit of each element
+    inline U32 getGatheredBits() const
+    {
+        return _mm_movemask_ps(mQ);
+    }
+
+    // Invert this mask
+    inline LLVector4Logical& invert()
+    {
+        static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+        mQ = _mm_andnot_ps( mQ, *(LLQuad*)(allOnes) );
+        return *this;
+    }
+
+    inline LLBool32 areAllSet( U32 mask ) const
+    {
+        return ( getGatheredBits() & mask) == mask;
+    }
+
+    inline LLBool32 areAllSet() const
+    {
+        return areAllSet( MASK_XYZW );
+    }
+
+    inline LLBool32 areAnySet( U32 mask ) const
+    {
+        return getGatheredBits() & mask;
+    }
+
+    inline LLBool32 areAnySet() const
+    {
+        return areAnySet( MASK_XYZW );
+    }
+
+    inline operator LLQuad() const
+    {
+        return mQ;
+    }
+
+    inline void clear()
+    {
+        mQ = _mm_setzero_ps();
+    }
+
+    template<int N> void setElement()
+    {
+        mQ = _mm_or_ps( mQ, *reinterpret_cast<const LLQuad*>(S_V4LOGICAL_MASK_TABLE + 4*N) );
+    }
+
+private:
+
+    LLQuad mQ;
+};
+
+#endif //LL_VECTOR4LOGICAL_H
diff --git a/indra/llmath/llvolumeoctree.cpp b/indra/llmath/llvolumeoctree.cpp
new file mode 100644
index 0000000000..194b1faf81
--- /dev/null
+++ b/indra/llmath/llvolumeoctree.cpp
@@ -0,0 +1,208 @@
+/**
+ * @file llvolumeoctree.cpp
+ *
+ * $LicenseInfo:firstyear=2002&license=viewergpl$
+ *
+ * Copyright (c) 2002-2009, Linden Research, Inc.
+ *
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab. Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ *
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ *
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ *
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include "llvolumeoctree.h"
+#include "llvector4a.h"
+
+BOOL LLLineSegmentBoxIntersect(const LLVector4a& start, const LLVector4a& end, const LLVector4a& center, const LLVector4a& size)
+{
+    LLVector4a fAWdU;
+    LLVector4a dir;
+    LLVector4a diff;
+
+    dir.setSub(end, start);
+    dir.mul(0.5f);
+
+    diff.setAdd(end,start);
+    diff.mul(0.5f);
+    diff.sub(center);
+    fAWdU.setAbs(dir);
+
+    LLVector4a rhs;
+    rhs.setAdd(size, fAWdU);
+
+    LLVector4a lhs;
+    lhs.setAbs(diff);
+
+    U32 grt = lhs.greaterThan(rhs).getGatheredBits();
+
+    if (grt & 0x7)
+    {
+        return false;
+    }
+
+    LLVector4a f;
+    f.setCross3(dir, diff);
+    f.setAbs(f);
+
+    LLVector4a v0, v1;
+
+    v0 = _mm_shuffle_ps(size, size, _MM_SHUFFLE(3,0,0,1));
+    v1 = _mm_shuffle_ps(fAWdU, fAWdU, _MM_SHUFFLE(3,1,2,2));
+    lhs.setMul(v0, v1);
+
+    v0 = _mm_shuffle_ps(size, size, _MM_SHUFFLE(3,1,2,2));
+    v1 = _mm_shuffle_ps(fAWdU, fAWdU, _MM_SHUFFLE(3,0,0,1));
+    rhs.setMul(v0, v1);
+    rhs.add(lhs);
+
+    grt = f.greaterThan(rhs).getGatheredBits();
+
+    return (grt & 0x7) ? false : true;
+}
+
+
+LLVolumeOctreeListener::LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node)
+{
+    node->addListener(this);
+
+    mBounds = (LLVector4a*) ll_aligned_malloc_16(sizeof(LLVector4a)*4);
+    mExtents = mBounds+2;
+}
+
+LLVolumeOctreeListener::~LLVolumeOctreeListener()
+{
+    ll_aligned_free_16(mBounds);
+}
+
+void LLVolumeOctreeListener::handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent,
+    LLOctreeNode<LLVolumeTriangle>* child)
+{
+    new LLVolumeOctreeListener(child);
+}
+
+
+LLOctreeTriangleRayIntersect::LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir,
+        const LLVolumeFace* face, F32* closest_t,
+        LLVector3* intersection, LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal)
+    : mFace(face),
+      mStart(start),
+      mDir(dir),
+      mIntersection(intersection),
+      mTexCoord(tex_coord),
+      mNormal(normal),
+      mBinormal(bi_normal),
+      mClosestT(closest_t),
+      mHitFace(false)
+{
+    mEnd.setAdd(mStart, mDir);
+}
+
+void LLOctreeTriangleRayIntersect::traverse(const LLOctreeNode<LLVolumeTriangle>* node)
+{
+    LLVolumeOctreeListener* vl = (LLVolumeOctreeListener*) node->getListener(0);
+
+    /*const F32* start = mStart.getF32();
+    const F32* end = mEnd.getF32();
+    const F32* center = vl->mBounds[0].getF32();
+    const F32* size = vl->mBounds[1].getF32();*/
+
+    //if (LLLineSegmentBoxIntersect(mStart.getF32(), mEnd.getF32(), vl->mBounds[0].getF32(), vl->mBounds[1].getF32()))
+    if (LLLineSegmentBoxIntersect(mStart, mEnd, vl->mBounds[0], vl->mBounds[1]))
+    {
+        node->accept(this);
+        for (S32 i = 0; i < node->getChildCount(); ++i)
+        {
+            traverse(node->getChild(i));
+        }
+    }
+}
+
+void LLOctreeTriangleRayIntersect::visit(const LLOctreeNode<LLVolumeTriangle>* node)
+{
+    for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter =
+            node->getData().begin(); iter != node->getData().end(); ++iter)
+    {
+        const LLVolumeTriangle* tri = *iter;
+
+        // a, b are the barycentric coordinates of the hit within the triangle;
+        // t is the parameter of the hit along the segment mStart + t * mDir
+        F32 a, b, t;
+
+        if (LLTriangleRayIntersect(*tri->mV[0], *tri->mV[1], *tri->mV[2],
+                mStart, mDir, a, b, t))
+        {
+            if ((t >= 0.f) &&       // if hit is after start
+                (t <= 1.f) &&       // and before end
+                (t < *mClosestT))   // and this hit is closer
+            {
+                *mClosestT = t;
+                mHitFace = true;
+
+                if (mIntersection != NULL)
+                {
+                    LLVector4a intersect = mDir;
+                    intersect.mul(*mClosestT);
+                    intersect.add(mStart);
+                    mIntersection->set(intersect.getF32ptr());
+                }
+
+                if (mTexCoord != NULL)
+                {
+                    LLVector2* tc = (LLVector2*) mFace->mTexCoords;
+                    *mTexCoord = ((1.f - a - b) * tc[tri->mIndex[0]] +
+                        a * tc[tri->mIndex[1]] +
+                        b * tc[tri->mIndex[2]]);
+                }
+
+                if (mNormal != NULL)
+                {
+                    LLVector4* norm = (LLVector4*) mFace->mNormals;
+
+                    *mNormal = ((1.f - a - b) * LLVector3(norm[tri->mIndex[0]]) +
+                        a * LLVector3(norm[tri->mIndex[1]]) +
+                        b * LLVector3(norm[tri->mIndex[2]]));
+                }
+
+                if (mBinormal != NULL)
+                {
+                    LLVector4* binormal = (LLVector4*) mFace->mBinormals;
+                    *mBinormal = ((1.f - a - b) * LLVector3(binormal[tri->mIndex[0]]) +
+                        a * LLVector3(binormal[tri->mIndex[1]]) +
+                        b * LLVector3(binormal[tri->mIndex[2]]));
+                }
+            }
+        }
+    }
+}
+
+const LLVector4a& LLVolumeTriangle::getPositionGroup() const
+{
+    return *mPositionGroup;
+}
+
+const F32& LLVolumeTriangle::getBinRadius() const
+{
+    return mRadius;
+}
+
+
diff --git a/indra/llmath/llvolumeoctree.h b/indra/llmath/llvolumeoctree.h
new file mode 100644
index 0000000000..0031626498
--- /dev/null
+++ b/indra/llmath/llvolumeoctree.h
@@ -0,0 +1,138 @@
+/**
+ * @file llvolumeoctree.h
+ * @brief LLVolume octree classes.
+ *
+ * $LicenseInfo:firstyear=2002&license=viewergpl$
+ *
+ * Copyright (c) 2002-2009, Linden Research, Inc.
+ *
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab. Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ *
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ *
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ *
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_LLVOLUME_OCTREE_H
+#define LL_LLVOLUME_OCTREE_H
+
+#include "linden_common.h"
+#include "llmemory.h"
+
+#include "lloctree.h"
+#include "llvolume.h"
+#include "llvector4a.h"
+
+class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
+{
+public:
+
+    LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);
+    ~LLVolumeOctreeListener();
+
+    LLVolumeOctreeListener(const LLVolumeOctreeListener& rhs)
+    {
+        *this = rhs;
+    }
+
+    const LLVolumeOctreeListener& operator=(const LLVolumeOctreeListener& rhs)
+    {
+        llerrs << "Illegal operation!" << llendl;
+        return *this;
+    }
+
+    // LISTENER FUNCTIONS
+    virtual void handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent,
+        LLOctreeNode<LLVolumeTriangle>* child);
+    virtual void handleStateChange(const LLTreeNode<LLVolumeTriangle>* node) { }
+    virtual void handleChildRemoval(const LLOctreeNode<LLVolumeTriangle>* parent,
+        const LLOctreeNode<LLVolumeTriangle>* child) { }
+    virtual void handleInsertion(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
+    virtual void handleRemoval(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
+    virtual void handleDestruction(const LLTreeNode<LLVolumeTriangle>* node) { }
+
+
+public:
+    LLVector4a* mBounds;  // bounding box (center, size) of this node and all its children (tight fit to objects)
+    LLVector4a* mExtents; // extents (min, max) of this node and all its children
+};
+
+class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle>
+{
+public:
+    const LLVolumeFace* mFace;
+    LLVector4a mStart;
+    LLVector4a mDir;
+    LLVector4a mEnd;
+    LLVector3* mIntersection;
+    LLVector2* mTexCoord;
+    LLVector3* mNormal;
+    LLVector3* mBinormal;
+    F32* mClosestT;
+    bool mHitFace;
+
+    LLOctreeTriangleRayIntersect() { }
+
+    LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir,
+        const LLVolumeFace* face, F32* closest_t,
+        LLVector3* intersection, LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal);
+
+    void traverse(const LLOctreeNode<LLVolumeTriangle>* node);
+
+    virtual void visit(const LLOctreeNode<LLVolumeTriangle>* node);
+};
+
+class LLVolumeTriangle : public LLRefCount
+{
+public:
+    LLVolumeTriangle()
+    {
+        mPositionGroup = (LLVector4a*) ll_aligned_malloc_16(16);
+    }
+
+    LLVolumeTriangle(const LLVolumeTriangle& rhs)
+    {
+        *this = rhs;
+    }
+
+    const LLVolumeTriangle& operator=(const LLVolumeTriangle& rhs)
+    {
+        llerrs << "Illegal operation!" << llendl;
+        return *this;
+    }
+
+    ~LLVolumeTriangle()
+    {
+        ll_aligned_free_16(mPositionGroup);
+    }
+
+    const LLVector4a* mV[3];
+    U16 mIndex[3];
+
+    LLVector4a* mPositionGroup;
+
+    F32 mRadius;
+
+    virtual const LLVector4a& getPositionGroup() const;
+    virtual const F32& getBinRadius() const;
+};
+
+
+#endif
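+
+// Usage sketch (assumed caller code, not part of this header): intersecting a
+// ray segment with a face whose triangles are stored in an LLOctreeNode tree:
+//   F32 closest_t = 1.f;                     // track nearest hit in [0, 1]
+//   LLVector3 hit_pos;
+//   LLOctreeTriangleRayIntersect intersect(start, dir, face, &closest_t,
+//                                          &hit_pos, NULL, NULL, NULL);
+//   intersect.traverse(octree_root);         // prunes subtrees via the box/segment test
+//   if (intersect.mHitFace)
+//   {
+//       // hit_pos == start + closest_t * dir
+//   }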