diff options
Diffstat (limited to 'indra/llmath/llmatrix3a.cpp')
-rw-r--r-- | indra/llmath/llmatrix3a.cpp | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/indra/llmath/llmatrix3a.cpp b/indra/llmath/llmatrix3a.cpp new file mode 100644 index 0000000000..b7468f4914 --- /dev/null +++ b/indra/llmath/llmatrix3a.cpp @@ -0,0 +1,140 @@ +/** + * @file llvector4a.cpp + * @brief SIMD vector implementation + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmath.h" + +static LL_ALIGN_16(const F32 M_IDENT_3A[12]) = + { 1.f, 0.f, 0.f, 0.f, // Column 1 + 0.f, 1.f, 0.f, 0.f, // Column 2 + 0.f, 0.f, 1.f, 0.f }; // Column 3 + +extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*> (M_IDENT_3A); + +void LLMatrix3a::setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs ) +{ + const LLVector4a col0 = lhs.getColumn(0); + const LLVector4a col1 = lhs.getColumn(1); + const LLVector4a col2 = lhs.getColumn(2); + + for ( int i = 0; i < 3; i++ ) + { + LLVector4a xxxx = _mm_load_ss( rhs.mColumns[i].getF32ptr() ); + xxxx.splat<0>( xxxx ); + xxxx.mul( col0 ); + + { + LLVector4a yyyy = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 1 ); + yyyy.splat<0>( yyyy ); + yyyy.mul( col1 ); + xxxx.add( yyyy ); + } + + { + LLVector4a zzzz = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 2 ); + zzzz.splat<0>( zzzz ); + zzzz.mul( col2 ); + xxxx.add( zzzz ); + } + + xxxx.store4a( mColumns[i].getF32ptr() ); + } + +} + +/*static */void LLMatrix3a::batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst ) +{ + const LLVector4a col0 = xform.getColumn(0); + const LLVector4a col1 = xform.getColumn(1); + const LLVector4a col2 = xform.getColumn(2); + const LLVector4a* maxAddr = src + numVectors; + + if ( numVectors & 0x1 ) + { + LLVector4a xxxx = _mm_load_ss( (const F32*)src ); + LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 ); + LLVector4a zzzz = _mm_load_ss( (const F32*)src + 2 ); + xxxx.splat<0>( xxxx ); + yyyy.splat<0>( yyyy ); + zzzz.splat<0>( zzzz ); + xxxx.mul( col0 ); + yyyy.mul( col1 ); + zzzz.mul( col2 ); + xxxx.add( yyyy ); + xxxx.add( zzzz ); + xxxx.store4a( (F32*)dst ); + src++; + dst++; + } + + + numVectors >>= 1; + while ( src < maxAddr ) + { + _mm_prefetch( (const char*)(src + 32 ), _MM_HINT_NTA ); + _mm_prefetch( (const char*)(dst + 32), _MM_HINT_NTA ); + LLVector4a xxxx = _mm_load_ss( (const F32*)src ); + LLVector4a xxxx1= _mm_load_ss( (const F32*)(src + 1) ); + + xxxx.splat<0>( xxxx ); + xxxx1.splat<0>( xxxx1 ); + xxxx.mul( col0 ); + xxxx1.mul( col0 ); + + { + LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 ); + LLVector4a yyyy1 = _mm_load_ss( (const F32*)(src + 1) + 1); + yyyy.splat<0>( yyyy ); + yyyy1.splat<0>( yyyy1 ); + yyyy.mul( col1 ); + yyyy1.mul( col1 ); + xxxx.add( yyyy ); + xxxx1.add( yyyy1 ); + } + + { + LLVector4a zzzz = _mm_load_ss( (const F32*)(src) + 2 ); + LLVector4a zzzz1 = _mm_load_ss( (const F32*)(++src) + 2 ); + zzzz.splat<0>( zzzz ); + zzzz1.splat<0>( zzzz1 ); + zzzz.mul( col2 ); + zzzz1.mul( col2 ); + xxxx.add( zzzz ); + xxxx1.add( zzzz1 ); + } + + xxxx.store4a(dst->getF32ptr()); + src++; + dst++; + + xxxx1.store4a((F32*)dst++); + } +} |