diff options
author | Dave Parks <davep@lindenlab.com> | 2010-08-26 14:23:12 -0500 |
---|---|---|
committer | Dave Parks <davep@lindenlab.com> | 2010-08-26 14:23:12 -0500 |
commit | 71de5f622a7917f78823a7e7840194e1b0f8f070 (patch) | |
tree | 1bfa580e5262ae6a8271ade6aee08f21a019c688 /indra/llmath/llvector4a.cpp | |
parent | fc7a3f6daab94331aab52983caee5c68db8cd772 (diff) |
Add missing files from viewer-experimental
Diffstat (limited to 'indra/llmath/llvector4a.cpp')
-rw-r--r-- | indra/llmath/llvector4a.cpp | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp new file mode 100644 index 0000000000..b62c17302f --- /dev/null +++ b/indra/llmath/llvector4a.cpp @@ -0,0 +1,228 @@ +/** + * @file llvector4a.cpp + * @brief SIMD vector implementation + * + * $LicenseInfo:firstyear=2010&license=viewergpl$ + * + * Copyright (c) 2007-2010, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at + * http://secondlifegrid.net/programs/open_source/licensing/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include "llmath.h" +#include "llquantize.h" + +extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 }; +extern const LLQuad F_APPROXIMATELY_ZERO_4A = { + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO +}; + +extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A ); +extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A ); + +/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes) +{ + assert(src != NULL); + assert(dst != NULL); + assert(bytes > 0); + assert((bytes % sizeof(F32))== 0); + + F32* end = dst + (bytes / sizeof(F32) ); + + if (bytes > 64) + { + F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst); + + //at least 64 (16*4) bytes before the end of the destination, switch to 16 byte copies + F32* end_64 = end-16; + + _mm_prefetch((char*)begin_64, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA); + _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA); + + while (dst < begin_64) + { + copy4a(dst, src); + dst += 4; + src += 4; + } + + while (dst < end_64) + { + _mm_prefetch((char*)src + 512, _MM_HINT_NTA); + _mm_prefetch((char*)dst + 512, _MM_HINT_NTA); + copy4a(dst, src); + copy4a(dst+4, src+4); + copy4a(dst+8, src+8); + copy4a(dst+12, src+12); + + dst += 16; + src += 16; + } + } + + while (dst < end) + { + copy4a(dst, src); + dst += 4; + src += 4; + } +} + +void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec ) +{ + const LLVector4a col0 = rot.getColumn(0); + const LLVector4a col1 = rot.getColumn(1); + const LLVector4a col2 = rot.getColumn(2); + + LLVector4a result = _mm_load_ss( vec.getF32ptr() ); + result.splat<0>( result ); + result.mul( col0 ); + + { + LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 ); + yyyy.splat<0>( yyyy ); + yyyy.mul( col1 ); + result.add( yyyy ); + } + + { + LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 ); + zzzz.splat<0>( zzzz ); + zzzz.mul( col2 ); + result.add( zzzz ); + } + + *this = result; +} + +void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec ) +{ + const LLVector4a& quatVec = quat.getVector4a(); + LLVector4a temp; temp.setCross3(quatVec, vec); + temp.add( temp ); + + const LLVector4a realPart( quatVec.getScalarAt<3>() ); + LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart ); + + mQ = vec; + add( tempTimesReal ); + + LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp ); + add(imagCrossTemp); +} + +void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high ) +{ + LLVector4a val(mQ); + LLVector4a delta; delta.setSub( high, low ); + + { + val.clamp(low, high); + val.sub(low); + + // 8-bit quantization means we can do with just 12 bits of reciprocal accuracy + const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ); +// { +// static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; +// LLVector4a two; two.load4a( F_TWO_4A ); +// +// // Here we use _mm_rcp_ps plus one round of newton-raphson +// // We wish to find 'x' such that x = 1/delta +// // As a first approximation, we take x0 = _mm_rcp_ps(delta) +// // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) +// // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf +// const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); +// oneOverDelta.setMul( delta, recipApprox ); +// oneOverDelta.setSub( two, oneOverDelta ); +// oneOverDelta.mul( recipApprox ); +// } + + val.mul(oneOverDelta); + val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A)); + } + + val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + + { + val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); + val.mul(delta); + val.add(low); + } + + { + LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); + LLVector4a absVal; absVal.setAbs( val ); + setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); + } +} + +void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high ) +{ + LLVector4a val(mQ); + LLVector4a delta; delta.setSub( high, low ); + + { + val.clamp(low, high); + val.sub(low); + + // 16-bit quantization means we need a round of Newton-Raphson + LLVector4a oneOverDelta; + { + static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; + LLVector4a two; two.load4a( F_TWO_4A ); + + // Here we use _mm_rcp_ps plus one round of newton-raphson + // We wish to find 'x' such that x = 1/delta + // As a first approximation, we take x0 = _mm_rcp_ps(delta) + // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) + // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf + const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); + oneOverDelta.setMul( delta, recipApprox ); + oneOverDelta.setSub( two, oneOverDelta ); + oneOverDelta.mul( recipApprox ); + } + + val.mul(oneOverDelta); + val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A)); + } + + val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + + { + val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); + val.mul(delta); + val.add(low); + } + + { + LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); + LLVector4a absVal; absVal.setAbs( val ); + setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); + } +} |