author    Dave Parks <davep@lindenlab.com>    2010-08-26 14:23:12 -0500
committer Dave Parks <davep@lindenlab.com>    2010-08-26 14:23:12 -0500
commit    71de5f622a7917f78823a7e7840194e1b0f8f070 (patch)
tree      1bfa580e5262ae6a8271ade6aee08f21a019c688 /indra/llmath/llvector4a.cpp
parent    fc7a3f6daab94331aab52983caee5c68db8cd772 (diff)
Add missing files from viewer-experimental
Diffstat (limited to 'indra/llmath/llvector4a.cpp')
-rw-r--r--  indra/llmath/llvector4a.cpp | 228
1 files changed, 228 insertions, 0 deletions
diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp
new file mode 100644
index 0000000000..b62c17302f
--- /dev/null
+++ b/indra/llmath/llvector4a.cpp
@@ -0,0 +1,228 @@
+/**
+ * @file llvector4a.cpp
+ * @brief SIMD vector implementation
+ *
+ * $LicenseInfo:firstyear=2010&license=viewergpl$
+ *
+ * Copyright (c) 2007-2010, Linden Research, Inc.
+ *
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab. Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
+ *
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at
+ * http://secondlifegrid.net/programs/open_source/licensing/flossexception
+ *
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ *
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include "llmath.h"
+#include "llquantize.h"
+
+extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 };
+extern const LLQuad F_APPROXIMATELY_ZERO_4A = {
+    F_APPROXIMATELY_ZERO,
+    F_APPROXIMATELY_ZERO,
+    F_APPROXIMATELY_ZERO,
+    F_APPROXIMATELY_ZERO
+};
+
+extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A );
+extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A );
+
+/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes)
+{
+    assert(src != NULL);
+    assert(dst != NULL);
+    assert(bytes > 0);
+    assert((bytes % sizeof(F32)) == 0);
+
+    F32* end = dst + (bytes / sizeof(F32));
+
+    if (bytes > 64)
+    {
+        F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
+
+        // stop the unrolled 64-byte copies 16 floats (64 bytes) short of the
+        // end, then fall through to 16-byte copies for the remainder
+        F32* end_64 = end-16;
+
+        _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
+        _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
+
+        // copy 16 bytes at a time until dst reaches 64-byte alignment
+        while (dst < begin_64)
+        {
+            copy4a(dst, src);
+            dst += 4;
+            src += 4;
+        }
+
+        // main loop: copy 64 bytes per iteration, prefetching well ahead
+        while (dst < end_64)
+        {
+            _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
+            _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
+            copy4a(dst, src);
+            copy4a(dst+4, src+4);
+            copy4a(dst+8, src+8);
+            copy4a(dst+12, src+12);
+
+            dst += 16;
+            src += 16;
+        }
+    }
+
+    // copy any remaining 16-byte blocks
+    while (dst < end)
+    {
+        copy4a(dst, src);
+        dst += 4;
+        src += 4;
+    }
+}
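+
+// A minimal usage sketch (illustrative, not from the original file): both
+// pointers must be 16-byte aligned and must not alias, and since the loops
+// consume four F32s at a time the byte count should in practice be a
+// multiple of 16. ll_aligned_malloc_16/ll_aligned_free_16 are assumed to be
+// the viewer's aligned-allocation helpers from llmemory.h.
+#if 0
+static void exampleMemcpyNonAliased16()
+{
+    const size_t count = 1024; // F32 elements; 4096 bytes, a multiple of 16
+    F32* src = (F32*) ll_aligned_malloc_16(count * sizeof(F32));
+    F32* dst = (F32*) ll_aligned_malloc_16(count * sizeof(F32));
+    for (size_t i = 0; i < count; ++i)
+    {
+        src[i] = (F32) i;
+    }
+    LLVector4a::memcpyNonAliased16(dst, src, count * sizeof(F32));
+    ll_aligned_free_16(dst);
+    ll_aligned_free_16(src);
+}
+#endif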
+
+void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec )
+{
+    const LLVector4a col0 = rot.getColumn(0);
+    const LLVector4a col1 = rot.getColumn(1);
+    const LLVector4a col2 = rot.getColumn(2);
+
+    // broadcast vec.x across all lanes and scale column 0 by it
+    LLVector4a result = _mm_load_ss( vec.getF32ptr() );
+    result.splat<0>( result );
+    result.mul( col0 );
+
+    {
+        // accumulate vec.y * column 1
+        LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 );
+        yyyy.splat<0>( yyyy );
+        yyyy.mul( col1 );
+        result.add( yyyy );
+    }
+
+    {
+        // accumulate vec.z * column 2
+        LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 );
+        zzzz.splat<0>( zzzz );
+        zzzz.mul( col2 );
+        result.add( zzzz );
+    }
+
+    *this = result;
+}
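+
+// A minimal scalar sketch (illustrative, not from the original file) of the
+// math above: the splat/mul/add sequence is a column-weighted matrix*vector
+// product, result = x*col0 + y*col1 + z*col2.
+#if 0
+static void rotateScalarSketch(const F32 col0[3], const F32 col1[3],
+                               const F32 col2[3], const F32 v[3], F32 out[3])
+{
+    for (int i = 0; i < 3; ++i)
+    {
+        out[i] = v[0]*col0[i] + v[1]*col1[i] + v[2]*col2[i];
+    }
+}
+#endif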
+
+void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec )
+{
+    // temp = 2 * cross(q_imaginary, vec)
+    const LLVector4a& quatVec = quat.getVector4a();
+    LLVector4a temp; temp.setCross3(quatVec, vec);
+    temp.add( temp );
+
+    // this = vec + q_real * temp
+    const LLVector4a realPart( quatVec.getScalarAt<3>() );
+    LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart );
+
+    mQ = vec;
+    add( tempTimesReal );
+
+    // this += cross(q_imaginary, temp)
+    LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp );
+    add(imagCrossTemp);
+}
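+
+// A minimal scalar sketch (illustrative, not from the original file) of the
+// rotation shortcut above, for a unit quaternion q = (qv, w) applied to v:
+//   t  = 2 * (qv x v)
+//   v' = v + w*t + (qv x t)
+#if 0
+static void rotateByQuatScalarSketch(const F32 qv[3], F32 w,
+                                     const F32 v[3], F32 out[3])
+{
+    const F32 t[3] = { // t = 2 * cross(qv, v)
+        2.f * (qv[1]*v[2] - qv[2]*v[1]),
+        2.f * (qv[2]*v[0] - qv[0]*v[2]),
+        2.f * (qv[0]*v[1] - qv[1]*v[0]) };
+    out[0] = v[0] + w*t[0] + (qv[1]*t[2] - qv[2]*t[1]);
+    out[1] = v[1] + w*t[1] + (qv[2]*t[0] - qv[0]*t[2]);
+    out[2] = v[2] + w*t[2] + (qv[0]*t[1] - qv[1]*t[0]);
+}
+#endif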
+
+void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high )
+{
+    LLVector4a val(mQ);
+    LLVector4a delta; delta.setSub( high, low );
+
+    {
+        val.clamp(low, high);
+        val.sub(low);
+
+        // 8-bit quantization means we can make do with just 12 bits of reciprocal accuracy
+        const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ);
+//      {
+//          static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
+//          LLVector4a two; two.load4a( F_TWO_4A );
+//
+//          // Here we use _mm_rcp_ps plus one round of newton-raphson
+//          // We wish to find 'x' such that x = 1/delta
+//          // As a first approximation, we take x0 = _mm_rcp_ps(delta)
+//          // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
+//          // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
+//          const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
+//          oneOverDelta.setMul( delta, recipApprox );
+//          oneOverDelta.setSub( two, oneOverDelta );
+//          oneOverDelta.mul( recipApprox );
+//      }
+
+        // scale [0, delta] into [0, 255]
+        val.mul(oneOverDelta);
+        val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A));
+    }
+
+    // round to the nearest integer step
+    val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
+
+    {
+        // dequantize back into [low, high]
+        val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
+        val.mul(delta);
+        val.add(low);
+    }
+
+    {
+        // snap anything smaller than one quantization step to zero
+        LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
+        LLVector4a absVal; absVal.setAbs( val );
+        setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
+    }
+}
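+
+// A minimal per-component scalar sketch (illustrative, not from the original
+// file) of the 8-bit quantize/dequantize round trip above, assuming llclamp
+// and llround from llmath.h:
+#if 0
+static F32 quantize8ScalarSketch(F32 v, F32 lo, F32 hi)
+{
+    const F32 delta = hi - lo;
+    v = llclamp(v, lo, hi);
+    const F32 q = llround((v - lo) / delta * 255.f); // nearest integer step
+    const F32 r = lo + q / 255.f * delta;            // dequantize
+    // snap results below one quantization step to zero
+    return (fabsf(r) < delta / 255.f) ? 0.f : r;
+}
+#endif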
+
+void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
+{
+    LLVector4a val(mQ);
+    LLVector4a delta; delta.setSub( high, low );
+
+    {
+        val.clamp(low, high);
+        val.sub(low);
+
+        // 16-bit quantization means we need a round of Newton-Raphson
+        LLVector4a oneOverDelta;
+        {
+            static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
+            LLVector4a two; two.load4a( F_TWO_4A );
+
+            // Here we use _mm_rcp_ps plus one round of newton-raphson
+            // We wish to find 'x' such that x = 1/delta
+            // As a first approximation, we take x0 = _mm_rcp_ps(delta)
+            // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
+            // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
+            const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
+            oneOverDelta.setMul( delta, recipApprox );
+            oneOverDelta.setSub( two, oneOverDelta );
+            oneOverDelta.mul( recipApprox );
+        }
+
+        // scale [0, delta] into [0, 65535]
+        val.mul(oneOverDelta);
+        val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A));
+    }
+
+    // round to the nearest integer step
+    val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
+
+    {
+        // dequantize back into [low, high]
+        val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
+        val.mul(delta);
+        val.add(low);
+    }
+
+    {
+        // snap anything smaller than one quantization step to zero
+        LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
+        LLVector4a absVal; absVal.setAbs( val );
+        setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
+    }
+}
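+
+// A minimal scalar sketch (illustrative, not from the original file) of the
+// x1 = x0 * (2 - a*x0) refinement used above: _mm_rcp_ss gives a ~12-bit
+// reciprocal estimate, and one Newton-Raphson step roughly doubles that
+// accuracy, which is what 16-bit quantization needs. The SSE intrinsics are
+// already available via the headers this file includes.
+#if 0
+static float refinedRecipSketch(float a)
+{
+    const float x0 = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(a))); // ~12-bit x0
+    return x0 * (2.f - a * x0); // one Newton-Raphson step toward 1/a
+}
+#endif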