summary refs log tree commit diff
path: root/indra/llmath/llvector4a.inl
diff options
context:
space:
mode:
author: Ansariel <ansariel.hiller@phoenixviewer.com> 2024-05-22 19:04:52 +0200
committer: Ansariel <ansariel.hiller@phoenixviewer.com> 2024-05-22 19:04:52 +0200
commit: 1b67dd855c41f5a0cda7ec2a68d98071986ca703 (patch)
tree: ab243607f74f78200787bba5b9b88f07ef1b966f /indra/llmath/llvector4a.inl
parent: 6d6eabca44d08d5b97bfe3e941d2b9687c2246ea (diff)
parent: e1623bb276f83a43ce7a197e388720c05bdefe61 (diff)
Merge remote-tracking branch 'origin/main' into DRTVWR-600-maint-A
# Conflicts: # autobuild.xml # indra/cmake/CMakeLists.txt # indra/cmake/GoogleMock.cmake # indra/llaudio/llaudioengine_fmodstudio.cpp # indra/llaudio/llaudioengine_fmodstudio.h # indra/llaudio/lllistener_fmodstudio.cpp # indra/llaudio/lllistener_fmodstudio.h # indra/llaudio/llstreamingaudio_fmodstudio.cpp # indra/llaudio/llstreamingaudio_fmodstudio.h # indra/llcharacter/llmultigesture.cpp # indra/llcharacter/llmultigesture.h # indra/llimage/llimage.cpp # indra/llimage/llimagepng.cpp # indra/llimage/llimageworker.cpp # indra/llimage/tests/llimageworker_test.cpp # indra/llmessage/tests/llmockhttpclient.h # indra/llprimitive/llgltfmaterial.h # indra/llrender/llfontfreetype.cpp # indra/llui/llcombobox.cpp # indra/llui/llfolderview.cpp # indra/llui/llfolderviewmodel.h # indra/llui/lllineeditor.cpp # indra/llui/lllineeditor.h # indra/llui/lltextbase.cpp # indra/llui/lltextbase.h # indra/llui/lltexteditor.cpp # indra/llui/lltextvalidate.cpp # indra/llui/lltextvalidate.h # indra/llui/lluictrl.h # indra/llui/llview.cpp # indra/llwindow/llwindowmacosx.cpp # indra/newview/app_settings/settings.xml # indra/newview/llappearancemgr.cpp # indra/newview/llappearancemgr.h # indra/newview/llavatarpropertiesprocessor.cpp # indra/newview/llavatarpropertiesprocessor.h # indra/newview/llbreadcrumbview.cpp # indra/newview/llbreadcrumbview.h # indra/newview/llbreastmotion.cpp # indra/newview/llbreastmotion.h # indra/newview/llconversationmodel.h # indra/newview/lldensityctrl.cpp # indra/newview/lldensityctrl.h # indra/newview/llface.inl # indra/newview/llfloatereditsky.cpp # indra/newview/llfloatereditwater.cpp # indra/newview/llfloateremojipicker.h # indra/newview/llfloaterimsessiontab.cpp # indra/newview/llfloaterprofiletexture.cpp # indra/newview/llfloaterprofiletexture.h # indra/newview/llgesturemgr.cpp # indra/newview/llgesturemgr.h # indra/newview/llimpanel.cpp # indra/newview/llimpanel.h # indra/newview/llinventorybridge.cpp # indra/newview/llinventorybridge.h # 
indra/newview/llinventoryclipboard.cpp # indra/newview/llinventoryclipboard.h # indra/newview/llinventoryfunctions.cpp # indra/newview/llinventoryfunctions.h # indra/newview/llinventorygallery.cpp # indra/newview/lllistbrowser.cpp # indra/newview/lllistbrowser.h # indra/newview/llpanelobjectinventory.cpp # indra/newview/llpanelprofile.cpp # indra/newview/llpanelprofile.h # indra/newview/llpreviewgesture.cpp # indra/newview/llsavedsettingsglue.cpp # indra/newview/llsavedsettingsglue.h # indra/newview/lltooldraganddrop.cpp # indra/newview/llurllineeditorctrl.cpp # indra/newview/llvectorperfoptions.cpp # indra/newview/llvectorperfoptions.h # indra/newview/llviewerparceloverlay.cpp # indra/newview/llviewertexlayer.cpp # indra/newview/llviewertexturelist.cpp # indra/newview/macmain.h # indra/test/test.cpp
Diffstat (limited to 'indra/llmath/llvector4a.inl')
-rw-r--r-- indra/llmath/llvector4a.inl 532
1 files changed, 266 insertions, 266 deletions
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 8be1c1b114..36dbec078c 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -1,25 +1,25 @@
-/**
+/**
* @file llvector4a.inl
* @brief LLVector4a inline function implementations
*
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
* Second Life Viewer Source Code
* Copyright (C) 2010, Linden Research, Inc.
- *
+ *
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation;
* version 2.1 of the License only.
- *
+ *
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
+ *
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
+ *
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
* $/LicenseInfo$
*/
@@ -31,138 +31,138 @@
// Load from 16-byte aligned src array (preferred method of loading)
inline void LLVector4a::load4a(const F32* src)
{
- mQ = _mm_load_ps(src);
+ mQ = _mm_load_ps(src);
}
// Load from unaligned src array (NB: Significantly slower than load4a)
inline void LLVector4a::loadua(const F32* src)
{
- mQ = _mm_loadu_ps(src);
+ mQ = _mm_loadu_ps(src);
}
// Load only three floats beginning at address 'src'. Slowest method.
inline void LLVector4a::load3(const F32* src)
{
- // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
- // NB: This differs from the convention of { Z, Y, X, W }
- mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
-}
+ // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
+ // NB: This differs from the convention of { Z, Y, X, W }
+ mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
+}
// Store to a 16-byte aligned memory address
inline void LLVector4a::store4a(F32* dst) const
{
- _mm_store_ps(dst, mQ);
+ _mm_store_ps(dst, mQ);
}
////////////////////////////////////
-// BASIC GET/SET
+// BASIC GET/SET
////////////////////////////////////
// Return "this" as an F32 pointer.
F32* LLVector4a::getF32ptr()
{
- return (F32*) &mQ;
+ return (F32*) &mQ;
}
// Return "this" as a const F32 pointer.
const F32* const LLVector4a::getF32ptr() const
{
- return (const F32* const) &mQ;
+ return (const F32* const) &mQ;
}
// Read-only access to a single float in this vector. Do not use in proximity to any function call that manipulates
// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
inline F32 LLVector4a::operator[](const S32 idx) const
{
- return ((F32*)&mQ)[idx];
-}
+ return ((F32*)&mQ)[idx];
+}
// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const
{
- // Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
- switch (idx)
- {
- case 0:
- return mQ;
- case 1:
- return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
- case 2:
- return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
- case 3:
- default:
- return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
- }
+ // Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
+ switch (idx)
+ {
+ case 0:
+ return mQ;
+ case 1:
+ return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
+ case 2:
+ return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
+ case 3:
+ default:
+ return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
+ }
}
// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const
{
- return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
+ return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
}
template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const
{
- return mQ;
+ return mQ;
}
// Set to an x, y, z and optional w provided
inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
{
- mQ = _mm_set_ps(w, z, y, x);
+ mQ = _mm_set_ps(w, z, y, x);
}
// Set to all zeros
inline void LLVector4a::clear()
{
- mQ = LLVector4a::getZero().mQ;
+ mQ = LLVector4a::getZero().mQ;
}
inline void LLVector4a::splat(const F32 x)
{
- mQ = _mm_set1_ps(x);
+ mQ = _mm_set1_ps(x);
}
inline void LLVector4a::splat(const LLSimdScalar& x)
{
- mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) );
+ mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) );
}
// Set all 4 elements to element N of src, with N known at compile time
template <int N> void LLVector4a::splat(const LLVector4a& src)
{
- mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) );
+ mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) );
}
// Set all 4 elements to element i of v, with i NOT known at compile time
inline void LLVector4a::splat(const LLVector4a& v, U32 i)
{
- switch (i)
- {
- case 0:
- mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
- break;
- case 1:
- mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
- break;
- case 2:
- mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
- break;
- case 3:
- mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
- break;
- }
+ switch (i)
+ {
+ case 0:
+ mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
+ break;
+ case 1:
+ mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
+ break;
+ case 2:
+ mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
+ break;
+ case 3:
+ mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
+ break;
+ }
}
// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask
inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse )
{
- // ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse )
- // E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b
- // (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b,
- // as expected (01 from sourceIfTrue, 10 from sourceIfFalse)
- // Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
- mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) );
+ // ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse )
+ // E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b
+ // (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b,
+ // as expected (01 from sourceIfTrue, 10 from sourceIfFalse)
+ // Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
+ mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) );
}
////////////////////////////////////
@@ -172,84 +172,84 @@ inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const L
// Set this to the element-wise (a + b)
inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b)
{
- mQ = _mm_add_ps(a.mQ, b.mQ);
+ mQ = _mm_add_ps(a.mQ, b.mQ);
}
// Set this to element-wise (a - b)
inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b)
{
- mQ = _mm_sub_ps(a.mQ, b.mQ);
+ mQ = _mm_sub_ps(a.mQ, b.mQ);
}
// Set this to element-wise multiply (a * b)
inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b)
{
- mQ = _mm_mul_ps(a.mQ, b.mQ);
+ mQ = _mm_mul_ps(a.mQ, b.mQ);
}
// Set this to element-wise quotient (a / b)
inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b)
{
- mQ = _mm_div_ps( a.mQ, b.mQ );
+ mQ = _mm_div_ps( a.mQ, b.mQ );
}
// Set this to the element-wise absolute value of src
inline void LLVector4a::setAbs(const LLVector4a& src)
{
- static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
- mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
+ static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
+ mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
}
// Add to each component in this vector the corresponding component in rhs
inline void LLVector4a::add(const LLVector4a& rhs)
{
- mQ = _mm_add_ps(mQ, rhs.mQ);
+ mQ = _mm_add_ps(mQ, rhs.mQ);
}
// Subtract from each component in this vector the corresponding component in rhs
inline void LLVector4a::sub(const LLVector4a& rhs)
{
- mQ = _mm_sub_ps(mQ, rhs.mQ);
+ mQ = _mm_sub_ps(mQ, rhs.mQ);
}
// Multiply each component in this vector by the corresponding component in rhs
inline void LLVector4a::mul(const LLVector4a& rhs)
{
- mQ = _mm_mul_ps(mQ, rhs.mQ);
+ mQ = _mm_mul_ps(mQ, rhs.mQ);
}
// Divide each component in this vector by the corresponding component in rhs
inline void LLVector4a::div(const LLVector4a& rhs)
{
- // TODO: Check accuracy, maybe add divFast
- mQ = _mm_div_ps(mQ, rhs.mQ);
+ // TODO: Check accuracy, maybe add divFast
+ mQ = _mm_div_ps(mQ, rhs.mQ);
}
// Multiply this vector by x in a scalar fashion
-inline void LLVector4a::mul(const F32 x)
+inline void LLVector4a::mul(const F32 x)
{
- LLVector4a t;
- t.splat(x);
-
- mQ = _mm_mul_ps(mQ, t.mQ);
+ LLVector4a t;
+ t.splat(x);
+
+ mQ = _mm_mul_ps(mQ, t.mQ);
}
// Set this to (a x b) (geometric cross-product)
inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
{
- // Vectors are stored in memory in w, z, y, x order from high to low
- // Set vector1 = { a[W], a[X], a[Z], a[Y] }
- const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
- // Set vector2 = { b[W], b[Y], b[X], b[Z] }
- const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
- // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
- mQ = _mm_mul_ps( vector1, vector2 );
- // vector3 = { a[W], a[Y], a[X], a[Z] }
- const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
- // vector4 = { b[W], b[X], b[Z], b[Y] }
- const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
- // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
- mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
+ // Vectors are stored in memory in w, z, y, x order from high to low
+ // Set vector1 = { a[W], a[X], a[Z], a[Y] }
+ const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+ // Set vector2 = { b[W], b[Y], b[X], b[Z] }
+ const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+ // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
+ mQ = _mm_mul_ps( vector1, vector2 );
+ // vector3 = { a[W], a[Y], a[X], a[Z] }
+ const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
+ // vector4 = { b[W], b[X], b[Z], b[Y] }
+ const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
+ // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
+ mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
}
/* This function works, but may be slightly slower than the one below on older machines
@@ -261,7 +261,7 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
- // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+ // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
@@ -272,267 +272,267 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
// Set all elements to the dot product of the x, y, and z elements in a and b
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
{
- // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
- const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
- // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
- const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
- // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
- const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
- // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
- const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
- // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
- const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
- // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
- mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
+ // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+ const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+ // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
+ const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
+ // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+ const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
+ // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+ const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
+ // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
+ const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
+ // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+ mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
}
// Set all elements to the dot product of the x, y, z, and w elements in a and b
inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
{
- // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
- const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
- // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
- const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
- // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
- const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
- // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
- const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
- const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
+ // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
+ const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
+ // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
+ const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
+ // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+ const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
+ // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
+ const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
+ const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
- // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
- mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
+ // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
+ mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
}
// Return the 3D dot product of this vector and b
inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
{
- const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
- const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
- const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
- const LLQuad xPlusY = _mm_add_ps( ab, splatY );
- return _mm_add_ps( xPlusY, splatZ );
+ const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
+ const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
+ const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
+ const LLQuad xPlusY = _mm_add_ps( ab, splatY );
+ return _mm_add_ps( xPlusY, splatZ );
}
// Return the 4D dot product of this vector and b
inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
{
- // ab = { w, z, y, x }
- const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
- // upperProdsInLowerElems = { y, x, y, x }
- const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
- // sumOfPairs = { w+y, z+x, 2y, 2x }
- const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
- // shuffled = { z+x, z+x, z+x, z+x }
- const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
- return _mm_add_ss( sumOfPairs, shuffled );
+ // ab = { w, z, y, x }
+ const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
+ // upperProdsInLowerElems = { y, x, y, x }
+ const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
+ // sumOfPairs = { w+y, z+x, 2y, 2x }
+ const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
+ // shuffled = { z+x, z+x, z+x, z+x }
+ const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
+ return _mm_add_ss( sumOfPairs, shuffled );
}
// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bits of precision. W component is destroyed
// Note that this does not consider zero length vectors!
inline void LLVector4a::normalize3()
{
- // lenSqrd = a dot a
- LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
- // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
- const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
- static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
- static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
- // Now we do one round of Newton-Raphson approximation to get full accuracy
- // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
- // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
- // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
- // = 0.5 * w * (3 - a*w^2)
- // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
- // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
- const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
- const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
- const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
- const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
- mQ = _mm_mul_ps( mQ, nrApprox );
+ // lenSqrd = a dot a
+ LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+ // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
+ const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+ static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+ static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+ // Now we do one round of Newton-Raphson approximation to get full accuracy
+ // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
+ // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+ // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+ // = 0.5 * w * (3 - a*w^2)
+ // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
+ // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+ const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+ const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+ const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+ const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+ mQ = _mm_mul_ps( mQ, nrApprox );
}
// Normalize this vector with respect to all components. Accurate to 22 bits of precision.
// Note that this does not consider zero length vectors!
inline void LLVector4a::normalize4()
{
- // lenSqrd = a dot a
- LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
- // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
- const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
- static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
- static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
- // Now we do one round of Newton-Raphson approximation to get full accuracy
- // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
- // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
- // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
- // = 0.5 * w * (3 - a*w^2)
- // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
- // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
- const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
- const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
- const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
- const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
- mQ = _mm_mul_ps( mQ, nrApprox );
+ // lenSqrd = a dot a
+ LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
+ // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
+ const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+ static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+ static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+ // Now we do one round of Newton-Raphson approximation to get full accuracy
+ // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
+ // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+ // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+ // = 0.5 * w * (3 - a*w^2)
+ // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
+ // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+ const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+ const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+ const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+ const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+ mQ = _mm_mul_ps( mQ, nrApprox );
}
// Normalize this vector with respect to the x, y, and z components only and return the length it had before normalization. Accurate to 22 bits of precision. W component is destroyed
// Note that this does not consider zero length vectors!
inline LLSimdScalar LLVector4a::normalize3withLength()
{
- // lenSqrd = a dot a
- LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
- // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
- const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
- static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
- static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
- // Now we do one round of Newton-Raphson approximation to get full accuracy
- // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
- // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
- // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
- // = 0.5 * w * (3 - a*w^2)
- // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
- // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
- const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
- const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
- const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
- const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
- mQ = _mm_mul_ps( mQ, nrApprox );
- return _mm_sqrt_ss(lenSqrd);
+ // lenSqrd = a dot a
+ LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+ // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
+ const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+ static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
+ static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+ // Now we do one round of Newton-Raphson approximation to get full accuracy
+ // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
+ // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
+ // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
+ // = 0.5 * w * (3 - a*w^2)
+ // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
+ // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
+ const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
+ const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
+ const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
+ const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
+ mQ = _mm_mul_ps( mQ, nrApprox );
+ return _mm_sqrt_ss(lenSqrd);
}
// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
// Note that this does not consider zero length vectors!
inline void LLVector4a::normalize3fast()
{
- LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
- const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
- mQ = _mm_mul_ps( mQ, approxRsqrt );
+ LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+ const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+ mQ = _mm_mul_ps( mQ, approxRsqrt );
}
inline void LLVector4a::normalize3fast_checked(LLVector4a* d)
{
- if (!isFinite3())
- {
- *this = d ? *d : LLVector4a(0,1,0,1);
- return;
- }
+ if (!isFinite3())
+ {
+ *this = d ? *d : LLVector4a(0,1,0,1);
+ return;
+ }
- LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
+ LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
- if (lenSqrd.getF32ptr()[0] <= FLT_EPSILON)
- {
- *this = d ? *d : LLVector4a(0,1,0,1);
- return;
- }
+ if (lenSqrd.getF32ptr()[0] <= FLT_EPSILON)
+ {
+ *this = d ? *d : LLVector4a(0,1,0,1);
+ return;
+ }
- const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
- mQ = _mm_mul_ps( mQ, approxRsqrt );
+ const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+ mQ = _mm_mul_ps( mQ, approxRsqrt );
}
// Return true if this vector is normalized with respect to x,y,z up to tolerance
inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
{
- static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
- LLSimdScalar tol = _mm_load_ss( &tolerance );
- tol = _mm_mul_ss( tol, tol );
- LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
- lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
- lenSquared.setAbs(lenSquared);
- return _mm_comile_ss( lenSquared, tol );
+ static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
+ LLSimdScalar tol = _mm_load_ss( &tolerance );
+ tol = _mm_mul_ss( tol, tol );
+ LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
+ lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
+ lenSquared.setAbs(lenSquared);
+ return _mm_comile_ss( lenSquared, tol );
}
// Return true if this vector is normalized with respect to all components up to tolerance
inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
{
- static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
- LLSimdScalar tol = _mm_load_ss( &tolerance );
- tol = _mm_mul_ss( tol, tol );
- LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
- lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
- lenSquared.setAbs(lenSquared);
- return _mm_comile_ss( lenSquared, tol );
+ static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
+ LLSimdScalar tol = _mm_load_ss( &tolerance );
+ tol = _mm_mul_ss( tol, tol );
+ LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
+ lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
+ lenSquared.setAbs(lenSquared);
+ return _mm_comile_ss( lenSquared, tol );
}
-// Set all elements to the length of vector 'v'
+// Set all elements to the length of vector 'v'
inline void LLVector4a::setAllLength3( const LLVector4a& v )
{
- LLVector4a lenSqrd;
- lenSqrd.setAllDot3(v, v);
-
- mQ = _mm_sqrt_ps(lenSqrd.mQ);
+ LLVector4a lenSqrd;
+ lenSqrd.setAllDot3(v, v);
+
+ mQ = _mm_sqrt_ps(lenSqrd.mQ);
}
// Get this vector's length
inline LLSimdScalar LLVector4a::getLength3() const
{
- return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) );
+ return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) );
}
// Set the components of this vector to the minimum of the corresponding components of lhs and rhs
inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs)
{
- mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
+ mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
}
// Set the components of this vector to the maximum of the corresponding components of lhs and rhs
inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
{
- mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
+ mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
}
// Set this to lhs + (rhs-lhs)*c
inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
{
- LLVector4a t;
- t.setSub(rhs,lhs);
- t.mul(c);
- setAdd(lhs, t);
+ LLVector4a t;
+ t.setSub(rhs,lhs);
+ t.mul(c);
+ setAdd(lhs, t);
}
inline LLBool32 LLVector4a::isFinite3() const
{
- static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
- ll_assert_aligned(nanOrInfMask,16);
- const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
- const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
- const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
- return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ );
+ static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+ ll_assert_aligned(nanOrInfMask,16);
+ const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
+ const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
+ const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
+ return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ );
}
-
+
inline LLBool32 LLVector4a::isFinite4() const
{
- static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
- const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
- const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
- const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
- return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW );
+ static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+ const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
+ const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
+ const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
+ return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW );
}
inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec )
{
- LLRotation inv; inv.setTranspose( rot );
- setRotated( inv, vec );
+ LLRotation inv; inv.setTranspose( rot );
+ setRotated( inv, vec );
}
inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec )
{
- LLQuaternion2 invRot; invRot.setConjugate( quat );
- setRotated(invRot, vec);
+ LLQuaternion2 invRot; invRot.setConjugate( quat );
+ setRotated(invRot, vec);
}
inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
{
- const LLVector4Logical highMask = greaterThan( high );
- const LLVector4Logical lowMask = lessThan( low );
+ const LLVector4Logical highMask = greaterThan( high );
+ const LLVector4Logical lowMask = lessThan( low );
- setSelectWithMask( highMask, high, *this );
- setSelectWithMask( lowMask, low, *this );
+ setSelectWithMask( highMask, high, *this );
+ setSelectWithMask( lowMask, low, *this );
}
////////////////////////////////////
// LOGICAL
-////////////////////////////////////
+////////////////////////////////////
// The functions in this section will compare the elements in this vector
// to those in rhs and return an LLVector4Logical with all bits set in elements
// where the comparison was true and all bits unset in elements where the comparison
@@ -544,68 +544,68 @@ inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
////////////////////////////////////
inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const
-{
- return _mm_cmpgt_ps(mQ, rhs.mQ);
+{
+ return _mm_cmpgt_ps(mQ, rhs.mQ);
}
inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const
{
- return _mm_cmplt_ps(mQ, rhs.mQ);
+ return _mm_cmplt_ps(mQ, rhs.mQ);
}
inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const
{
- return _mm_cmpge_ps(mQ, rhs.mQ);
+ return _mm_cmpge_ps(mQ, rhs.mQ);
}
inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const
{
- return _mm_cmple_ps(mQ, rhs.mQ);
+ return _mm_cmple_ps(mQ, rhs.mQ);
}
inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const
{
- return _mm_cmpeq_ps(mQ, rhs.mQ);
+ return _mm_cmpeq_ps(mQ, rhs.mQ);
}
// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const
{
- LLVector4a diff; diff.setSub( *this, rhs );
- diff.setAbs( diff );
- const LLQuad tol = _mm_set1_ps( tolerance );
- const LLQuad cmp = _mm_cmplt_ps( diff, tol );
- return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
+ LLVector4a diff; diff.setSub( *this, rhs );
+ diff.setAbs( diff );
+ const LLQuad tol = _mm_set1_ps( tolerance );
+ const LLQuad cmp = _mm_cmplt_ps( diff, tol );
+ return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
}
inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
{
- LLVector4a diff; diff.setSub( *this, rhs );
- diff.setAbs( diff );
- const LLQuad tol = _mm_set1_ps( tolerance );
- const LLQuad t = _mm_cmplt_ps( diff, tol );
- return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
-
+ LLVector4a diff; diff.setSub( *this, rhs );
+ diff.setAbs( diff );
+ const LLQuad tol = _mm_set1_ps( tolerance );
+ const LLQuad t = _mm_cmplt_ps( diff, tol );
+ return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
+
}
////////////////////////////////////
// OPERATORS
-////////////////////////////////////
+////////////////////////////////////
// Do NOT add additional operators without consulting someone with SSE experience
inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
{
- mQ = rhs.mQ;
- return *this;
+ mQ = rhs.mQ;
+ return *this;
}
inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
{
- mQ = rhs;
- return *this;
+ mQ = rhs;
+ return *this;
}
inline LLVector4a::operator LLQuad() const
{
- return mQ;
+ return mQ;
}