diff options
author | Ansariel <ansariel.hiller@phoenixviewer.com> | 2024-05-22 19:04:52 +0200 |
---|---|---|
committer | Ansariel <ansariel.hiller@phoenixviewer.com> | 2024-05-22 19:04:52 +0200 |
commit | 1b67dd855c41f5a0cda7ec2a68d98071986ca703 (patch) | |
tree | ab243607f74f78200787bba5b9b88f07ef1b966f /indra/llmath/llvector4a.inl | |
parent | 6d6eabca44d08d5b97bfe3e941d2b9687c2246ea (diff) | |
parent | e1623bb276f83a43ce7a197e388720c05bdefe61 (diff) |
Merge remote-tracking branch 'origin/main' into DRTVWR-600-maint-A
# Conflicts:
# autobuild.xml
# indra/cmake/CMakeLists.txt
# indra/cmake/GoogleMock.cmake
# indra/llaudio/llaudioengine_fmodstudio.cpp
# indra/llaudio/llaudioengine_fmodstudio.h
# indra/llaudio/lllistener_fmodstudio.cpp
# indra/llaudio/lllistener_fmodstudio.h
# indra/llaudio/llstreamingaudio_fmodstudio.cpp
# indra/llaudio/llstreamingaudio_fmodstudio.h
# indra/llcharacter/llmultigesture.cpp
# indra/llcharacter/llmultigesture.h
# indra/llimage/llimage.cpp
# indra/llimage/llimagepng.cpp
# indra/llimage/llimageworker.cpp
# indra/llimage/tests/llimageworker_test.cpp
# indra/llmessage/tests/llmockhttpclient.h
# indra/llprimitive/llgltfmaterial.h
# indra/llrender/llfontfreetype.cpp
# indra/llui/llcombobox.cpp
# indra/llui/llfolderview.cpp
# indra/llui/llfolderviewmodel.h
# indra/llui/lllineeditor.cpp
# indra/llui/lllineeditor.h
# indra/llui/lltextbase.cpp
# indra/llui/lltextbase.h
# indra/llui/lltexteditor.cpp
# indra/llui/lltextvalidate.cpp
# indra/llui/lltextvalidate.h
# indra/llui/lluictrl.h
# indra/llui/llview.cpp
# indra/llwindow/llwindowmacosx.cpp
# indra/newview/app_settings/settings.xml
# indra/newview/llappearancemgr.cpp
# indra/newview/llappearancemgr.h
# indra/newview/llavatarpropertiesprocessor.cpp
# indra/newview/llavatarpropertiesprocessor.h
# indra/newview/llbreadcrumbview.cpp
# indra/newview/llbreadcrumbview.h
# indra/newview/llbreastmotion.cpp
# indra/newview/llbreastmotion.h
# indra/newview/llconversationmodel.h
# indra/newview/lldensityctrl.cpp
# indra/newview/lldensityctrl.h
# indra/newview/llface.inl
# indra/newview/llfloatereditsky.cpp
# indra/newview/llfloatereditwater.cpp
# indra/newview/llfloateremojipicker.h
# indra/newview/llfloaterimsessiontab.cpp
# indra/newview/llfloaterprofiletexture.cpp
# indra/newview/llfloaterprofiletexture.h
# indra/newview/llgesturemgr.cpp
# indra/newview/llgesturemgr.h
# indra/newview/llimpanel.cpp
# indra/newview/llimpanel.h
# indra/newview/llinventorybridge.cpp
# indra/newview/llinventorybridge.h
# indra/newview/llinventoryclipboard.cpp
# indra/newview/llinventoryclipboard.h
# indra/newview/llinventoryfunctions.cpp
# indra/newview/llinventoryfunctions.h
# indra/newview/llinventorygallery.cpp
# indra/newview/lllistbrowser.cpp
# indra/newview/lllistbrowser.h
# indra/newview/llpanelobjectinventory.cpp
# indra/newview/llpanelprofile.cpp
# indra/newview/llpanelprofile.h
# indra/newview/llpreviewgesture.cpp
# indra/newview/llsavedsettingsglue.cpp
# indra/newview/llsavedsettingsglue.h
# indra/newview/lltooldraganddrop.cpp
# indra/newview/llurllineeditorctrl.cpp
# indra/newview/llvectorperfoptions.cpp
# indra/newview/llvectorperfoptions.h
# indra/newview/llviewerparceloverlay.cpp
# indra/newview/llviewertexlayer.cpp
# indra/newview/llviewertexturelist.cpp
# indra/newview/macmain.h
# indra/test/test.cpp
Diffstat (limited to 'indra/llmath/llvector4a.inl')
-rw-r--r-- | indra/llmath/llvector4a.inl | 532 |
1 files changed, 266 insertions, 266 deletions
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl index 8be1c1b114..36dbec078c 100644 --- a/indra/llmath/llvector4a.inl +++ b/indra/llmath/llvector4a.inl @@ -1,25 +1,25 @@ -/** +/** * @file llvector4a.inl * @brief LLVector4a inline function implementations * * $LicenseInfo:firstyear=2010&license=viewerlgpl$ * Second Life Viewer Source Code * Copyright (C) 2010, Linden Research, Inc. - * + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License only. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * + * * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA * $/LicenseInfo$ */ @@ -31,138 +31,138 @@ // Load from 16-byte aligned src array (preferred method of loading) inline void LLVector4a::load4a(const F32* src) { - mQ = _mm_load_ps(src); + mQ = _mm_load_ps(src); } // Load from unaligned src array (NB: Significantly slower than load4a) inline void LLVector4a::loadua(const F32* src) { - mQ = _mm_loadu_ps(src); + mQ = _mm_loadu_ps(src); } // Load only three floats beginning at address 'src'. Slowest method. inline void LLVector4a::load3(const F32* src) { - // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X } - // NB: This differs from the convention of { Z, Y, X, W } - mQ = _mm_set_ps(0.f, src[2], src[1], src[0]); -} + // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X } + // NB: This differs from the convention of { Z, Y, X, W } + mQ = _mm_set_ps(0.f, src[2], src[1], src[0]); +} // Store to a 16-byte aligned memory address inline void LLVector4a::store4a(F32* dst) const { - _mm_store_ps(dst, mQ); + _mm_store_ps(dst, mQ); } //////////////////////////////////// -// BASIC GET/SET +// BASIC GET/SET //////////////////////////////////// // Return a "this" as an F32 pointer. F32* LLVector4a::getF32ptr() { - return (F32*) &mQ; + return (F32*) &mQ; } // Return a "this" as a const F32 pointer. const F32* const LLVector4a::getF32ptr() const { - return (const F32* const) &mQ; + return (const F32* const) &mQ; } // Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates // the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead inline F32 LLVector4a::operator[](const S32 idx) const { - return ((F32*)&mQ)[idx]; -} + return ((F32*)&mQ)[idx]; +} // Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const { - // Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop) - switch (idx) - { - case 0: - return mQ; - case 1: - return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1)); - case 2: - return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2)); - case 3: - default: - return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3)); - } + // Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop) + switch (idx) + { + case 0: + return mQ; + case 1: + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1)); + case 2: + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2)); + case 3: + default: + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3)); + } } // Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const { - return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N)); + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N)); } template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const { - return mQ; + return mQ; } // Set to an x, y, z and optional w provided inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w) { - mQ = _mm_set_ps(w, z, y, x); + mQ = _mm_set_ps(w, z, y, x); } // Set to all zeros inline void LLVector4a::clear() { - mQ = LLVector4a::getZero().mQ; + mQ = LLVector4a::getZero().mQ; } inline void LLVector4a::splat(const F32 x) { - mQ = _mm_set1_ps(x); + mQ = _mm_set1_ps(x); } inline void LLVector4a::splat(const LLSimdScalar& x) { - mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) ); + mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) ); } // Set all 4 elements to element N of src, with N known at compile time template <int N> void LLVector4a::splat(const LLVector4a& src) { - mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) ); + mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) ); } // Set all 4 elements to element i of v, with i NOT known at compile time inline void LLVector4a::splat(const LLVector4a& v, U32 i) { - switch (i) - { - case 0: - mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0)); - break; - case 1: - mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1)); - break; - case 2: - mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2)); - break; - case 3: - mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3)); - break; - } + switch (i) + { + case 0: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0)); + break; + case 1: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1)); + break; + case 2: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2)); + break; + case 3: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3)); + break; + } } // Select bits from sourceIfTrue and sourceIfFalse according to bits in mask inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse ) { - // ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse ) - // E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b - // (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b, - // as expected (01 from sourceIfTrue, 10 from sourceIfFalse) - // Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/ - mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) ); + // ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse ) + // E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b + // (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b, + // as expected (01 from sourceIfTrue, 10 from sourceIfFalse) + // Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/ + mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) ); } //////////////////////////////////// @@ -172,84 +172,84 @@ inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const L // Set this to the element-wise (a + b) inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b) { - mQ = _mm_add_ps(a.mQ, b.mQ); + mQ = _mm_add_ps(a.mQ, b.mQ); } // Set this to element-wise (a - b) inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b) { - mQ = _mm_sub_ps(a.mQ, b.mQ); + mQ = _mm_sub_ps(a.mQ, b.mQ); } // Set this to element-wise multiply (a * b) inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b) { - mQ = _mm_mul_ps(a.mQ, b.mQ); + mQ = _mm_mul_ps(a.mQ, b.mQ); } // Set this to element-wise quotient (a / b) inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b) { - mQ = _mm_div_ps( a.mQ, b.mQ ); + mQ = _mm_div_ps( a.mQ, b.mQ ); } // Set this to the element-wise absolute value of src inline void LLVector4a::setAbs(const LLVector4a& src) { - static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; - mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A)); + static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; + mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A)); } // Add to each component in this vector the corresponding component in rhs inline void LLVector4a::add(const LLVector4a& rhs) { - mQ = _mm_add_ps(mQ, rhs.mQ); + mQ = _mm_add_ps(mQ, rhs.mQ); } // Subtract from each component in this vector the corresponding component in rhs inline void LLVector4a::sub(const LLVector4a& rhs) { - mQ = _mm_sub_ps(mQ, rhs.mQ); + mQ = _mm_sub_ps(mQ, rhs.mQ); } // Multiply each component in this vector by the corresponding component in rhs inline void LLVector4a::mul(const LLVector4a& rhs) { - mQ = _mm_mul_ps(mQ, rhs.mQ); + mQ = _mm_mul_ps(mQ, rhs.mQ); } // Divide each component in this vector by the corresponding component in rhs inline void LLVector4a::div(const LLVector4a& rhs) { - // TODO: Check accuracy, maybe add divFast - mQ = _mm_div_ps(mQ, rhs.mQ); + // TODO: Check accuracy, maybe add divFast + mQ = _mm_div_ps(mQ, rhs.mQ); } // Multiply this vector by x in a scalar fashion -inline void LLVector4a::mul(const F32 x) +inline void LLVector4a::mul(const F32 x) { - LLVector4a t; - t.splat(x); - - mQ = _mm_mul_ps(mQ, t.mQ); + LLVector4a t; + t.splat(x); + + mQ = _mm_mul_ps(mQ, t.mQ); } // Set this to (a x b) (geometric cross-product) inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b) { - // Vectors are stored in memory in w, z, y, x order from high to low - // Set vector1 = { a[W], a[X], a[Z], a[Y] } - const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 )); - // Set vector2 = { b[W], b[Y], b[X], b[Z] } - const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 )); - // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] } - mQ = _mm_mul_ps( vector1, vector2 ); - // vector3 = { a[W], a[Y], a[X], a[Z] } - const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 )); - // vector4 = { b[W], b[X], b[Z], b[Y] } - const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 )); - // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] } - mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 )); + // Vectors are stored in memory in w, z, y, x order from high to low + // Set vector1 = { a[W], a[X], a[Z], a[Y] } + const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 )); + // Set vector2 = { b[W], b[Y], b[X], b[Z] } + const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 )); + // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] } + mQ = _mm_mul_ps( vector1, vector2 ); + // vector3 = { a[W], a[Y], a[X], a[Z] } + const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 )); + // vector4 = { b[W], b[X], b[Z], b[Y] } + const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 )); + // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] } + mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 )); } /* This function works, but may be slightly slower than the one below on older machines @@ -261,7 +261,7 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b) const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 )); // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } const LLQuad xPlusY = _mm_add_ps(ab, wzxy); - // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY); // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] } const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 )); @@ -272,267 +272,267 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b) // Set all elements to the dot product of the x, y, and z elements in a and b inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b) { - // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } - const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); - // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } - const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 )); - // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } - const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy)); - // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } - const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY); - // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] } - const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 )); - // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } - mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat); + // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } + const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); + // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } + const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 )); + // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy)); + // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY); + // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] } + const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 )); + // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } + mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat); } // Set all elements to the dot product of the x, y, z, and w elements in a and b inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b) { - // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } - const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); - // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } - const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 )); - // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } - const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy)); - // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } - const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY); - const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY); + // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } + const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); + // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } + const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 )); + // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy)); + // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY); + const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY); - // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } - mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat); + // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } + mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat); } // Return the 3D dot product of this vector and b inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const { - const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); - const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) ); - const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) ); - const LLQuad xPlusY = _mm_add_ps( ab, splatY ); - return _mm_add_ps( xPlusY, splatZ ); + const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); + const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) ); + const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) ); + const LLQuad xPlusY = _mm_add_ps( ab, splatY ); + return _mm_add_ps( xPlusY, splatZ ); } // Return the 4D dot product of this vector and b inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const { - // ab = { w, z, y, x } - const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); - // upperProdsInLowerElems = { y, x, y, x } - const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab ); - // sumOfPairs = { w+y, z+x, 2y, 2x } - const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab ); - // shuffled = { z+x, z+x, z+x, z+x } - const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) ); - return _mm_add_ss( sumOfPairs, shuffled ); + // ab = { w, z, y, x } + const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); + // upperProdsInLowerElems = { y, x, y, x } + const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab ); + // sumOfPairs = { w+y, z+x, 2y, 2x } + const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab ); + // shuffled = { z+x, z+x, z+x, z+x } + const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) ); + return _mm_add_ss( sumOfPairs, shuffled ); } // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed // Note that this does not consider zero length vectors! inline void LLVector4a::normalize3() { - // lenSqrd = a dot a - LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); - // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } - const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); - static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; - static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; - // Now we do one round of Newton-Raphson approximation to get full accuracy - // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) - // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) - // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 - // = 0.5 * w * (3 - a*w^2) - // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula - // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] - const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); - const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); - const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); - const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); - mQ = _mm_mul_ps( mQ, nrApprox ); + // lenSqrd = a dot a + LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); + // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } + const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; + static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; + // Now we do one round of Newton-Raphson approximation to get full accuracy + // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) + // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) + // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 + // = 0.5 * w * (3 - a*w^2) + // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula + // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] + const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); + const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); + const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); + const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); + mQ = _mm_mul_ps( mQ, nrApprox ); } // Normalize this vector with respect to all components. Accurate to 22 bites of precision. // Note that this does not consider zero length vectors! inline void LLVector4a::normalize4() { - // lenSqrd = a dot a - LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this ); - // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } - const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); - static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; - static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; - // Now we do one round of Newton-Raphson approximation to get full accuracy - // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) - // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) - // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 - // = 0.5 * w * (3 - a*w^2) - // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula - // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] - const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); - const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); - const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); - const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); - mQ = _mm_mul_ps( mQ, nrApprox ); + // lenSqrd = a dot a + LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this ); + // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } + const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; + static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; + // Now we do one round of Newton-Raphson approximation to get full accuracy + // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) + // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) + // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 + // = 0.5 * w * (3 - a*w^2) + // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula + // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] + const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); + const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); + const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); + const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); + mQ = _mm_mul_ps( mQ, nrApprox ); } // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed // Note that this does not consider zero length vectors! inline LLSimdScalar LLVector4a::normalize3withLength() { - // lenSqrd = a dot a - LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); - // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } - const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); - static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; - static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; - // Now we do one round of Newton-Raphson approximation to get full accuracy - // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) - // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) - // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 - // = 0.5 * w * (3 - a*w^2) - // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula - // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] - const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); - const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); - const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); - const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); - mQ = _mm_mul_ps( mQ, nrApprox ); - return _mm_sqrt_ss(lenSqrd); + // lenSqrd = a dot a + LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); + // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } + const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; + static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; + // Now we do one round of Newton-Raphson approximation to get full accuracy + // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) + // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) + // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 + // = 0.5 * w * (3 - a*w^2) + // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula + // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] + const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); + const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); + const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); + const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); + mQ = _mm_mul_ps( mQ, nrApprox ); + return _mm_sqrt_ss(lenSqrd); } // Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed // Note that this does not consider zero length vectors! inline void LLVector4a::normalize3fast() { - LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); - const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ); - mQ = _mm_mul_ps( mQ, approxRsqrt ); + LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); + const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + mQ = _mm_mul_ps( mQ, approxRsqrt ); } inline void LLVector4a::normalize3fast_checked(LLVector4a* d) { - if (!isFinite3()) - { - *this = d ? *d : LLVector4a(0,1,0,1); - return; - } + if (!isFinite3()) + { + *this = d ? *d : LLVector4a(0,1,0,1); + return; + } - LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); + LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); - if (lenSqrd.getF32ptr()[0] <= FLT_EPSILON) - { - *this = d ? *d : LLVector4a(0,1,0,1); - return; - } + if (lenSqrd.getF32ptr()[0] <= FLT_EPSILON) + { + *this = d ? *d : LLVector4a(0,1,0,1); + return; + } - const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ); - mQ = _mm_mul_ps( mQ, approxRsqrt ); + const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + mQ = _mm_mul_ps( mQ, approxRsqrt ); } // Return true if this vector is normalized with respect to x,y,z up to tolerance inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const { - static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 }; - LLSimdScalar tol = _mm_load_ss( &tolerance ); - tol = _mm_mul_ss( tol, tol ); - LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this ); - lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); - lenSquared.setAbs(lenSquared); - return _mm_comile_ss( lenSquared, tol ); + static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 }; + LLSimdScalar tol = _mm_load_ss( &tolerance ); + tol = _mm_mul_ss( tol, tol ); + LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this ); + lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); + lenSquared.setAbs(lenSquared); + return _mm_comile_ss( lenSquared, tol ); } // Return true if this vector is normalized with respect to all components up to tolerance inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const { - static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 }; - LLSimdScalar tol = _mm_load_ss( &tolerance ); - tol = _mm_mul_ss( tol, tol ); - LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this ); - lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); - lenSquared.setAbs(lenSquared); - return _mm_comile_ss( lenSquared, tol ); + static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 }; + LLSimdScalar tol = _mm_load_ss( &tolerance ); + tol = _mm_mul_ss( tol, tol ); + LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this ); + lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); + lenSquared.setAbs(lenSquared); + return _mm_comile_ss( lenSquared, tol ); } -// Set all elements to the length of vector 'v' +// Set all elements to the length of vector 'v' inline void LLVector4a::setAllLength3( const LLVector4a& v ) { - LLVector4a lenSqrd; - lenSqrd.setAllDot3(v, v); - - mQ = _mm_sqrt_ps(lenSqrd.mQ); + LLVector4a lenSqrd; + lenSqrd.setAllDot3(v, v); + + mQ = _mm_sqrt_ps(lenSqrd.mQ); } // Get this vector's length inline LLSimdScalar LLVector4a::getLength3() const { - return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) ); + return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) ); } // Set the components of this vector to the minimum of the corresponding components of lhs and rhs inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs) { - mQ = _mm_min_ps(lhs.mQ, rhs.mQ); + mQ = _mm_min_ps(lhs.mQ, rhs.mQ); } // Set the components of this vector to the maximum of the corresponding components of lhs and rhs inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs) { - mQ = _mm_max_ps(lhs.mQ, rhs.mQ); + mQ = _mm_max_ps(lhs.mQ, rhs.mQ); } // Set this to lhs + (rhs-lhs)*c inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c) { - LLVector4a t; - t.setSub(rhs,lhs); - t.mul(c); - setAdd(lhs, t); + LLVector4a t; + t.setSub(rhs,lhs); + t.mul(c); + setAdd(lhs, t); } inline LLBool32 LLVector4a::isFinite3() const { - static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - ll_assert_aligned(nanOrInfMask,16); - const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask); - const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV ); - const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV )); - return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ ); + static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + ll_assert_aligned(nanOrInfMask,16); + const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask); + const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV ); + const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV )); + return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ ); } - + inline LLBool32 LLVector4a::isFinite4() const { - static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; - const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask); - const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV ); - const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV )); - return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW ); + static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask); + const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV ); + const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV )); + return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW ); } inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec ) { - LLRotation inv; inv.setTranspose( rot ); - setRotated( inv, vec ); + LLRotation inv; inv.setTranspose( rot ); + setRotated( inv, vec ); } inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec ) { - LLQuaternion2 invRot; invRot.setConjugate( quat ); - setRotated(invRot, vec); + LLQuaternion2 invRot; invRot.setConjugate( quat ); + setRotated(invRot, vec); } inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high ) { - const LLVector4Logical highMask = greaterThan( high ); - const LLVector4Logical lowMask = lessThan( low ); + const LLVector4Logical highMask = greaterThan( high ); + const LLVector4Logical lowMask = lessThan( low ); - setSelectWithMask( highMask, high, *this ); - setSelectWithMask( lowMask, low, *this ); + setSelectWithMask( highMask, high, *this ); + setSelectWithMask( lowMask, low, *this ); } //////////////////////////////////// // LOGICAL -//////////////////////////////////// +//////////////////////////////////// // The functions in this section will compare the elements in this vector // to those in rhs and return an LLVector4Logical with all bits set in elements // where the comparison was true and all bits unset in elements where the comparison @@ -544,68 +544,68 @@ inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high ) //////////////////////////////////// inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const -{ - return _mm_cmpgt_ps(mQ, rhs.mQ); +{ + return _mm_cmpgt_ps(mQ, rhs.mQ); } inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const { - return _mm_cmplt_ps(mQ, rhs.mQ); + return _mm_cmplt_ps(mQ, rhs.mQ); } inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const { - return _mm_cmpge_ps(mQ, rhs.mQ); + return _mm_cmpge_ps(mQ, rhs.mQ); } inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const { - return _mm_cmple_ps(mQ, rhs.mQ); + return _mm_cmple_ps(mQ, rhs.mQ); } inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const { - return _mm_cmpeq_ps(mQ, rhs.mQ); + return _mm_cmpeq_ps(mQ, rhs.mQ); } // Returns true if this and rhs are componentwise equal up to the specified absolute tolerance inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const { - LLVector4a diff; diff.setSub( *this, rhs ); - diff.setAbs( diff ); - const LLQuad tol = _mm_set1_ps( tolerance ); - const LLQuad cmp = _mm_cmplt_ps( diff, tol ); - return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW; + LLVector4a diff; diff.setSub( *this, rhs ); + diff.setAbs( diff ); + const LLQuad tol = _mm_set1_ps( tolerance ); + const LLQuad cmp = _mm_cmplt_ps( diff, tol ); + return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW; } inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const { - LLVector4a diff; diff.setSub( *this, rhs ); - diff.setAbs( diff ); - const LLQuad tol = _mm_set1_ps( tolerance ); - const LLQuad t = _mm_cmplt_ps( diff, tol ); - return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ; - + LLVector4a diff; diff.setSub( *this, rhs ); + diff.setAbs( diff ); + const LLQuad tol = _mm_set1_ps( tolerance ); + const LLQuad t = _mm_cmplt_ps( diff, tol ); + return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ; + } //////////////////////////////////// // OPERATORS -//////////////////////////////////// +//////////////////////////////////// // Do NOT add aditional operators without consulting someone with SSE experience inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs ) { - mQ = rhs.mQ; - return *this; + mQ = rhs.mQ; + return *this; } inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs ) { - mQ = rhs; - return *this; + mQ = rhs; + return *this; } inline LLVector4a::operator LLQuad() const { - return mQ; + return mQ; } |