diff options
author | Leyla Farazha <leyla@lindenlab.com> | 2011-06-07 17:16:49 -0700 |
---|---|---|
committer | Leyla Farazha <leyla@lindenlab.com> | 2011-06-07 17:16:49 -0700 |
commit | 1361eeae4e4538c175a32d48246897c4659cc26c (patch) | |
tree | 19687f65cc1c51b3247d0d1cd665b19c1a41a1e4 /indra/llmath/llvector4a.inl | |
parent | bff172b374fa12e646d427e09f2ffac4289ab654 (diff) | |
parent | cf31723a1a258e0452dd26d9f5eb5fcdac290e05 (diff) |
merge
Diffstat (limited to 'indra/llmath/llvector4a.inl')
-rw-r--r-- | indra/llmath/llvector4a.inl | 593 |
1 files changed, 593 insertions, 0 deletions
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl new file mode 100644 index 0000000000..7ad22a5631 --- /dev/null +++ b/indra/llmath/llvector4a.inl @@ -0,0 +1,593 @@ +/** + * @file llvector4a.inl + * @brief LLVector4a inline function implementations + * + * $LicenseInfo:firstyear=2010&license=viewerlgpl$ + * Second Life Viewer Source Code + * Copyright (C) 2010, Linden Research, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License only. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA + * $/LicenseInfo$ + */ + +//////////////////////////////////// +// LOAD/STORE +//////////////////////////////////// + +// Load from 16-byte aligned src array (preferred method of loading) +inline void LLVector4a::load4a(const F32* src) +{ + mQ = _mm_load_ps(src); +} + +// Load from unaligned src array (NB: Significantly slower than load4a) +inline void LLVector4a::loadua(const F32* src) +{ + mQ = _mm_loadu_ps(src); +} + +// Load only three floats beginning at address 'src'. Slowest method. +inline void LLVector4a::load3(const F32* src) +{ + // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X } + // NB: This differs from the convention of { Z, Y, X, W } + mQ = _mm_set_ps(0.f, src[2], src[1], src[0]); +} + +// Store to a 16-byte aligned memory address +inline void LLVector4a::store4a(F32* dst) const +{ + _mm_store_ps(dst, mQ); +} + +//////////////////////////////////// +// BASIC GET/SET +//////////////////////////////////// + +// Return a "this" as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon) +F32* LLVector4a::getF32ptr() +{ + return (F32*) &mQ; +} + +// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon) +const F32* const LLVector4a::getF32ptr() const +{ + return (const F32* const) &mQ; +} + +// Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates +// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead +inline F32 LLVector4a::operator[](const S32 idx) const +{ + return ((F32*)&mQ)[idx]; +} + +// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. +inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const +{ + // Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop) + switch (idx) + { + case 0: + return mQ; + case 1: + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1)); + case 2: + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2)); + case 3: + default: + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3)); + } +} + +// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time. +template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const +{ + return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N)); +} + +template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const +{ + return mQ; +} + +// Set to an x, y, z and optional w provided +inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w) +{ + mQ = _mm_set_ps(w, z, y, x); +} + +// Set to all zeros +inline void LLVector4a::clear() +{ + mQ = LLVector4a::getZero().mQ; +} + +inline void LLVector4a::splat(const F32 x) +{ + mQ = _mm_set1_ps(x); +} + +inline void LLVector4a::splat(const LLSimdScalar& x) +{ + mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) ); +} + +// Set all 4 elements to element N of src, with N known at compile time +template <int N> void LLVector4a::splat(const LLVector4a& src) +{ + mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) ); +} + +// Set all 4 elements to element i of v, with i NOT known at compile time +inline void LLVector4a::splat(const LLVector4a& v, U32 i) +{ + switch (i) + { + case 0: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0)); + break; + case 1: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1)); + break; + case 2: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2)); + break; + case 3: + mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3)); + break; + } +} + +// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask +inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse ) +{ + // ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse ) + // E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b + // (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b, + // as expected (01 from sourceIfTrue, 10 from sourceIfFalse) + // Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/ + mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) ); +} + +//////////////////////////////////// +// ALGEBRAIC +//////////////////////////////////// + +// Set this to the element-wise (a + b) +inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b) +{ + mQ = _mm_add_ps(a.mQ, b.mQ); +} + +// Set this to element-wise (a - b) +inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b) +{ + mQ = _mm_sub_ps(a.mQ, b.mQ); +} + +// Set this to element-wise multiply (a * b) +inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b) +{ + mQ = _mm_mul_ps(a.mQ, b.mQ); +} + +// Set this to element-wise quotient (a / b) +inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b) +{ + mQ = _mm_div_ps( a.mQ, b.mQ ); +} + +// Set this to the element-wise absolute value of src +inline void LLVector4a::setAbs(const LLVector4a& src) +{ + static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }; + mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A)); +} + +// Add to each component in this vector the corresponding component in rhs +inline void LLVector4a::add(const LLVector4a& rhs) +{ + mQ = _mm_add_ps(mQ, rhs.mQ); +} + +// Subtract from each component in this vector the corresponding component in rhs +inline void LLVector4a::sub(const LLVector4a& rhs) +{ + mQ = _mm_sub_ps(mQ, rhs.mQ); +} + +// Multiply each component in this vector by the corresponding component in rhs +inline void LLVector4a::mul(const LLVector4a& rhs) +{ + mQ = _mm_mul_ps(mQ, rhs.mQ); +} + +// Divide each component in this vector by the corresponding component in rhs +inline void LLVector4a::div(const LLVector4a& rhs) +{ + // TODO: Check accuracy, maybe add divFast + mQ = _mm_div_ps(mQ, rhs.mQ); +} + +// Multiply this vector by x in a scalar fashion +inline void LLVector4a::mul(const F32 x) +{ + LLVector4a t; + t.splat(x); + + mQ = _mm_mul_ps(mQ, t.mQ); +} + +// Set this to (a x b) (geometric cross-product) +inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b) +{ + // Vectors are stored in memory in w, z, y, x order from high to low + // Set vector1 = { a[W], a[X], a[Z], a[Y] } + const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 )); + // Set vector2 = { b[W], b[Y], b[X], b[Z] } + const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 )); + // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] } + mQ = _mm_mul_ps( vector1, vector2 ); + // vector3 = { a[W], a[Y], a[X], a[Z] } + const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 )); + // vector4 = { b[W], b[X], b[Z], b[Y] } + const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 )); + // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] } + mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 )); +} + +/* This function works, but may be slightly slower than the one below on older machines + inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b) + { + // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } + const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); + // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } + const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 )); + // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusY = _mm_add_ps(ab, wzxy); + // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY); + // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] } + const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 )); + // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } + mQ = _mm_add_ps(zSplat, xPlusYSplat); + }*/ + +// Set all elements to the dot product of the x, y, and z elements in a and b +inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b) +{ + // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } + const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); + // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } + const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 )); + // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy)); + // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY); + // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] } + const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 )); + // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } + mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat); +} + +// Set all elements to the dot product of the x, y, z, and w elements in a and b +inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b) +{ + // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] } + const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ ); + // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] } + const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 )); + // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy)); + // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } + const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY); + const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY); + + // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same } + mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat); +} + +// Return the 3D dot product of this vector and b +inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const +{ + const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); + const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) ); + const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) ); + const LLQuad xPlusY = _mm_add_ps( ab, splatY ); + return _mm_add_ps( xPlusY, splatZ ); +} + +// Return the 4D dot product of this vector and b +inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const +{ + // ab = { w, z, y, x } + const LLQuad ab = _mm_mul_ps( mQ, b.mQ ); + // upperProdsInLowerElems = { y, x, y, x } + const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab ); + // sumOfPairs = { w+y, z+x, 2y, 2x } + const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab ); + // shuffled = { z+x, z+x, z+x, z+x } + const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) ); + return _mm_add_ss( sumOfPairs, shuffled ); +} + +// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed +// Note that this does not consider zero length vectors! +inline void LLVector4a::normalize3() +{ + // lenSqrd = a dot a + LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); + // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } + const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; + static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; + // Now we do one round of Newton-Raphson approximation to get full accuracy + // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) + // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) + // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 + // = 0.5 * w * (3 - a*w^2) + // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula + // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] + const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); + const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); + const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); + const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); + mQ = _mm_mul_ps( mQ, nrApprox ); +} + +// Normalize this vector with respect to all components. Accurate to 22 bites of precision. +// Note that this does not consider zero length vectors! +inline void LLVector4a::normalize4() +{ + // lenSqrd = a dot a + LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this ); + // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } + const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; + static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; + // Now we do one round of Newton-Raphson approximation to get full accuracy + // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) + // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) + // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 + // = 0.5 * w * (3 - a*w^2) + // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula + // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] + const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); + const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); + const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); + const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); + mQ = _mm_mul_ps( mQ, nrApprox ); +} + +// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed +// Note that this does not consider zero length vectors! +inline LLSimdScalar LLVector4a::normalize3withLength() +{ + // lenSqrd = a dot a + LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); + // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 } + const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f }; + static const LLQuad three = {3.f, 3.f, 3.f, 3.f }; + // Now we do one round of Newton-Raphson approximation to get full accuracy + // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a)) + // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3)) + // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3 + // = 0.5 * w * (3 - a*w^2) + // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula + // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)] + const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt ); + const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt ); + const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt ); + const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt)); + mQ = _mm_mul_ps( mQ, nrApprox ); + return _mm_sqrt_ss(lenSqrd); +} + +// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed +// Note that this does not consider zero length vectors! +inline void LLVector4a::normalize3fast() +{ + LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this ); + const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ); + mQ = _mm_mul_ps( mQ, approxRsqrt ); +} + +// Return true if this vector is normalized with respect to x,y,z up to tolerance +inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const +{ + static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 }; + LLSimdScalar tol = _mm_load_ss( &tolerance ); + tol = _mm_mul_ss( tol, tol ); + LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this ); + lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); + lenSquared.setAbs(lenSquared); + return _mm_comile_ss( lenSquared, tol ); +} + +// Return true if this vector is normalized with respect to all components up to tolerance +inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const +{ + static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 }; + LLSimdScalar tol = _mm_load_ss( &tolerance ); + tol = _mm_mul_ss( tol, tol ); + LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this ); + lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) ); + lenSquared.setAbs(lenSquared); + return _mm_comile_ss( lenSquared, tol ); +} + +// Set all elements to the length of vector 'v' +inline void LLVector4a::setAllLength3( const LLVector4a& v ) +{ + LLVector4a lenSqrd; + lenSqrd.setAllDot3(v, v); + + mQ = _mm_sqrt_ps(lenSqrd.mQ); +} + +// Get this vector's length +inline LLSimdScalar LLVector4a::getLength3() const +{ + return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) ); +} + +// Set the components of this vector to the minimum of the corresponding components of lhs and rhs +inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs) +{ + mQ = _mm_min_ps(lhs.mQ, rhs.mQ); +} + +// Set the components of this vector to the maximum of the corresponding components of lhs and rhs +inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs) +{ + mQ = _mm_max_ps(lhs.mQ, rhs.mQ); +} + +// Set this to (c * lhs) + rhs * ( 1 - c) +inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c) +{ + LLVector4a a = lhs; + a.mul(c); + + LLVector4a b = rhs; + b.mul(1.f-c); + + setAdd(a, b); +} + +inline LLBool32 LLVector4a::isFinite3() const +{ + static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask); + const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV ); + const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV )); + return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ ); +} + +inline LLBool32 LLVector4a::isFinite4() const +{ + static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask); + const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV ); + const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV )); + return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW ); +} + +inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec ) +{ + LLRotation inv; inv.setTranspose( rot ); + setRotated( inv, vec ); +} + +inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec ) +{ + LLQuaternion2 invRot; invRot.setConjugate( quat ); + setRotated(invRot, vec); +} + +inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high ) +{ + const LLVector4Logical highMask = greaterThan( high ); + const LLVector4Logical lowMask = lessThan( low ); + + setSelectWithMask( highMask, high, *this ); + setSelectWithMask( lowMask, low, *this ); +} + + +//////////////////////////////////// +// LOGICAL +//////////////////////////////////// +// The functions in this section will compare the elements in this vector +// to those in rhs and return an LLVector4Logical with all bits set in elements +// where the comparison was true and all bits unset in elements where the comparison +// was false. See llvector4logica.h +//////////////////////////////////// +// WARNING: Other than equals3 and equals4, these functions do NOT account +// for floating point tolerance. You should include the appropriate tolerance +// in the inputs. +//////////////////////////////////// + +inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const +{ + return _mm_cmpgt_ps(mQ, rhs.mQ); +} + +inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const +{ + return _mm_cmplt_ps(mQ, rhs.mQ); +} + +inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const +{ + return _mm_cmpge_ps(mQ, rhs.mQ); +} + +inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const +{ + return _mm_cmple_ps(mQ, rhs.mQ); +} + +inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const +{ + return _mm_cmpeq_ps(mQ, rhs.mQ); +} + +// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance +inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const +{ + LLVector4a diff; diff.setSub( *this, rhs ); + diff.setAbs( diff ); + const LLQuad tol = _mm_set1_ps( tolerance ); + const LLQuad cmp = _mm_cmplt_ps( diff, tol ); + return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW; +} + +inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const +{ + LLVector4a diff; diff.setSub( *this, rhs ); + diff.setAbs( diff ); + const LLQuad tol = _mm_set1_ps( tolerance ); + const LLQuad t = _mm_cmplt_ps( diff, tol ); + return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ; + +} + +//////////////////////////////////// +// OPERATORS +//////////////////////////////////// + +// Do NOT add aditional operators without consulting someone with SSE experience +inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs ) +{ + mQ = rhs.mQ; + return *this; +} + +inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs ) +{ + mQ = rhs; + return *this; +} + +inline LLVector4a::operator LLQuad() const +{ + return mQ; +} |