/** * @file llvector4a.cpp * @brief SIMD vector implementation * * $LicenseInfo:firstyear=2010&license=viewerlgpl$ * Second Life Viewer Source Code * Copyright (C) 2010, Linden Research, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License only. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA * $/LicenseInfo$ */ #include "llmemory.h" #include "llmath.h" #include "llquantize.h" extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 }; extern const LLQuad F_APPROXIMATELY_ZERO_4A = { F_APPROXIMATELY_ZERO, F_APPROXIMATELY_ZERO, F_APPROXIMATELY_ZERO, F_APPROXIMATELY_ZERO }; extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A ); extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A ); /*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes) { assert(src != NULL); assert(dst != NULL); assert(bytes > 0); assert((bytes % sizeof(F32))== 0); ll_assert_aligned(src,16); ll_assert_aligned(dst,16); assert(bytes%16==0); F32* end = dst + (bytes / sizeof(F32) ); if (bytes > 64) { F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst); //at least 64 (16*4) bytes before the end of the destination, switch to 16 byte copies F32* end_64 = end-16; _mm_prefetch((char*)begin_64, _MM_HINT_NTA); _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA); _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA); _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA); while (dst < begin_64) { copy4a(dst, src); dst += 4; src += 4; } while (dst < end_64) { _mm_prefetch((char*)src + 512, _MM_HINT_NTA); _mm_prefetch((char*)dst + 512, _MM_HINT_NTA); copy4a(dst, src); copy4a(dst+4, src+4); copy4a(dst+8, src+8); copy4a(dst+12, src+12); dst += 16; src += 16; } } while (dst < end) { copy4a(dst, src); dst += 4; src += 4; } } void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec ) { const LLVector4a col0 = rot.getColumn(0); const LLVector4a col1 = rot.getColumn(1); const LLVector4a col2 = rot.getColumn(2); LLVector4a result = _mm_load_ss( vec.getF32ptr() ); result.splat<0>( result ); result.mul( col0 ); { LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 ); yyyy.splat<0>( yyyy ); yyyy.mul( col1 ); result.add( yyyy ); } { LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 ); zzzz.splat<0>( zzzz ); zzzz.mul( col2 ); result.add( zzzz ); } *this = result; } void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec ) { const LLVector4a& quatVec = quat.getVector4a(); LLVector4a temp; temp.setCross3(quatVec, vec); temp.add( temp ); const LLVector4a realPart( quatVec.getScalarAt<3>() ); LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart ); mQ = vec; add( tempTimesReal ); LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp ); add(imagCrossTemp); } void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high ) { LLVector4a val(mQ); LLVector4a delta; delta.setSub( high, low ); { val.clamp(low, high); val.sub(low); // 8-bit quantization means we can do with just 12 bits of reciprocal accuracy const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ); // { // static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; // LLVector4a two; two.load4a( F_TWO_4A ); // // // Here we use _mm_rcp_ps plus one round of newton-raphson // // We wish to find 'x' such that x = 1/delta // // As a first approximation, we take x0 = _mm_rcp_ps(delta) // // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) // // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf // const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); // oneOverDelta.setMul( delta, recipApprox ); // oneOverDelta.setSub( two, oneOverDelta ); // oneOverDelta.mul( recipApprox ); // } val.mul(oneOverDelta); val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A)); } val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); { val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); val.mul(delta); val.add(low); } { LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); LLVector4a absVal; absVal.setAbs( val ); setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); } } void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high ) { LLVector4a val(mQ); LLVector4a delta; delta.setSub( high, low ); { val.clamp(low, high); val.sub(low); // 16-bit quantization means we need a round of Newton-Raphson LLVector4a oneOverDelta; { static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; ll_assert_aligned(F_TWO_4A,16); LLVector4a two; two.load4a( F_TWO_4A ); // Here we use _mm_rcp_ps plus one round of newton-raphson // We wish to find 'x' such that x = 1/delta // As a first approximation, we take x0 = _mm_rcp_ps(delta) // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); oneOverDelta.setMul( delta, recipApprox ); oneOverDelta.setSub( two, oneOverDelta ); oneOverDelta.mul( recipApprox ); } val.mul(oneOverDelta); val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A)); } val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); { val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); val.mul(delta); val.add(low); } { LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); LLVector4a absVal; absVal.setAbs( val ); setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); } }