diff options
author | Ansariel <ansariel.hiller@phoenixviewer.com> | 2024-05-22 19:04:52 +0200 |
---|---|---|
committer | Ansariel <ansariel.hiller@phoenixviewer.com> | 2024-05-22 19:04:52 +0200 |
commit | 1b67dd855c41f5a0cda7ec2a68d98071986ca703 (patch) | |
tree | ab243607f74f78200787bba5b9b88f07ef1b966f /indra/llmath/llvector4a.cpp | |
parent | 6d6eabca44d08d5b97bfe3e941d2b9687c2246ea (diff) | |
parent | e1623bb276f83a43ce7a197e388720c05bdefe61 (diff) |
Merge remote-tracking branch 'origin/main' into DRTVWR-600-maint-A
# Conflicts:
# autobuild.xml
# indra/cmake/CMakeLists.txt
# indra/cmake/GoogleMock.cmake
# indra/llaudio/llaudioengine_fmodstudio.cpp
# indra/llaudio/llaudioengine_fmodstudio.h
# indra/llaudio/lllistener_fmodstudio.cpp
# indra/llaudio/lllistener_fmodstudio.h
# indra/llaudio/llstreamingaudio_fmodstudio.cpp
# indra/llaudio/llstreamingaudio_fmodstudio.h
# indra/llcharacter/llmultigesture.cpp
# indra/llcharacter/llmultigesture.h
# indra/llimage/llimage.cpp
# indra/llimage/llimagepng.cpp
# indra/llimage/llimageworker.cpp
# indra/llimage/tests/llimageworker_test.cpp
# indra/llmessage/tests/llmockhttpclient.h
# indra/llprimitive/llgltfmaterial.h
# indra/llrender/llfontfreetype.cpp
# indra/llui/llcombobox.cpp
# indra/llui/llfolderview.cpp
# indra/llui/llfolderviewmodel.h
# indra/llui/lllineeditor.cpp
# indra/llui/lllineeditor.h
# indra/llui/lltextbase.cpp
# indra/llui/lltextbase.h
# indra/llui/lltexteditor.cpp
# indra/llui/lltextvalidate.cpp
# indra/llui/lltextvalidate.h
# indra/llui/lluictrl.h
# indra/llui/llview.cpp
# indra/llwindow/llwindowmacosx.cpp
# indra/newview/app_settings/settings.xml
# indra/newview/llappearancemgr.cpp
# indra/newview/llappearancemgr.h
# indra/newview/llavatarpropertiesprocessor.cpp
# indra/newview/llavatarpropertiesprocessor.h
# indra/newview/llbreadcrumbview.cpp
# indra/newview/llbreadcrumbview.h
# indra/newview/llbreastmotion.cpp
# indra/newview/llbreastmotion.h
# indra/newview/llconversationmodel.h
# indra/newview/lldensityctrl.cpp
# indra/newview/lldensityctrl.h
# indra/newview/llface.inl
# indra/newview/llfloatereditsky.cpp
# indra/newview/llfloatereditwater.cpp
# indra/newview/llfloateremojipicker.h
# indra/newview/llfloaterimsessiontab.cpp
# indra/newview/llfloaterprofiletexture.cpp
# indra/newview/llfloaterprofiletexture.h
# indra/newview/llgesturemgr.cpp
# indra/newview/llgesturemgr.h
# indra/newview/llimpanel.cpp
# indra/newview/llimpanel.h
# indra/newview/llinventorybridge.cpp
# indra/newview/llinventorybridge.h
# indra/newview/llinventoryclipboard.cpp
# indra/newview/llinventoryclipboard.h
# indra/newview/llinventoryfunctions.cpp
# indra/newview/llinventoryfunctions.h
# indra/newview/llinventorygallery.cpp
# indra/newview/lllistbrowser.cpp
# indra/newview/lllistbrowser.h
# indra/newview/llpanelobjectinventory.cpp
# indra/newview/llpanelprofile.cpp
# indra/newview/llpanelprofile.h
# indra/newview/llpreviewgesture.cpp
# indra/newview/llsavedsettingsglue.cpp
# indra/newview/llsavedsettingsglue.h
# indra/newview/lltooldraganddrop.cpp
# indra/newview/llurllineeditorctrl.cpp
# indra/newview/llvectorperfoptions.cpp
# indra/newview/llvectorperfoptions.h
# indra/newview/llviewerparceloverlay.cpp
# indra/newview/llviewertexlayer.cpp
# indra/newview/llviewertexturelist.cpp
# indra/newview/macmain.h
# indra/test/test.cpp
Diffstat (limited to 'indra/llmath/llvector4a.cpp')
-rw-r--r-- | indra/llmath/llvector4a.cpp | 260 |
1 files changed, 130 insertions, 130 deletions
diff --git a/indra/llmath/llvector4a.cpp b/indra/llmath/llvector4a.cpp index 570fa41a43..0ac91366b6 100644 --- a/indra/llmath/llvector4a.cpp +++ b/indra/llmath/llvector4a.cpp @@ -1,25 +1,25 @@ -/** +/** * @file llvector4a.cpp * @brief SIMD vector implementation * * $LicenseInfo:firstyear=2010&license=viewerlgpl$ * Second Life Viewer Source Code * Copyright (C) 2010, Linden Research, Inc. - * + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License only. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * + * * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA * $/LicenseInfo$ */ @@ -28,12 +28,12 @@ #include "llmath.h" #include "llquantize.h" -extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 }; -extern const LLQuad F_APPROXIMATELY_ZERO_4A = { - F_APPROXIMATELY_ZERO, - F_APPROXIMATELY_ZERO, - F_APPROXIMATELY_ZERO, - F_APPROXIMATELY_ZERO +extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 }; +extern const LLQuad F_APPROXIMATELY_ZERO_4A = { + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO, + F_APPROXIMATELY_ZERO }; extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A ); @@ -46,135 +46,135 @@ extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec ) { - const LLVector4a col0 = rot.getColumn(0); - const LLVector4a col1 = rot.getColumn(1); - const LLVector4a col2 = rot.getColumn(2); - - LLVector4a result = _mm_load_ss( vec.getF32ptr() ); - result.splat<0>( result ); - result.mul( col0 ); - - { - LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 ); - yyyy.splat<0>( yyyy ); - yyyy.mul( col1 ); - result.add( yyyy ); - } - - { - LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 ); - zzzz.splat<0>( zzzz ); - zzzz.mul( col2 ); - result.add( zzzz ); - } - - *this = result; + const LLVector4a col0 = rot.getColumn(0); + const LLVector4a col1 = rot.getColumn(1); + const LLVector4a col2 = rot.getColumn(2); + + LLVector4a result = _mm_load_ss( vec.getF32ptr() ); + result.splat<0>( result ); + result.mul( col0 ); + + { + LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 ); + yyyy.splat<0>( yyyy ); + yyyy.mul( col1 ); + result.add( yyyy ); + } + + { + LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 ); + zzzz.splat<0>( zzzz ); + zzzz.mul( col2 ); + result.add( zzzz ); + } + + *this = result; } void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec ) { - const LLVector4a& quatVec = quat.getVector4a(); - LLVector4a temp; temp.setCross3(quatVec, vec); - temp.add( temp ); - - const LLVector4a realPart( quatVec.getScalarAt<3>() ); - LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart ); - - mQ = vec; - add( tempTimesReal ); - - LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp ); - add(imagCrossTemp); + const LLVector4a& quatVec = quat.getVector4a(); + LLVector4a temp; temp.setCross3(quatVec, vec); + temp.add( temp ); + + const LLVector4a realPart( quatVec.getScalarAt<3>() ); + LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart ); + + mQ = vec; + add( tempTimesReal ); + + LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp ); + add(imagCrossTemp); } void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high ) { - LLVector4a val(mQ); - LLVector4a delta; delta.setSub( high, low ); - - { - val.clamp(low, high); - val.sub(low); - - // 8-bit quantization means we can do with just 12 bits of reciprocal accuracy - const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ); -// { -// static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; -// LLVector4a two; two.load4a( F_TWO_4A ); -// -// // Here we use _mm_rcp_ps plus one round of newton-raphson -// // We wish to find 'x' such that x = 1/delta -// // As a first approximation, we take x0 = _mm_rcp_ps(delta) -// // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) -// // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf -// const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); -// oneOverDelta.setMul( delta, recipApprox ); -// oneOverDelta.setSub( two, oneOverDelta ); -// oneOverDelta.mul( recipApprox ); -// } - - val.mul(oneOverDelta); - val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A)); - } - - val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); - - { - val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); - val.mul(delta); - val.add(low); - } - - { - LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); - LLVector4a absVal; absVal.setAbs( val ); - setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); - } + LLVector4a val(mQ); + LLVector4a delta; delta.setSub( high, low ); + + { + val.clamp(low, high); + val.sub(low); + + // 8-bit quantization means we can do with just 12 bits of reciprocal accuracy + const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ); +// { +// static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; +// LLVector4a two; two.load4a( F_TWO_4A ); +// +// // Here we use _mm_rcp_ps plus one round of newton-raphson +// // We wish to find 'x' such that x = 1/delta +// // As a first approximation, we take x0 = _mm_rcp_ps(delta) +// // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) +// // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf +// const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); +// oneOverDelta.setMul( delta, recipApprox ); +// oneOverDelta.setSub( two, oneOverDelta ); +// oneOverDelta.mul( recipApprox ); +// } + + val.mul(oneOverDelta); + val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A)); + } + + val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + + { + val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); + val.mul(delta); + val.add(low); + } + + { + LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A)); + LLVector4a absVal; absVal.setAbs( val ); + setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); + } } void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high ) { - LLVector4a val(mQ); - LLVector4a delta; delta.setSub( high, low ); - - { - val.clamp(low, high); - val.sub(low); - - // 16-bit quantization means we need a round of Newton-Raphson - LLVector4a oneOverDelta; - { - static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; - ll_assert_aligned(F_TWO_4A,16); - - LLVector4a two; two.load4a( F_TWO_4A ); - - // Here we use _mm_rcp_ps plus one round of newton-raphson - // We wish to find 'x' such that x = 1/delta - // As a first approximation, we take x0 = _mm_rcp_ps(delta) - // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) - // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf - const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); - oneOverDelta.setMul( delta, recipApprox ); - oneOverDelta.setSub( two, oneOverDelta ); - oneOverDelta.mul( recipApprox ); - } - - val.mul(oneOverDelta); - val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A)); - } - - val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); - - { - val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); - val.mul(delta); - val.add(low); - } - - { - LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); - LLVector4a absVal; absVal.setAbs( val ); - setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); - } + LLVector4a val(mQ); + LLVector4a delta; delta.setSub( high, low ); + + { + val.clamp(low, high); + val.sub(low); + + // 16-bit quantization means we need a round of Newton-Raphson + LLVector4a oneOverDelta; + { + static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f }; + ll_assert_aligned(F_TWO_4A,16); + + LLVector4a two; two.load4a( F_TWO_4A ); + + // Here we use _mm_rcp_ps plus one round of newton-raphson + // We wish to find 'x' such that x = 1/delta + // As a first approximation, we take x0 = _mm_rcp_ps(delta) + // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 ) + // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf + const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ); + oneOverDelta.setMul( delta, recipApprox ); + oneOverDelta.setSub( two, oneOverDelta ); + oneOverDelta.mul( recipApprox ); + } + + val.mul(oneOverDelta); + val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A)); + } + + val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ )); + + { + val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); + val.mul(delta); + val.add(low); + } + + { + LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A)); + LLVector4a absVal; absVal.setAbs( val ); + setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val ); + } } |