1 files changed, 32 insertions, 7 deletions
diff --git a/indra/llmath/llvector4a.inl b/indra/llmath/llvector4a.inl
index 36dbec078c..0f7c4123ac 100644
--- a/indra/llmath/llvector4a.inl
+++ b/indra/llmath/llvector4a.inl
@@ -115,7 +115,7 @@ inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
 // Set to all zeros
 inline void LLVector4a::clear()
 {
-    mQ = LLVector4a::getZero().mQ;
+    mQ = _mm_setzero_ps(); // zero every lane directly instead of copying from LLVector4a::getZero()
 }
 
 inline void LLVector4a::splat(const F32 x)
@@ -272,6 +272,9 @@ inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
 // Set all elements to the dot product of the x, y, and z elements in a and b
 inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    mQ = _mm_dp_ps(a.mQ, b.mQ, 0x7f); // mask 0x7f: sum the x, y, z products (high nibble 0x7), write the sum to all four lanes (low nibble 0xf)
+#else
     // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
     const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
     // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
@@ -284,11 +287,15 @@ inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
     const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
     // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
     mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
+#endif // __arm64__ || __aarch64__
 }
 
 // Set all elements to the dot product of the x, y, z, and w elements in a and b
 inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    mQ = _mm_dp_ps(a.mQ, b.mQ, 0xff); // mask 0xff: sum all four products (high nibble 0xf), write the sum to all four lanes (low nibble 0xf)
+#else
     // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
     const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
     // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
@@ -301,21 +308,29 @@ inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
 
     // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
     mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
+#endif // __arm64__ || __aarch64__
 }
 
 // Return the 3D dot product of this vector and b
 inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    return _mm_dp_ps(mQ, b.mQ, 0x7f); // mask 0x7f: sum the x, y, z products, write the sum to all four lanes
+#else
     const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
     const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
     const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
     const LLQuad xPlusY = _mm_add_ps( ab, splatY );
     return _mm_add_ps( xPlusY, splatZ );
+#endif // __arm64__ || __aarch64__
 }
 
 // Return the 4D dot product of this vector and b
 inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
 {
+#if (defined(__arm64__) || defined(__aarch64__))
+    return _mm_dp_ps(mQ, b.mQ, 0xff); // mask 0xff: sum all four products, write the sum to all four lanes
+#else
     // ab = { w, z, y, x }
     const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
     // upperProdsInLowerElems = { y, x, y, x }
@@ -325,6 +340,7 @@ inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
     // shuffled = { z+x, z+x, z+x, z+x }
     const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
     return _mm_add_ss( sumOfPairs, shuffled );
+#endif // __arm64__ || __aarch64__
 }
 
 // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
@@ -335,8 +351,13 @@ inline void LLVector4a::normalize3()
     LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
     // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
     const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+#if defined(_M_ARM64) // MSVC ARM64 build: LLQuad constants are filled through the n128_f32 member
+    static const LLQuad half = {.n128_f32 = {0.5f, 0.5f, 0.5f, 0.5f}};
+    static const LLQuad three = {.n128_f32 = {3.f, 3.f, 3.f, 3.f }};
+#else
     static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
     static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+#endif // defined(_M_ARM64)
     // Now we do one round of Newton-Raphson approximation to get full accuracy
     // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
     // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
@@ -359,8 +380,13 @@ inline void LLVector4a::normalize4()
     LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
     // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
     const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+#if defined(_M_ARM64) // MSVC ARM64 build: LLQuad constants are filled through the n128_f32 member
+    static const LLQuad half = {.n128_f32 = {0.5f, 0.5f, 0.5f, 0.5f}};
+    static const LLQuad three = {.n128_f32 = {3.f, 3.f, 3.f, 3.f}};
+#else
     static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
     static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+#endif // defined(_M_ARM64)
     // Now we do one round of Newton-Raphson approximation to get full accuracy
     // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
     // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
@@ -383,8 +409,13 @@ inline LLSimdScalar LLVector4a::normalize3withLength()
     LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
     // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
     const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
+#if defined(_M_ARM64) // MSVC ARM64 build: LLQuad constants are filled through the n128_f32 member
+    static const LLQuad half = {.n128_f32 = {0.5f, 0.5f, 0.5f, 0.5f}};
+    static const LLQuad three = {.n128_f32 = {3.f, 3.f, 3.f, 3.f}};
+#else
     static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
     static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
+#endif // defined(_M_ARM64)
     // Now we do one round of Newton-Raphson approximation to get full accuracy
     // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
     // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
@@ -593,12 +624,6 @@ inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
 ////////////////////////////////////
 
 // Do NOT add aditional operators without consulting someone with SSE experience
-inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
-{
-    mQ = rhs.mQ;
-    return *this;
-}
-
 inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
 {
     mQ = rhs;