From cde5d29faf84c5cb7fc1b0d0ff6d03f3b7354c8f Mon Sep 17 00:00:00 2001
From: RunitaiLinden <davep@lindenlab.com>
Date: Tue, 10 Sep 2024 18:27:45 -0500
Subject: Profile guided optimizations

---
 indra/llcommon/llstrider.h        |  7 ++++
 indra/llrender/llimagegl.cpp      | 52 +++++++++++++++++++++-----
 indra/llrender/llvertexbuffer.cpp | 59 +++++++++++++++++++++++++++++-
 indra/llrender/llvertexbuffer.h   | 12 ++++++
 indra/newview/llface.cpp          | 77 +++++++++++++++++++++++++--------------
 indra/newview/llskinningutil.cpp  |  4 +-
 indra/newview/llviewertexture.cpp | 45 -----------------------
 indra/newview/llvoavatar.cpp      | 16 +++++---
 indra/newview/llvovolume.cpp      | 23 ++----------
 9 files changed, 182 insertions(+), 113 deletions(-)

(limited to 'indra')

diff --git a/indra/llcommon/llstrider.h b/indra/llcommon/llstrider.h
index 06cf8d3480..d756aca62b 100644
--- a/indra/llcommon/llstrider.h
+++ b/indra/llcommon/llstrider.h
@@ -41,6 +41,13 @@ public:
     LLStrider(Object* first) { mObjectp = first; mSkip = sizeof(Object); }
     ~LLStrider() { }
 
+    const LLStrider<Object>& operator=(const LLStrider<Object>& rhs)
+    {
+        mBytep = rhs.mBytep;
+        mSkip = rhs.mSkip;
+        return *this;
+    }
+
     const LLStrider<Object>& operator =  (Object *first)    { mObjectp = first; return *this;}
     void setStride (S32 skipBytes)  { mSkip = (skipBytes ? skipBytes : sizeof(Object));}
 
diff --git a/indra/llrender/llimagegl.cpp b/indra/llrender/llimagegl.cpp
index 68c20048ec..67b4ada62f 100644
--- a/indra/llrender/llimagegl.cpp
+++ b/indra/llrender/llimagegl.cpp
@@ -1045,15 +1045,47 @@ void sub_image_lines(U32 target, S32 miplevel, S32 x_offset, S32 y_offset, S32 w
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_TEXTURE;
 
+    LL_PROFILE_ZONE_NUM(width);
+    LL_PROFILE_ZONE_NUM(height);
+
     U32 components = LLImageGL::dataFormatComponents(pixformat);
     U32 type_width = type_width_from_pixtype(pixtype);
 
     const U32 line_width = data_width * components * type_width;
     const U32 y_offset_end = y_offset + height;
-    for (U32 y_pos = y_offset; y_pos < y_offset_end; ++y_pos)
+
+    if (width == data_width && height % 32 == 0)
+    {
+        LL_PROFILE_ZONE_NAMED_CATEGORY_TEXTURE("subimage - batched lines");
+
+        // full width, batch multiple lines at a time
+        // set batch size based on width
+        U32 batch_size = 32;
+
+        if (width > 1024)
+        {
+            batch_size = 8;
+        }
+        else if (width > 512)
+        {
+            batch_size = 16;
+        }
+
+        // full width texture, do 32 lines at a time
+        for (U32 y_pos = y_offset; y_pos < y_offset_end; y_pos += batch_size)
+        {
+            glTexSubImage2D(target, miplevel, x_offset, y_pos, width, batch_size, pixformat, pixtype, src);
+            src += line_width * batch_size;
+        }
+    }
+    else
     {
-        glTexSubImage2D(target, miplevel, x_offset, y_pos, width, 1, pixformat, pixtype, src);
-        src += line_width;
+        // partial width or strange height
+        for (U32 y_pos = y_offset; y_pos < y_offset_end; y_pos += 1)
+        {
+            glTexSubImage2D(target, miplevel, x_offset, y_pos, width, 1, pixformat, pixtype, src);
+            src += line_width;
+        }
     }
 }
 
@@ -2139,6 +2171,8 @@ void LLImageGL::analyzeAlpha(const void* data_in, U32 w, U32 h)
         return ;
     }
 
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_TEXTURE;
+
     U32 length = w * h;
     U32 alphatotal = 0;
 
@@ -2150,15 +2184,15 @@ void LLImageGL::analyzeAlpha(const void* data_in, U32 w, U32 h)
     // this will mid-skew the data (and thus increase the chances of not
     // being used as a mask) from high-frequency alpha maps which
     // suffer the worst from aliasing when used as alpha masks.
-    if (w >= 2 && h >= 2)
+    if (w >= 4 && h >= 4)
     {
-        llassert(w%2 == 0);
-        llassert(h%2 == 0);
+        llassert(w%4 == 0);
+        llassert(h%4 == 0);
         const GLubyte* rowstart = ((const GLubyte*) data_in) + mAlphaOffset;
-        for (U32 y = 0; y < h; y+=2)
+        for (U32 y = 0; y < h; y+=4)
         {
             const GLubyte* current = rowstart;
-            for (U32 x = 0; x < w; x+=2)
+            for (U32 x = 0; x < w; x+=4)
             {
                 const U32 s1 = current[0];
                 alphatotal += s1;
@@ -2182,7 +2216,7 @@ void LLImageGL::analyzeAlpha(const void* data_in, U32 w, U32 h)
             }
 
 
-            rowstart += 2 * w * mAlphaStride;
+            rowstart += 4 * w * mAlphaStride;
         }
         length *= 2; // we sampled everything twice, essentially
     }
diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp
index 156e300853..c1f239fc43 100644
--- a/indra/llrender/llvertexbuffer.cpp
+++ b/indra/llrender/llvertexbuffer.cpp
@@ -954,6 +954,25 @@ LLVertexBuffer::LLVertexBuffer(U32 typemask)
     }
 }
 
+// list of mapped buffers
+// NOTE: must not be LLPointer<LLVertexBuffer> to avoid breaking non-ref-counted LLVertexBuffer instances
+static std::vector<LLVertexBuffer*> sMappedBuffers;
+
+//static
+void LLVertexBuffer::flushBuffers()
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    // must only be called from main thread
+    llassert(LLCoros::on_main_thread_main_coro());
+    for (auto& buffer : sMappedBuffers)
+    {
+        buffer->_unmapBuffer();
+        buffer->mMapped = false;
+    }
+
+    sMappedBuffers.resize(0);
+}
+
 //static
 U32 LLVertexBuffer::calcOffsets(const U32& typemask, U32* offsets, U32 num_vertices)
 {
@@ -997,6 +1016,12 @@ U32 LLVertexBuffer::calcVertexSize(const U32& typemask)
 //virtual
 LLVertexBuffer::~LLVertexBuffer()
 {
+    if (mMapped)
+    { // is on the mapped buffer list but doesn't need to be flushed
+        mMapped = false;
+        unmapBuffer();
+    }
+
     destroyGLBuffer();
     destroyGLIndices();
 
@@ -1198,6 +1223,7 @@ bool expand_region(LLVertexBuffer::MappedRegion& region, U32 start, U32 end)
 U8* LLVertexBuffer::mapVertexBuffer(LLVertexBuffer::AttributeType type, U32 index, S32 count)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    _mapBuffer();
 
     if (count == -1)
     {
@@ -1233,6 +1259,7 @@ U8* LLVertexBuffer::mapVertexBuffer(LLVertexBuffer::AttributeType type, U32 inde
 U8* LLVertexBuffer::mapIndexBuffer(U32 index, S32 count)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    _mapBuffer();
 
     if (count == -1)
     {
@@ -1289,11 +1316,11 @@ void LLVertexBuffer::flush_vbo(GLenum target, U32 start, U32 end, void* data, U8
         LL_PROFILE_ZONE_NUM(end);
         LL_PROFILE_ZONE_NUM(end-start);
 
-        constexpr U32 block_size = 8192;
+        constexpr U32 block_size = 65536;
 
         for (U32 i = start; i <= end; i += block_size)
         {
-            LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
+            //LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
             //LL_PROFILE_GPU_ZONE("glBufferSubData");
             U32 tend = llmin(i + block_size, end);
             U32 size = tend - i + 1;
@@ -1304,8 +1331,29 @@ void LLVertexBuffer::flush_vbo(GLenum target, U32 start, U32 end, void* data, U8
 }
 
 void LLVertexBuffer::unmapBuffer()
+{
+    flushBuffers();
+}
+
+void LLVertexBuffer::_mapBuffer()
+{
+    // must only be called from main thread
+    llassert(LLCoros::on_main_thread_main_coro());
+    if (!mMapped)
+    {
+        mMapped = true;
+        sMappedBuffers.push_back(this);
+    }
+}
+
+void LLVertexBuffer::_unmapBuffer()
 {
     STOP_GLERROR;
+    if (!mMapped)
+    {
+        return;
+    }
+
     struct SortMappedRegion
     {
         bool operator()(const MappedRegion& lhs, const MappedRegion& rhs)
@@ -1549,6 +1597,13 @@ void LLVertexBuffer::setBuffer()
         return;
     }
 #endif
+
+    if (mMapped)
+    {
+        LL_WARNS() << "Missing call to unmapBuffer or flushBuffers" << LL_ENDL;
+        _unmapBuffer();
+    }
+
     // no data may be pending
     llassert(mMappedVertexRegions.empty());
     llassert(mMappedIndexRegions.empty());
diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h
index 2a4affdc60..9fe468f89e 100644
--- a/indra/llrender/llvertexbuffer.h
+++ b/indra/llrender/llvertexbuffer.h
@@ -120,6 +120,9 @@ public:
     // indexed by the following enum
     static U32 calcOffsets(const U32& typemask, U32* offsets, U32 num_vertices);
 
+    // flush any pending mapped buffers
+    static void flushBuffers();
+
     //WARNING -- when updating these enums you MUST
     // 1 - update LLVertexBuffer::sTypeSize
     // 2 - update LLVertexBuffer::vb_type_name
@@ -190,6 +193,8 @@ public:
     // map for data access (see also getFooStrider below)
     U8*     mapVertexBuffer(AttributeType type, U32 index, S32 count = -1);
     U8*     mapIndexBuffer(U32 index, S32 count = -1);
+
+    // synonym for flushBuffers
     void    unmapBuffer();
 
     // set for rendering
@@ -312,6 +317,13 @@ private:
 
     bool    allocateBuffer(S32 nverts, S32 nindices, bool create) { return allocateBuffer(nverts, nindices); }
 
+    // actually unmap buffer
+    void _unmapBuffer();
+
+    // add to set of mapped buffers
+    void _mapBuffer();
+    bool mMapped = false;
+
 public:
 
     static U64 getBytesAllocated();
diff --git a/indra/newview/llface.cpp b/indra/newview/llface.cpp
index ccfef09b09..a1ec75e34b 100644
--- a/indra/newview/llface.cpp
+++ b/indra/newview/llface.cpp
@@ -1741,7 +1741,7 @@ bool LLFace::getGeometryVolume(const LLVolume& volume,
             { //bump mapped or has material, just do the whole expensive loop
                 LL_PROFILE_ZONE_NAMED_CATEGORY_FACE("getGeometryVolume - texgen default");
 
-                std::vector<LLVector2> bump_tc;
+                LLStrider<LLVector2> bump_tc;
 
                 if (mat && !mat->getNormalID().isNull())
                 { //writing out normal and specular texture coordinates, not bump offsets
@@ -1803,49 +1803,70 @@ bool LLFace::getGeometryVolume(const LLVolume& volume,
                     }
                     const bool do_xform = (xforms & xform_channel) != XFORM_NONE;
 
+                    // hold onto strider to front of TC array for use later
+                    bump_tc = dst;
 
-                    for (S32 i = 0; i < num_vertices; i++)
                     {
-                        LLVector2 tc(vf.mTexCoords[i]);
-
-                        LLVector4a& norm = vf.mNormals[i];
-
-                        LLVector4a& center = *(vf.mCenter);
-
-                        if (texgen != LLTextureEntry::TEX_GEN_DEFAULT)
+                        // NOTE: split TEX_GEN_PLANAR implementation to reduce branchiness of inner loop
+                        // These are per-vertex operations and every little bit counts
+                        if (texgen == LLTextureEntry::TEX_GEN_PLANAR)
                         {
-                            LLVector4a vec = vf.mPositions[i];
+                            LL_PROFILE_ZONE_NAMED_CATEGORY_FACE("tgd - planar");
+                            for (S32 i = 0; i < num_vertices; i++)
+                            {
+                                LLVector2 tc(vf.mTexCoords[i]);
+                                LLVector4a& norm = vf.mNormals[i];
+                                LLVector4a& center = *(vf.mCenter);
+                                LLVector4a vec = vf.mPositions[i];
 
-                            vec.mul(scalea);
+                                vec.mul(scalea);
 
-                            if (texgen == LLTextureEntry::TEX_GEN_PLANAR)
-                            {
                                 planarProjection(tc, norm, center, vec);
-                            }
-                        }
 
-                        if (tex_mode && mTextureMatrix)
-                        {
-                            LLVector3 tmp(tc.mV[0], tc.mV[1], 0.f);
-                            tmp = tmp * *mTextureMatrix;
-                            tc.mV[0] = tmp.mV[0];
-                            tc.mV[1] = tmp.mV[1];
+                                if (tex_mode && mTextureMatrix)
+                                {
+                                    LLVector3 tmp(tc.mV[0], tc.mV[1], 0.f);
+                                    tmp = tmp * *mTextureMatrix;
+                                    tc.mV[0] = tmp.mV[0];
+                                    tc.mV[1] = tmp.mV[1];
+                                }
+                                else if (do_xform)
+                                {
+                                    xform(tc, cos_ang, sin_ang, os, ot, ms, mt);
+                                }
+
+                                *dst++ = tc;
+                            }
                         }
-                        else if (do_xform)
+                        else
                         {
-                            xform(tc, cos_ang, sin_ang, os, ot, ms, mt);
-                        }
+                            LL_PROFILE_ZONE_NAMED_CATEGORY_FACE("tgd - transform");
 
-                        *dst++ = tc;
-                        if (do_bump)
-                        {
-                            bump_tc.push_back(tc);
+                            for (S32 i = 0; i < num_vertices; i++)
+                            {
+                                LLVector2 tc(vf.mTexCoords[i]);
+
+                                if (tex_mode && mTextureMatrix)
+                                {
+                                    LLVector3 tmp(tc.mV[0], tc.mV[1], 0.f);
+                                    tmp = tmp * *mTextureMatrix;
+                                    tc.mV[0] = tmp.mV[0];
+                                    tc.mV[1] = tmp.mV[1];
+                                }
+                                else if (do_xform)
+                                {
+                                    xform(tc, cos_ang, sin_ang, os, ot, ms, mt);
+                                }
+
+                                *dst++ = tc;
+                            }
                         }
                     }
                 }
 
                 if ((!mat && !gltf_mat) && do_bump)
                 {
+                    LL_PROFILE_ZONE_NAMED_CATEGORY_FACE("tgd - do bump");
                     mVertexBuffer->getTexCoord1Strider(tex_coords1, mGeomIndex, mGeomCount);
 
                     mVObjp->getVolume()->genTangents(face_index);
diff --git a/indra/newview/llskinningutil.cpp b/indra/newview/llskinningutil.cpp
index 9b4ed4c946..1c92e06700 100644
--- a/indra/newview/llskinningutil.cpp
+++ b/indra/newview/llskinningutil.cpp
@@ -315,11 +315,9 @@ void LLSkinningUtil::initJointNums(LLMeshSkinInfo* skin, LLVOAvatar *avatar)
     }
 }
 
-static LLTrace::BlockTimerStatHandle FTM_FACE_RIGGING_INFO("Face Rigging Info");
-
 void LLSkinningUtil::updateRiggingInfo(const LLMeshSkinInfo* skin, LLVOAvatar *avatar, LLVolumeFace& vol_face)
 {
-    LL_RECORD_BLOCK_TIME(FTM_FACE_RIGGING_INFO);
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_AVATAR;
 
     if (vol_face.mJointRiggingInfoTab.needsUpdate())
     {
diff --git a/indra/newview/llviewertexture.cpp b/indra/newview/llviewertexture.cpp
index 681d91c945..9e1cb84bd1 100644
--- a/indra/newview/llviewertexture.cpp
+++ b/indra/newview/llviewertexture.cpp
@@ -1361,51 +1361,6 @@ void LLViewerFetchedTexture::addToCreateTexture()
     }
     else
     {
-        LL_PROFILE_ZONE_SCOPED_CATEGORY_TEXTURE;
-#if 1
-        //
-        //if mRequestedDiscardLevel > mDesiredDiscardLevel, we assume the required image res keep going up,
-        //so do not scale down the over qualified image.
-        //Note: scaling down image is expensensive. Do it only when very necessary.
-        //
-        if(mRequestedDiscardLevel <= mDesiredDiscardLevel && !mForceToSaveRawImage)
-        {
-            U32 w = mFullWidth >> mRawDiscardLevel;
-            U32 h = mFullHeight >> mRawDiscardLevel;
-
-            //if big image, do not load extra data
-            //scale it down to size >= LLViewerTexture::sMinLargeImageSize
-            if(w * h > LLViewerTexture::sMinLargeImageSize)
-            {
-                S32 d_level = llmin(mRequestedDiscardLevel, (S32)mDesiredDiscardLevel) - mRawDiscardLevel;
-
-                if(d_level > 0)
-                {
-                    S32 i = 0;
-                    while((d_level > 0) && ((w >> i) * (h >> i) > LLViewerTexture::sMinLargeImageSize))
-                    {
-                        i++;
-                        d_level--;
-                    }
-                    if(i > 0)
-                    {
-                        mRawDiscardLevel += i;
-                        if(mRawDiscardLevel >= getDiscardLevel() && getDiscardLevel() > 0)
-                        {
-                            mNeedsCreateTexture = false;
-                            destroyRawImage();
-                            return;
-                        }
-
-                        {
-                            //make a duplicate in case somebody else is using this raw image
-                            mRawImage = mRawImage->scaled(w >> i, h >> i);
-                        }
-                    }
-                }
-            }
-        }
-#endif
         scheduleCreateTexture();
     }
     return;
diff --git a/indra/newview/llvoavatar.cpp b/indra/newview/llvoavatar.cpp
index b04bb99b97..26cb2a573a 100644
--- a/indra/newview/llvoavatar.cpp
+++ b/indra/newview/llvoavatar.cpp
@@ -10684,14 +10684,18 @@ void LLVOAvatar::updateRiggingInfo()
 
     std::map<LLUUID, S32> curr_rigging_info_key;
 
-    // Get current rigging info key
-    for (LLVOVolume* vol : volumes)
     {
-        if (vol->isMesh() && vol->getVolume())
+        LL_PROFILE_ZONE_NAMED_CATEGORY_AVATAR("update rig info - get key")
+
+        // Get current rigging info key
+        for (LLVOVolume* vol : volumes)
         {
-            const LLUUID& mesh_id = vol->getVolume()->getParams().getSculptID();
-            S32 max_lod = llmax(vol->getLOD(), vol->mLastRiggingInfoLOD);
-            curr_rigging_info_key[mesh_id] = max_lod;
+            if (vol->isMesh() && vol->getVolume())
+            {
+                const LLUUID& mesh_id = vol->getVolume()->getParams().getSculptID();
+                S32 max_lod = llmax(vol->getLOD(), vol->mLastRiggingInfoLOD);
+                curr_rigging_info_key[mesh_id] = max_lod;
+            }
         }
     }
 
diff --git a/indra/newview/llvovolume.cpp b/indra/newview/llvovolume.cpp
index 4b63354893..4f48a070e3 100644
--- a/indra/newview/llvovolume.cpp
+++ b/indra/newview/llvovolume.cpp
@@ -6028,8 +6028,8 @@ void LLVolumeGeometryManager::rebuildMesh(LLSpatialGroup* group)
 
             group->mBuilt = 1.f;
 
-            const U32 MAX_BUFFER_COUNT = 4096;
-            LLVertexBuffer* locked_buffer[MAX_BUFFER_COUNT];
+            static std::vector<LLVertexBuffer*> locked_buffer;
+            locked_buffer.resize(0);
 
             U32 buffer_count = 0;
 
@@ -6074,8 +6074,6 @@ void LLVolumeGeometryManager::rebuildMesh(LLSpatialGroup* group)
                                     group->dirtyGeom();
                                     gPipeline.markRebuild(group);
                                 }
-
-                                buff->unmapBuffer();
                             }
                         }
                     }
@@ -6091,17 +6089,7 @@ void LLVolumeGeometryManager::rebuildMesh(LLSpatialGroup* group)
 
             {
                 LL_PROFILE_ZONE_NAMED("rebuildMesh - flush");
-                for (LLVertexBuffer** iter = locked_buffer, ** end_iter = locked_buffer+buffer_count; iter != end_iter; ++iter)
-                {
-                    (*iter)->unmapBuffer();
-                }
-
-                // don't forget alpha
-                if(group != NULL &&
-                   !group->mVertexBuffer.isNull())
-                {
-                    group->mVertexBuffer->unmapBuffer();
-                }
+                LLVertexBuffer::flushBuffers();
             }
 
             group->clearState(LLSpatialGroup::MESH_DIRTY | LLSpatialGroup::NEW_DRAWINFO);
@@ -6783,11 +6771,6 @@ U32 LLVolumeGeometryManager::genDrawInfo(LLSpatialGroup* group, U32 mask, LLFace
 
             ++face_iter;
         }
-
-        if (buffer)
-        {
-            buffer->unmapBuffer();
-        }
     }
 
     group->mBufferMap[mask].clear();
-- 
cgit v1.2.3