Profile guided optimizations

author: RunitaiLinden <davep@lindenlab.com> 2024-09-10 18:27:45 -0500
committer: RunitaiLinden <davep@lindenlab.com> 2024-09-10 18:27:45 -0500
commit: cde5d29faf84c5cb7fc1b0d0ff6d03f3b7354c8f (patch)
tree: a0e1eec0a22e1c818463a0d756796bd001c2cf91 /indra/llrender
parent: 33116ea35ec9a925c1601c8f1833e4d1e9f8390b (diff)
3 files changed, 112 insertions, 11 deletions
diff --git a/indra/llrender/llimagegl.cpp b/indra/llrender/llimagegl.cpp
index 68c20048ec..67b4ada62f 100644
--- a/indra/llrender/llimagegl.cpp
+++ b/indra/llrender/llimagegl.cpp
@@ -1045,15 +1045,47 @@ void sub_image_lines(U32 target, S32 miplevel, S32 x_offset, S32 y_offset, S32 w
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_TEXTURE;
 
+    LL_PROFILE_ZONE_NUM(width);
+    LL_PROFILE_ZONE_NUM(height);
+
     U32 components = LLImageGL::dataFormatComponents(pixformat);
     U32 type_width = type_width_from_pixtype(pixtype);
 
     const U32 line_width = data_width * components * type_width;
     const U32 y_offset_end = y_offset + height;
-    for (U32 y_pos = y_offset; y_pos < y_offset_end; ++y_pos)
+
+    if (width == data_width && height % 32 == 0)
+    {
+        LL_PROFILE_ZONE_NAMED_CATEGORY_TEXTURE("subimage - batched lines");
+
+        // full width, batch multiple lines at a time
+        // set batch size based on width
+        U32 batch_size = 32;
+
+        if (width > 1024)
+        {
+            batch_size = 8;
+        }
+        else if (width > 512)
+        {
+            batch_size = 16;
+        }
+
+        // full width texture, upload batch_size lines per glTexSubImage2D call
+        for (U32 y_pos = y_offset; y_pos < y_offset_end; y_pos += batch_size)
+        {
+            glTexSubImage2D(target, miplevel, x_offset, y_pos, width, batch_size, pixformat, pixtype, src);
+            src += line_width * batch_size;
+        }
+    }
+    else
     {
-        glTexSubImage2D(target, miplevel, x_offset, y_pos, width, 1, pixformat, pixtype, src);
-        src += line_width;
+        // partial width or strange height
+        for (U32 y_pos = y_offset; y_pos < y_offset_end; y_pos += 1)
+        {
+            glTexSubImage2D(target, miplevel, x_offset, y_pos, width, 1, pixformat, pixtype, src);
+            src += line_width;
+        }
     }
 }
 
@@ -2139,6 +2171,8 @@ void LLImageGL::analyzeAlpha(const void* data_in, U32 w, U32 h)
         return ;
     }
 
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_TEXTURE;
+
     U32 length = w * h;
     U32 alphatotal = 0;
 
@@ -2150,15 +2184,15 @@ void LLImageGL::analyzeAlpha(const void* data_in, U32 w, U32 h)
     // this will mid-skew the data (and thus increase the chances of not
     // being used as a mask) from high-frequency alpha maps which
     // suffer the worst from aliasing when used as alpha masks.
-    if (w >= 2 && h >= 2)
+    if (w >= 4 && h >= 4)
     {
-        llassert(w%2 == 0);
-        llassert(h%2 == 0);
+        llassert(w%4 == 0);
+        llassert(h%4 == 0);
         const GLubyte* rowstart = ((const GLubyte*) data_in) + mAlphaOffset;
-        for (U32 y = 0; y < h; y+=2)
+        for (U32 y = 0; y < h; y+=4)
         {
             const GLubyte* current = rowstart;
-            for (U32 x = 0; x < w; x+=2)
+            for (U32 x = 0; x < w; x+=4)
             {
                 const U32 s1 = current[0];
                 alphatotal += s1;
@@ -2182,7 +2216,7 @@ void LLImageGL::analyzeAlpha(const void* data_in, U32 w, U32 h)
             }
 
 
-            rowstart += 2 * w * mAlphaStride;
+            rowstart += 4 * w * mAlphaStride;
         }
         length *= 2; // we sampled everything twice, essentially
     }
diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp
index 156e300853..c1f239fc43 100644
--- a/indra/llrender/llvertexbuffer.cpp
+++ b/indra/llrender/llvertexbuffer.cpp
@@ -954,6 +954,25 @@ LLVertexBuffer::LLVertexBuffer(U32 typemask)
     }
 }
 
+// list of mapped buffers
+// NOTE: must not be LLPointer<LLVertexBuffer> to avoid breaking non-ref-counted LLVertexBuffer instances
+static std::vector<LLVertexBuffer*> sMappedBuffers;
+
+//static
+void LLVertexBuffer::flushBuffers()
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    // must only be called from main thread
+    llassert(LLCoros::on_main_thread_main_coro());
+    for (auto& buffer : sMappedBuffers)
+    {
+        buffer->_unmapBuffer();
+        buffer->mMapped = false;
+    }
+
+    sMappedBuffers.resize(0);
+}
+
 //static
 U32 LLVertexBuffer::calcOffsets(const U32& typemask, U32* offsets, U32 num_vertices)
 {
@@ -997,6 +1016,12 @@ U32 LLVertexBuffer::calcVertexSize(const U32& typemask)
 //virtual
 LLVertexBuffer::~LLVertexBuffer()
 {
+    if (mMapped)
+    { // still on the mapped buffer list: clear mMapped so _unmapBuffer() skips this buffer, then flush so the list drops its raw pointer to it
+        mMapped = false;
+        unmapBuffer();
+    }
+
     destroyGLBuffer();
     destroyGLIndices();
 
@@ -1198,6 +1223,7 @@ bool expand_region(LLVertexBuffer::MappedRegion& region, U32 start, U32 end)
 U8* LLVertexBuffer::mapVertexBuffer(LLVertexBuffer::AttributeType type, U32 index, S32 count)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    _mapBuffer();
 
     if (count == -1)
     {
@@ -1233,6 +1259,7 @@ U8* LLVertexBuffer::mapVertexBuffer(LLVertexBuffer::AttributeType type, U32 inde
 U8* LLVertexBuffer::mapIndexBuffer(U32 index, S32 count)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    _mapBuffer();
 
     if (count == -1)
     {
@@ -1289,11 +1316,11 @@ void LLVertexBuffer::flush_vbo(GLenum target, U32 start, U32 end, void* data, U8
         LL_PROFILE_ZONE_NUM(end);
         LL_PROFILE_ZONE_NUM(end-start);
 
-        constexpr U32 block_size = 8192;
+        constexpr U32 block_size = 65536;
 
         for (U32 i = start; i <= end; i += block_size)
         {
-            LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
+            //LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
             //LL_PROFILE_GPU_ZONE("glBufferSubData");
             U32 tend = llmin(i + block_size, end);
             U32 size = tend - i + 1;
@@ -1305,7 +1332,28 @@ void LLVertexBuffer::flush_vbo(GLenum target, U32 start, U32 end, void* data, U8
 
 void LLVertexBuffer::unmapBuffer()
 {
+    flushBuffers();
+}
+
+void LLVertexBuffer::_mapBuffer()
+{
+    // must only be called from main thread
+    llassert(LLCoros::on_main_thread_main_coro());
+    if (!mMapped)
+    {
+        mMapped = true;
+        sMappedBuffers.push_back(this);
+    }
+}
+
+void LLVertexBuffer::_unmapBuffer()
+{
     STOP_GLERROR;
+    if (!mMapped)
+    {
+        return;
+    }
+
     struct SortMappedRegion
     {
         bool operator()(const MappedRegion& lhs, const MappedRegion& rhs)
@@ -1549,6 +1597,13 @@ void LLVertexBuffer::setBuffer()
         return;
     }
 #endif
+
+    if (mMapped)
+    {
+        LL_WARNS() << "Missing call to unmapBuffer or flushBuffers" << LL_ENDL;
+        _unmapBuffer();
+    }
+
     // no data may be pending
     llassert(mMappedVertexRegions.empty());
     llassert(mMappedIndexRegions.empty());
diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h
index 2a4affdc60..9fe468f89e 100644
--- a/indra/llrender/llvertexbuffer.h
+++ b/indra/llrender/llvertexbuffer.h
@@ -120,6 +120,9 @@ public:
     // indexed by the following enum
     static U32 calcOffsets(const U32& typemask, U32* offsets, U32 num_vertices);
 
+    // flush any pending mapped buffers
+    static void flushBuffers();
+
     //WARNING -- when updating these enums you MUST
     // 1 - update LLVertexBuffer::sTypeSize
     // 2 - update LLVertexBuffer::vb_type_name
@@ -190,6 +193,8 @@ public:
     // map for data access (see also getFooStrider below)
     U8*     mapVertexBuffer(AttributeType type, U32 index, S32 count = -1);
     U8*     mapIndexBuffer(U32 index, S32 count = -1);
+
+    // synonym for flushBuffers
     void    unmapBuffer();
 
     // set for rendering
@@ -312,6 +317,13 @@ private:
 
     bool    allocateBuffer(S32 nverts, S32 nindices, bool create) { return allocateBuffer(nverts, nindices); }
 
+    // actually unmap buffer
+    void _unmapBuffer();
+
+    // add to set of mapped buffers
+    void _mapBuffer();
+    bool mMapped = false;
+
 public:
 
     static U64 getBytesAllocated();
author	RunitaiLinden <davep@lindenlab.com>	2024-09-10 18:27:45 -0500
committer	RunitaiLinden <davep@lindenlab.com>	2024-09-10 18:27:45 -0500
commit	cde5d29faf84c5cb7fc1b0d0ff6d03f3b7354c8f (patch)
tree	a0e1eec0a22e1c818463a0d756796bd001c2cf91 /indra/llrender
parent	33116ea35ec9a925c1601c8f1833e4d1e9f8390b (diff)