From 2e16d1b365e465c0c3e505770e213200ec25fc12 Mon Sep 17 00:00:00 2001
From: mobserveur <mobserveur@gmail.com>
Date: Sat, 29 Jun 2024 22:57:51 +0200
Subject: vertex buffer optimisations for Apple GPU

The vertex buffer will use the original mapping for non Apple GPUs and the new optimized mapping for Apple GPUs.
It needs the Apple gpu detection in the gl manager
---
 indra/llrender/llvertexbuffer.cpp | 42 ++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

(limited to 'indra/llrender')

diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp
index 783794512c..52fb58187a 100644
--- a/indra/llrender/llvertexbuffer.cpp
+++ b/indra/llrender/llvertexbuffer.cpp
@@ -1155,19 +1155,40 @@ static void flush_vbo(GLenum target, U32 start, U32 end, void* data)
         LL_PROFILE_ZONE_NUM(end-start);
 
         U32 size = end-start+1;
+        U32 block_size = 65536;
 
-        //Note (observeur): glBufferSubData() was causing synchronization stalls, specialy on Apple GPUs, possibly to the fact Apple GPU is a tiled gpu, resulting to heavy stutters, and spacialy when called several times per frame on the same buffer.
+        //Note (observeur): The following code is executed on non Apple gpus. Using glMapBufferRange() didn't show obvious benefit on the other tested platforms (intel igpu, amd igpu and nVidia dgpus).
+        if(!gGLManager.mIsApple)
+        {
+            for (U32 i = start; i <= end; i += block_size)
+            {
+                LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
+                LL_PROFILE_GPU_ZONE("glBufferSubData");
+                U32 tend = llmin(i + block_size, end);
+                U32 size = tend - i + 1;
+                glBufferSubData(target, i, size, (U8*) data + (i-start));
+            }
+
+            return;
+        }
+
+        //Note (observeur): glBufferSubData() was causing synchronization stalls on Apple GPUs resulting to heavy stutters and lower performance in the world and UI rendering. Using glMapBufferRange() benefits Macs with Apple gpus enormously.
 
-        //Note (observeur): I maintained the notion of block_size for testing purpose, but i think it's a bad idea. We don't know the overhead of glMapBufferRange() depending on the driver, so it's better avoiding calling it more than necessary.(0 -> loop is disabled, 8192 -> original value, 524288 -> a resonable value).
-        constexpr U32 block_size = 0;
+        //Note (observeur): Other bits such as GL_MAP_INVALIDATE_RANGE_BIT or GL_MAP_UNSYNCHRONIZED_BIT didn't seem to make much of a difference on Apple gpus, so we stick to the simple way.
+        U32 MapBits = GL_MAP_WRITE_BIT;
 
+        //Note (observeur): Using a block size of 0 will call the following block and map the buffer all in once. It doesn't bother Apple machines, it might actually benefit them a little bit. A larger value is also fine. The largest buffers I observed where around 2mb or 3mb while most of buffers are smaller than 50000 bytes.
+        block_size = 524288;
+
+        //Note (observeur): This is called in case block_size is set to 0 (All in one mapping).
         if(block_size == 0)
         {
             U8 * mptr = NULL;
             LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
             LL_PROFILE_GPU_ZONE("glBufferSubData");
 
-            mptr = (U8*) glMapBufferRange( target, start, size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
+            mptr = (U8*) glMapBufferRange( target, start, size, MapBits);
+
             if(mptr)
             {
                 std::memcpy(mptr, (U8*) data, size);
@@ -1180,11 +1201,13 @@ static void flush_vbo(GLenum target, U32 start, U32 end, void* data)
             return;
         }
 
+        //Note (observeur): The following code is executed in case of block_size is superior to 0
+
         //Note (observeur): This is for analysis purpose only
-        if(size > block_size)
-        {
-            LL_INFOS() << "Large data range : " << size << LL_ENDL;
-        }
+        //if(size > block_size)
+        //{
+        //    LL_INFOS() << "Large data range (MB MODE) : " << size << LL_ENDL;
+        //}
 
         U8 * mptr = NULL;
 
@@ -1195,7 +1218,8 @@ static void flush_vbo(GLenum target, U32 start, U32 end, void* data)
             U32 tend = llmin(i + block_size, end);
             size = tend - i + 1;
 
-            mptr = (U8*) glMapBufferRange( target, i, size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
+            mptr = (U8*) glMapBufferRange( target, i, size, MapBits );
+
             if(mptr)
             {
                 std::memcpy(mptr, (U8*) data + (i-start), size);
-- 
cgit v1.3