Diffstat (limited to 'indra/llrender')
-rw-r--r--  indra/llrender/llgl.cpp            |  7
-rw-r--r--  indra/llrender/llgl.h              |  1
-rw-r--r--  indra/llrender/llvertexbuffer.cpp  | 42
3 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/indra/llrender/llgl.cpp b/indra/llrender/llgl.cpp
index 10920f1de3..9207e6ad73 100644
--- a/indra/llrender/llgl.cpp
+++ b/indra/llrender/llgl.cpp
@@ -1001,6 +1001,7 @@ LLGLManager::LLGLManager() :
mIsAMD(FALSE),
mIsNVIDIA(FALSE),
mIsIntel(FALSE),
+ mIsApple(FALSE),
#if LL_DARWIN
mIsMobileGF(FALSE),
#endif
@@ -1174,6 +1175,11 @@ bool LLGLManager::initGL()
mGLVendorShort = "INTEL";
mIsIntel = TRUE;
}
+ else if(mGLVendor.find("APPLE") != std::string::npos)
+ {
+ mGLVendorShort = "APPLE";
+ mIsApple = TRUE;
+ }
else
{
mGLVendorShort = "MISC";
@@ -1373,6 +1379,7 @@ void LLGLManager::asLLSD(LLSD& info)
info["is_ati"] = mIsAMD; // note, do not rename is_ati to is_amd without coordinating with DW
info["is_nvidia"] = mIsNVIDIA;
info["is_intel"] = mIsIntel;
+ info["is_apple"] = mIsApple;
info["gl_renderer"] = mGLRenderer;
}
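As an aside, the vendor classification that the new mIsApple flag extends boils down to case-insensitive substring matching on the GL_VENDOR string (the patch compares against "APPLE", so mGLVendor is presumably uppercased earlier in initGL()). A minimal standalone sketch of that pattern, where detect_vendor() and GpuVendor are illustrative names rather than viewer API:

    // Sketch only: classify a GPU vendor from the GL_VENDOR string,
    // following the same substring-matching pattern as the hunk above.
    #include <algorithm>
    #include <cctype>
    #include <string>

    enum class GpuVendor { AMD, NVIDIA, INTEL, APPLE, MISC };

    static GpuVendor detect_vendor(std::string vendor)
    {
        // Uppercase first so the substring checks below are case-insensitive.
        std::transform(vendor.begin(), vendor.end(), vendor.begin(),
                       [](unsigned char c) { return std::toupper(c); });

        if (vendor.find("AMD") != std::string::npos ||
            vendor.find("ATI") != std::string::npos)    return GpuVendor::AMD;
        if (vendor.find("NVIDIA") != std::string::npos) return GpuVendor::NVIDIA;
        if (vendor.find("INTEL") != std::string::npos)  return GpuVendor::INTEL;
        if (vendor.find("APPLE") != std::string::npos)  return GpuVendor::APPLE;
        return GpuVendor::MISC; // falls through to "MISC", as in the patch
    }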
diff --git a/indra/llrender/llgl.h b/indra/llrender/llgl.h
index 5a7ad943df..e4b106c999 100644
--- a/indra/llrender/llgl.h
+++ b/indra/llrender/llgl.h
@@ -100,6 +100,7 @@ public:
BOOL mIsAMD;
BOOL mIsNVIDIA;
BOOL mIsIntel;
+ BOOL mIsApple;
#if LL_DARWIN
// Needed to distinguish problem cards on older Macs that break with Materials
diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp
index 783794512c..52fb58187a 100644
--- a/indra/llrender/llvertexbuffer.cpp
+++ b/indra/llrender/llvertexbuffer.cpp
@@ -1155,19 +1155,40 @@ static void flush_vbo(GLenum target, U32 start, U32 end, void* data)
LL_PROFILE_ZONE_NUM(end-start);
U32 size = end-start+1;
+ U32 block_size = 65536;
- //Note (observeur): glBufferSubData() was causing synchronization stalls, specialy on Apple GPUs, possibly to the fact Apple GPU is a tiled gpu, resulting to heavy stutters, and spacialy when called several times per frame on the same buffer.
+ //Note (observeur): The following code is executed on non-Apple GPUs. Using glMapBufferRange() didn't show an obvious benefit on the other tested platforms (Intel iGPU, AMD iGPU and NVIDIA dGPUs).
+ if(!gGLManager.mIsApple)
+ {
+ for (U32 i = start; i <= end; i += block_size)
+ {
+ LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
+ LL_PROFILE_GPU_ZONE("glBufferSubData");
+ U32 tend = llmin(i + block_size, end);
+ U32 size = tend - i + 1;
+ glBufferSubData(target, i, size, (U8*) data + (i-start));
+ }
+
+ return;
+ }
+
+ //Note (observeur): glBufferSubData() was causing synchronization stalls on Apple GPUs, resulting in heavy stutters and lower performance in world and UI rendering. Using glMapBufferRange() benefits Macs with Apple GPUs enormously.
- //Note (observeur): I maintained the notion of block_size for testing purpose, but i think it's a bad idea. We don't know the overhead of glMapBufferRange() depending on the driver, so it's better avoiding calling it more than necessary.(0 -> loop is disabled, 8192 -> original value, 524288 -> a resonable value).
- constexpr U32 block_size = 0;
+ //Note (observeur): Other bits such as GL_MAP_INVALIDATE_RANGE_BIT or GL_MAP_UNSYNCHRONIZED_BIT didn't seem to make much of a difference on Apple GPUs, so we stick with plain GL_MAP_WRITE_BIT.
+ U32 MapBits = GL_MAP_WRITE_BIT;
+ //Note (observeur): A block size of 0 takes the branch below and maps the whole buffer at once. That doesn't hurt Apple machines and may even benefit them slightly; a larger value is also fine. The largest buffers observed were around 2 MB or 3 MB, while most buffers are smaller than 50000 bytes.
+ block_size = 524288;
+
+ //Note (observeur): This path is taken when block_size is set to 0 (all-in-one mapping).
if(block_size == 0)
{
U8 * mptr = NULL;
LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
LL_PROFILE_GPU_ZONE("glBufferSubData");
- mptr = (U8*) glMapBufferRange( target, start, size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
+ mptr = (U8*) glMapBufferRange( target, start, size, MapBits);
+
if(mptr)
{
std::memcpy(mptr, (U8*) data, size);
@@ -1180,11 +1201,13 @@ static void flush_vbo(GLenum target, U32 start, U32 end, void* data)
return;
}
+ //Note (observeur): The following code is executed when block_size is greater than 0.
+
//Note (observeur): This is for analysis purpose only
- if(size > block_size)
- {
- LL_INFOS() << "Large data range : " << size << LL_ENDL;
- }
+ //if(size > block_size)
+ //{
+ // LL_INFOS() << "Large data range (MB MODE) : " << size << LL_ENDL;
+ //}
U8 * mptr = NULL;
@@ -1195,7 +1218,8 @@ static void flush_vbo(GLenum target, U32 start, U32 end, void* data)
U32 tend = llmin(i + block_size, end);
size = tend - i + 1;
- mptr = (U8*) glMapBufferRange( target, i, size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
+ mptr = (U8*) glMapBufferRange( target, i, size, MapBits );
+
if(mptr)
{
std::memcpy(mptr, (U8*) data + (i-start), size);
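Taken together, the llvertexbuffer.cpp changes split flush_vbo() into two upload paths: blocked glBufferSubData() calls on non-Apple GPUs, and blocked glMapBufferRange()/memcpy on Apple GPUs. A condensed, self-contained sketch of that strategy, with upload_range() and is_apple_gpu as illustrative stand-ins for the viewer's flush_vbo() and gGLManager.mIsApple (error handling and the viewer's profiling macros omitted):

    // Sketch: blocked vertex-buffer upload mirroring the two paths in this patch.
    // Assumes a current OpenGL context and a loader such as GLEW or epoxy.
    #include <GL/glew.h>
    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    static void upload_range(GLenum target, uint32_t start, uint32_t end,
                             const void* data, bool is_apple_gpu)
    {
        if (!is_apple_gpu)
        {
            // Non-Apple GPUs: plain glBufferSubData() in 64 KB blocks.
            const uint32_t block = 65536;
            for (uint32_t i = start; i <= end; i += block)
            {
                uint32_t tend  = std::min(i + block, end);
                uint32_t bytes = tend - i + 1;
                glBufferSubData(target, i, bytes,
                                static_cast<const uint8_t*>(data) + (i - start));
            }
            return;
        }

        // Apple GPUs: map each sub-range and memcpy into it, avoiding the
        // glBufferSubData() synchronization stalls described in the notes above.
        const uint32_t block = 524288; // 512 KB, the value used in the patch
        for (uint32_t i = start; i <= end; i += block)
        {
            uint32_t tend  = std::min(i + block, end);
            uint32_t bytes = tend - i + 1;
            uint8_t* mptr  = static_cast<uint8_t*>(
                glMapBufferRange(target, i, bytes, GL_MAP_WRITE_BIT));
            if (mptr)
            {
                std::memcpy(mptr, static_cast<const uint8_t*>(data) + (i - start), bytes);
                glUnmapBuffer(target);
            }
        }
    }

Mapping with only GL_MAP_WRITE_BIT reflects the note above that GL_MAP_INVALIDATE_RANGE_BIT and GL_MAP_UNSYNCHRONIZED_BIT made no measurable difference on Apple GPUs.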