5 files changed, 446 insertions, 490 deletions
diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp
index 7b8f85acba..20261dcb8a 100644
--- a/indra/llrender/llvertexbuffer.cpp
+++ b/indra/llrender/llvertexbuffer.cpp
@@ -62,6 +62,14 @@ U32 wpo2(U32 i)
 	return r;
 }
 
+struct CompareMappedRegion // orders MappedRegions by ascending mStart
+{
+    bool operator()(const LLVertexBuffer::MappedRegion& lhs, const LLVertexBuffer::MappedRegion& rhs) const // const so the comparator can be invoked through a const object
+    {
+        return lhs.mStart < rhs.mStart;
+    }
+};
+
 
 const U32 LL_VBO_BLOCK_SIZE = 2048;
 const U32 LL_VBO_POOL_MAX_SEED_SIZE = 256*1024;
@@ -81,266 +89,217 @@ U32 vbo_block_index(U32 size)
 
 const U32 LL_VBO_POOL_SEED_COUNT = vbo_block_index(LL_VBO_POOL_MAX_SEED_SIZE) + 1;
 
+#define ENABLE_GL_WORK_QUEUE 0 // 0 compiles out GLWorkQueue / LLGLWorkerThread and their use in initClass/cleanupClass
+
+#if ENABLE_GL_WORK_QUEUE
+
+#define THREAD_COUNT 1 // number of LLGLWorkerThread instances started by LLVertexBuffer::initClass
 
 //============================================================================
 
-//static
-LLVBOPool LLVertexBuffer::sStreamVBOPool(GL_STREAM_DRAW, GL_ARRAY_BUFFER);
-LLVBOPool LLVertexBuffer::sDynamicVBOPool(GL_DYNAMIC_DRAW, GL_ARRAY_BUFFER);
-LLVBOPool LLVertexBuffer::sDynamicCopyVBOPool(GL_DYNAMIC_COPY, GL_ARRAY_BUFFER);
-LLVBOPool LLVertexBuffer::sStreamIBOPool(GL_STREAM_DRAW, GL_ELEMENT_ARRAY_BUFFER);
-LLVBOPool LLVertexBuffer::sDynamicIBOPool(GL_DYNAMIC_DRAW, GL_ELEMENT_ARRAY_BUFFER);
+// High performance WorkQueue for usage in real-time rendering work (FIFO of callables guarded by mMutex)
+class GLWorkQueue
+{
+public:
+    using Work = std::function<void()>; // one unit of work: a callable taking no arguments
 
-U32 LLVBOPool::sBytesPooled = 0;
-U32 LLVBOPool::sIndexBytesPooled = 0;
-U32 LLVBOPool::sNameIdx = 0;
-U32 LLVBOPool::sNamePool[1024];
+    GLWorkQueue();
 
-std::list<U32> LLVertexBuffer::sAvailableVAOName;
-U32 LLVertexBuffer::sCurVAOName = 1;
+    void post(const Work& value); // enqueue value and wake one thread waiting in pop()
 
-U32 LLVertexBuffer::sAllocatedIndexBytes = 0;
-U32 LLVertexBuffer::sIndexCount = 0;
+    size_t size(); // number of queued items (takes mMutex)
 
-U32 LLVertexBuffer::sBindCount = 0;
-U32 LLVertexBuffer::sSetCount = 0;
-S32 LLVertexBuffer::sCount = 0;
-S32 LLVertexBuffer::sGLCount = 0;
-S32 LLVertexBuffer::sMappedCount = 0;
-bool LLVertexBuffer::sDisableVBOMapping = false;
-bool LLVertexBuffer::sEnableVBOs = true;
-U32 LLVertexBuffer::sGLRenderBuffer = 0;
-U32 LLVertexBuffer::sGLRenderArray = 0;
-U32 LLVertexBuffer::sGLRenderIndices = 0;
-U32 LLVertexBuffer::sLastMask = 0;
-bool LLVertexBuffer::sVBOActive = false;
-bool LLVertexBuffer::sIBOActive = false;
-U32 LLVertexBuffer::sAllocatedBytes = 0;
-U32 LLVertexBuffer::sVertexCount = 0;
-bool LLVertexBuffer::sMapped = false;
-bool LLVertexBuffer::sUseStreamDraw = true;
-bool LLVertexBuffer::sUseVAO = false;
-bool LLVertexBuffer::sPreferStreamDraw = false;
+    bool done(); // true when the queue is empty and has been closed
 
-U32 LLVBOPool::genBuffer()
-{
-	LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX
+    // Get the next element from the queue; blocks until an item is queued or the queue is closed
+    Work pop(); // returns a no-op Work when nothing is queued after waking
 
-	if (sNameIdx == 0)
-	{
-		glGenBuffers(1024, sNamePool);
-		sNameIdx = 1024;
-	}
+    void runOne(); // pop() one item and invoke it on the calling thread
 
-	return sNamePool[--sNameIdx];
-}
+    bool runPending(); // NOTE(review): declared here but no definition is visible in this patch
 
-void LLVBOPool::deleteBuffer(U32 name)
-{
-	LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX
-	if (gGLManager.mInited)
-	{
-		LLVertexBuffer::unbind();
+    void runUntilClose(); // call runOne() repeatedly until isClosed() is true
 
-		glBindBuffer(mType, name);
-		glBufferData(mType, 0, NULL, mUsage);
-		glBindBuffer(mType, 0);
+    void close(); // set mClosed and wake every waiting thread
 
-		glDeleteBuffers(1, &name);
-	}
-}
+    bool isClosed(); // read mClosed under mMutex
 
+    void syncGL(); // currently a no-op: the implementation body is commented out
 
-LLVBOPool::LLVBOPool(U32 vboUsage, U32 vboType)
-: mUsage(vboUsage), mType(vboType), mMissCountDirty(true)
-{
-    mFreeList.resize(LL_VBO_POOL_SEED_COUNT);
-	mMissCount.resize(LL_VBO_POOL_SEED_COUNT);
-	std::fill(mMissCount.begin(), mMissCount.end(), 0);
-}
+private:
+    std::mutex mMutex; // locked around every access to mQueue and mClosed
+    std::condition_variable mCondition; // notified by post() (one waiter) and close() (all waiters)
+    std::queue<Work> mQueue; // pending work, oldest first
+    bool mClosed = false; // set once by close(); never reset in the visible code
+};
 
-U8* LLVBOPool::allocate(U32& name, U32 size, bool for_seed)
+GLWorkQueue::GLWorkQueue() // nothing to do: members are default-constructed / initialized in-class
 {
-	LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX
-	llassert(vbo_block_size(size) == size);
-	
-	U8* ret = NULL;
 
-	U32 i = vbo_block_index(size);
+}
 
-	if (mFreeList.size() <= i)
-	{
-		mFreeList.resize(i+1);
-	}
+void GLWorkQueue::syncGL() // placeholder: does nothing while the fence-wait code below stays commented out
+{
+    /*if (mSync)
+    {
+        std::lock_guard<std::mutex> lock(mMutex);
+        glWaitSync(mSync, 0, GL_TIMEOUT_IGNORED);
+        mSync = 0;
+    }*/
+}
 
-	if (mFreeList[i].empty() || for_seed)
-	{
-		//make a new buffer
-		name = genBuffer();
+size_t GLWorkQueue::size() // returns the number of queued items at the moment the lock is held
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD;
+    std::lock_guard<std::mutex> lock(mMutex); // mQueue is shared with producer/consumer threads
+    return mQueue.size();
+}
 
-		glBindBuffer(mType, name);
+bool GLWorkQueue::done() // true when the queue is empty and closed
+{
+    return size() == 0 && isClosed(); // two separate lock acquisitions, so the pair is not one atomic snapshot
+}
 
-		if (!for_seed && i < LL_VBO_POOL_SEED_COUNT)
-		{ //record this miss
-			mMissCount[i]++;	
-            mMissCountDirty = true;  // signal to ::seedPool()
-		}
+void GLWorkQueue::post(const GLWorkQueue::Work& value) // enqueue a copy of value and wake one consumer
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD;
+    {
+        std::lock_guard<std::mutex> lock(mMutex);
+        mQueue.push(value); // value is a const reference, so this is a copy (std::move here was a no-op cast)
+    }
 
-		if (mType == GL_ARRAY_BUFFER)
-		{
-			LLVertexBuffer::sAllocatedBytes += size;
-		}
-		else
-		{
-			LLVertexBuffer::sAllocatedIndexBytes += size;
-		}
+    mCondition.notify_one(); // notify after releasing the lock so the woken thread can take it immediately
+}
 
-		if (LLVertexBuffer::sDisableVBOMapping || mUsage != GL_DYNAMIC_DRAW)
-		{
-			glBufferData(mType, size, 0, mUsage);
-			if (mUsage != GL_DYNAMIC_COPY)
-			{ //data will be provided by application
-				ret = (U8*) ll_aligned_malloc<64>(size);
-				if (!ret)
-				{
-                    LL_ERRS()
-                        << "Failed to allocate " << size << " bytes for LLVBOPool buffer " << name << "." << LL_NEWLINE
-                        << "Free list size: "
-                        << mFreeList.size()  // this happens if we are out of memory so a solution might be to clear some from freelist
-							  << " Allocated Bytes: " << LLVertexBuffer::sAllocatedBytes
-                        << " Allocated Index Bytes: " << LLVertexBuffer::sAllocatedIndexBytes << " Pooled Bytes: " << sBytesPooled
-                        << " Pooled Index Bytes: " << sIndexBytesPooled << LL_ENDL;
-				}
-			}
-		}
-		else
-		{ //always use a true hint of static draw when allocating non-client-backed buffers
-			glBufferData(mType, size, 0, GL_STATIC_DRAW);
-		}
+// Get the next element from the queue; blocks until work is available or the queue is closed
+GLWorkQueue::Work GLWorkQueue::pop()
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD;
+    // Lock the mutex
+    {
+        std::unique_lock<std::mutex> lock(mMutex);
 
-		glBindBuffer(mType, 0);
+        // Wait for a new element to become available or for the queue to close
+        {
+            mCondition.wait(lock, [this] { return !mQueue.empty() || mClosed; }); // explicit this capture; implicit capture via [=] is deprecated in C++20
+        }
+    }
 
-		if (for_seed)
-		{ //put into pool for future use
-			llassert(mFreeList.size() > i);
+    Work ret;
 
-			Record rec;
-			rec.mGLName = name;
-			rec.mClientData = ret;
-	
-			if (mType == GL_ARRAY_BUFFER)
-			{
-				sBytesPooled += size;
-			}
-			else
-			{
-				sIndexBytesPooled += size;
-			}
-			mFreeList[i].push_back(rec);
-            mMissCountDirty = true;  // signal to ::seedPool()
-		}
-	}
-	else
-	{
-		name = mFreeList[i].front().mGLName;
-		ret = mFreeList[i].front().mClientData;
+    {
+        std::lock_guard<std::mutex> lock(mMutex); // the lock was dropped above, so the queue state is re-checked here
 
-		if (mType == GL_ARRAY_BUFFER)
-		{
-			sBytesPooled -= size;
-		}
-		else
-		{
-			sIndexBytesPooled -= size;
-		}
+        // Get the next element from the queue
+        if (mQueue.size() > 0)
+        {
+            ret = std::move(mQueue.front()); // the element is popped on the next line, so move instead of copying the std::function
+            mQueue.pop();
+        }
+        else
+        {
+            ret = []() {}; // closed, or another consumer took the item: hand back a no-op so callers can invoke it unconditionally
+        }
+    }
 
-		mFreeList[i].pop_front();
-        mMissCountDirty = true;  // signal to ::seedPool()
-	}
+    return ret;
+}
 
-	return ret;
+void GLWorkQueue::runOne() // take one item (blocking in pop() if necessary) and run it on the calling thread
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD;
+    Work w = pop(); // never empty: pop() substitutes a no-op when nothing is queued
+    w();
+    //mSync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 }
 
-void LLVBOPool::release(U32 name, U8* buffer, U32 size)
+void GLWorkQueue::runUntilClose() // consumer loop used by LLGLWorkerThread::run
 {
-	llassert(vbo_block_size(size) == size);
+    while (!isClosed()) // NOTE(review): exits as soon as the queue is closed, so items still queued at that point are not run -- confirm this is intended
+    {
+        runOne();
+    }
+}
 
-	deleteBuffer(name);
-	ll_aligned_free_fallback((U8*) buffer);
+void GLWorkQueue::close() // mark the queue closed and release every thread blocked in pop()
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD;
+    {
+        std::lock_guard<std::mutex> lock(mMutex);
+        mClosed = true; // part of the wait predicate in pop()
+    }
 
-	if (mType == GL_ARRAY_BUFFER)
-	{
-		LLVertexBuffer::sAllocatedBytes -= size;
-	}
-	else
-	{
-		LLVertexBuffer::sAllocatedIndexBytes -= size;
-	}
+    mCondition.notify_all(); // all waiters must wake, not just one, so each can observe mClosed
 }
 
-void LLVBOPool::seedPool()
+bool GLWorkQueue::isClosed() // returns mClosed, read under mMutex
 {
-	LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX
-    if (mMissCountDirty)
-	{
-	U32 dummy_name = 0;
-        U32 size       = LL_VBO_BLOCK_SIZE;
-
-	for (U32 i = 0; i < LL_VBO_POOL_SEED_COUNT; i++)
-	{
-		if (mMissCount[i] > mFreeList[i].size())
-		{ 
-			S32 count = mMissCount[i] - mFreeList[i].size();
-			for (U32 j = 0; j < count; ++j)
-			{
-				allocate(dummy_name, size, true);
-			}
-		}
-            size += LL_VBO_BLOCK_SIZE;
-        }
-        mMissCountDirty = false;
-	}
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD;
+    std::lock_guard<std::mutex> lock(mMutex); // mClosed is written by close() on another thread
+    return mClosed;
 }
 
-void LLVBOPool::cleanup()
+#include "llwindow.h"
+
+class LLGLWorkerThread : public LLThread // thread that drains a GLWorkQueue with its own shared GL context current
 {
-	U32 size = LL_VBO_BLOCK_SIZE;
+public:
+    LLGLWorkerThread(const std::string& name, GLWorkQueue* queue, LLWindow* window) // queue and window are stored, not owned
+        : LLThread(name)
+    {
+        mWindow = window;
+        mContext = mWindow->createSharedContext(); // created on the constructing thread; made current later in run()
+        mQueue = queue;
+    }
 
-	for (U32 i = 0; i < mFreeList.size(); ++i)
-	{
-		record_list_t& l = mFreeList[i];
+    void run() override // thread body: bind context, init gGL, process work until the queue closes, then tear down
+    {
+        mWindow->makeContextCurrent(mContext);
+        gGL.init(false);
+        mQueue->runUntilClose(); // returns once the queue has been closed
+        gGL.shutdown();
+        mWindow->destroySharedContext(mContext);
+    }
 
-		while (!l.empty())
-		{
-			Record& r = l.front();
+    GLWorkQueue* mQueue; // work source; deleted by LLVertexBuffer::cleanupClass after this thread is shut down
+    LLWindow* mWindow; // window used to create / bind / destroy mContext
+    void* mContext = nullptr; // opaque handle returned by createSharedContext()
+};
 
-			deleteBuffer(r.mGLName);
-			
-			if (r.mClientData)
-			{
-				ll_aligned_free<64>((void*) r.mClientData);
-			}
 
-			l.pop_front();
+static LLGLWorkerThread* sVBOThread[THREAD_COUNT]; // created in initClass, shut down and deleted in cleanupClass
+static GLWorkQueue* sQueue = nullptr; // queue shared by all sVBOThread workers; allocated in initClass
 
-			if (mType == GL_ARRAY_BUFFER)
-			{
-				sBytesPooled -= size;
-				LLVertexBuffer::sAllocatedBytes -= size;
-			}
-			else
-			{
-				sIndexBytesPooled -= size;
-				LLVertexBuffer::sAllocatedIndexBytes -= size;
-			}
-		}
+#endif
 
-		size += LL_VBO_BLOCK_SIZE;
-	}
+//============================================================================
 
-	//reset miss counts
-	std::fill(mMissCount.begin(), mMissCount.end(), 0);
-}
+//static -- LLVertexBuffer class-wide state (moved below the optional work-queue code; LLVBOPool statics are gone)
+std::list<U32> LLVertexBuffer::sAvailableVAOName;
+U32 LLVertexBuffer::sCurVAOName = 1;
+
+U32 LLVertexBuffer::sAllocatedIndexBytes = 0;
+U32 LLVertexBuffer::sIndexCount = 0;
+
+U32 LLVertexBuffer::sBindCount = 0;
+U32 LLVertexBuffer::sSetCount = 0;
+S32 LLVertexBuffer::sCount = 0;
+S32 LLVertexBuffer::sGLCount = 0; // incremented by genBuffer/genIndices, decremented by releaseBuffer/releaseIndices
+S32 LLVertexBuffer::sMappedCount = 0;
+bool LLVertexBuffer::sDisableVBOMapping = false; // initClass now unconditionally sets this to true
+bool LLVertexBuffer::sEnableVBOs = true; // initClass now unconditionally sets this to true
+U32 LLVertexBuffer::sGLRenderBuffer = 0;
+U32 LLVertexBuffer::sGLRenderArray = 0;
+U32 LLVertexBuffer::sGLRenderIndices = 0;
+U32 LLVertexBuffer::sLastMask = 0;
+bool LLVertexBuffer::sVBOActive = false;
+bool LLVertexBuffer::sIBOActive = false;
+U32 LLVertexBuffer::sAllocatedBytes = 0;
+U32 LLVertexBuffer::sVertexCount = 0;
+bool LLVertexBuffer::sMapped = false;
+bool LLVertexBuffer::sUseStreamDraw = true;
+bool LLVertexBuffer::sUseVAO = false;
+bool LLVertexBuffer::sPreferStreamDraw = false;
 
 
 //NOTE: each component must be AT LEAST 4 bytes in size to avoid a performance penalty on AMD hardware
@@ -420,17 +379,6 @@ void LLVertexBuffer::releaseVAOName(U32 name)
 
 
 //static
-void LLVertexBuffer::seedPools()
-{
-	LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX
-	sStreamVBOPool.seedPool();
-	sDynamicVBOPool.seedPool();
-	sDynamicCopyVBOPool.seedPool();
-	sStreamIBOPool.seedPool();
-	sDynamicIBOPool.seedPool();
-}
-
-//static
 void LLVertexBuffer::setupClientArrays(U32 data_mask)
 {
 	if (sLastMask != data_mask)
@@ -473,7 +421,7 @@ void LLVertexBuffer::drawArrays(U32 mode, const std::vector<LLVector3>& pos)
     }
     gGL.end();
     gGL.flush();
-		}
+}
 
 //static
 void LLVertexBuffer::drawElements(U32 mode, const LLVector4a* pos, const LLVector2* tc, S32 num_indices, const U16* indicesp)
@@ -704,10 +652,20 @@ void LLVertexBuffer::drawArrays(U32 mode, U32 first, U32 count) const
 }
 
 //static
-void LLVertexBuffer::initClass(bool use_vbo, bool no_vbo_mapping)
+void LLVertexBuffer::initClass(LLWindow* window) // window is only used to create worker GL contexts when ENABLE_GL_WORK_QUEUE is 1
 {
-    sEnableVBOs = use_vbo;
-	sDisableVBOMapping = sEnableVBOs && no_vbo_mapping;
+    sEnableVBOs = true; // no longer configurable by the caller
+    sDisableVBOMapping = true; // no longer configurable by the caller
+
+#if ENABLE_GL_WORK_QUEUE
+    sQueue = new GLWorkQueue(); // released in cleanupClass
+
+    for (int i = 0; i < THREAD_COUNT; ++i)
+    {
+        sVBOThread[i] = new LLGLWorkerThread("VBO Worker", sQueue, window); // constructor creates the shared context on this thread
+        sVBOThread[i]->start();
+    }
+#endif
 }
 
 //static 
@@ -743,14 +701,19 @@ void LLVertexBuffer::cleanupClass()
 {
 	unbind();
 	
-	sStreamIBOPool.cleanup();
-	sDynamicIBOPool.cleanup();
-	sStreamVBOPool.cleanup();
-	sDynamicVBOPool.cleanup();
-	sDynamicCopyVBOPool.cleanup();
-
-    llassert(0 == LLVBOPool::sBytesPooled);
-    llassert(0 == LLVBOPool::sIndexBytesPooled);
+#if ENABLE_GL_WORK_QUEUE // tear down what initClass created; NOTE(review): assumes initClass ran (sQueue is dereferenced unchecked)
+    sQueue->close(); // wakes workers blocked in pop() so runUntilClose() can return
+    for (int i = 0; i < THREAD_COUNT; ++i)
+    {
+        sVBOThread[i]->shutdown();
+        delete sVBOThread[i];
+        sVBOThread[i] = nullptr;
+    }
+
+    delete sQueue; // deleted only after every worker holding a pointer to it has been shut down
+    sQueue = nullptr;
+#endif
+
     //llassert(0 == sAllocatedBytes);
     //llassert(0 == sAllocatedIndexBytes);
 }
@@ -781,21 +744,6 @@ S32 LLVertexBuffer::determineUsage(S32 usage)
 		ret_usage = GL_STREAM_DRAW;
 	}
 	
-	if (ret_usage && ret_usage != GL_STREAM_DRAW)
-	{ //only stream_draw and dynamic_draw are supported when using VBOs, dynamic draw is the default
-		if (ret_usage != GL_DYNAMIC_COPY)
-		{
-		    if (sDisableVBOMapping)
-		    { //always use stream draw if VBO mapping is disabled
-			    ret_usage = GL_STREAM_DRAW;
-		    }
-		    else
-		    {
-			    ret_usage = GL_DYNAMIC_DRAW;
-		    }
-	    }
-	}
-	
 	return ret_usage;
 }
 
@@ -848,7 +796,7 @@ S32 LLVertexBuffer::calcOffsets(const U32& typemask, S32* offsets, S32 num_verti
 
 	offsets[TYPE_TEXTURE_INDEX] = offsets[TYPE_VERTEX] + 12;
 	
-	return offset+16;
+	return offset;
 }
 
 //static 
@@ -896,74 +844,101 @@ LLVertexBuffer::~LLVertexBuffer()
 
 //----------------------------------------------------------------------------
 
+// batch glGenBuffers: returns one buffer name from a per-thread pool that is refilled pool_size names at a time
+static GLuint gen_buffer()
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    constexpr U32 pool_size = 4096;
+
+    thread_local static GLuint sNamePool[pool_size]; // names generated but not yet handed out (per thread)
+    thread_local static U32 sIndex = 0; // count of unused names remaining in sNamePool
+
+    if (sIndex == 0)
+    { // pool exhausted (or first call on this thread): generate a fresh batch
+        LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("gen ibo");
+        sIndex = pool_size;
+        glGenBuffers(pool_size, sNamePool);
+    }
+
+    return sNamePool[--sIndex]; // hand names out from the back of the pool
+}
+
+// batch glDeleteBuffers (batching is compiled out by the #if 0 below; each call currently deletes one name immediately)
+static void release_buffer(U32 buff)
+{
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+#if 0 // disabled batched path: collects names and deletes pool_size of them at once
+
+    constexpr U32 pool_size = 4096;
+
+    thread_local static GLuint sNamePool[pool_size];
+    thread_local static U32 sIndex = 0; // number of names currently waiting in sNamePool
+
+    if (sIndex == pool_size)
+    { // pool full: delete the whole batch before storing the new name
+        LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("gen ibo");
+        sIndex = 0;
+        glDeleteBuffers(pool_size, sNamePool);
+    }
+
+    sNamePool[sIndex++] = buff; // NOTE(review): names left in a partially filled pool are never deleted by this path
+#else
+    glDeleteBuffers(1, &buff);
+#endif
+}
+
 void LLVertexBuffer::genBuffer(U32 size)
 {
-	mSize = vbo_block_size(size);
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
 
-	if (mUsage == GL_STREAM_DRAW)
-	{
-		mMappedData = sStreamVBOPool.allocate(mGLBuffer, mSize);
-	}
-	else if (mUsage == GL_DYNAMIC_DRAW)
-	{
-		mMappedData = sDynamicVBOPool.allocate(mGLBuffer, mSize);
-	}
-	else
-	{
-		mMappedData = sDynamicCopyVBOPool.allocate(mGLBuffer, mSize);
-	}
-	
-	
-	sGLCount++;
+    mSize = size; // exact requested size; no longer rounded up with vbo_block_size()
+    mMappedData = (U8*) ll_aligned_malloc_16(size); // client-side copy; NOTE(review): result is not checked for nullptr
+    mGLBuffer = gen_buffer();
+
+    glBindBuffer(GL_ARRAY_BUFFER, mGLBuffer);
+    glBufferData(GL_ARRAY_BUFFER, mSize, nullptr, mUsage); // allocate GL storage only; contents are uploaded later
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    sGLCount++;
 }
 
 void LLVertexBuffer::genIndices(U32 size)
 {
-	mIndicesSize = vbo_block_size(size);
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+
+    mIndicesSize = size; // exact requested size; no longer rounded up with vbo_block_size()
+    mMappedIndexData = (U8*) ll_aligned_malloc_16(size); // client-side copy; NOTE(review): result is not checked for nullptr
+
+    mGLIndices = gen_buffer();
+
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, mGLIndices);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, mIndicesSize, nullptr, mUsage); // allocate GL storage only; contents are uploaded later
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
 
-	if (mUsage == GL_STREAM_DRAW)
-	{
-		mMappedIndexData = sStreamIBOPool.allocate(mGLIndices, mIndicesSize);
-	}
-	else
-	{
-		mMappedIndexData = sDynamicIBOPool.allocate(mGLIndices, mIndicesSize);
-	}
-	
 	sGLCount++;
 }
 
 void LLVertexBuffer::releaseBuffer()
 {
-	if (mUsage == GL_STREAM_DRAW)
-	{
-		sStreamVBOPool.release(mGLBuffer, mMappedData, mSize);
-	}
-	else
-	{
-		sDynamicVBOPool.release(mGLBuffer, mMappedData, mSize);
-	}
-	
-	mGLBuffer = 0;
-	mMappedData = NULL;
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    release_buffer(mGLBuffer); // NOTE(review): the removed pool path called LLVertexBuffer::unbind() before deleting; confirm that is no longer required
+    mGLBuffer = 0;
 
+    ll_aligned_free_16(mMappedData); // frees the client-side copy allocated in genBuffer
+    mMappedData = nullptr;
+	
 	sGLCount--;
 }
 
 void LLVertexBuffer::releaseIndices()
 {
-	if (mUsage == GL_STREAM_DRAW)
-	{
-		sStreamIBOPool.release(mGLIndices, mMappedIndexData, mIndicesSize);
-	}
-	else
-	{
-		sDynamicIBOPool.release(mGLIndices, mMappedIndexData, mIndicesSize);
-	}
+    LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX;
+    release_buffer(mGLIndices); // NOTE(review): the removed pool path called LLVertexBuffer::unbind() before deleting; confirm that is no longer required
+    mGLIndices = 0;
+
+    ll_aligned_free_16(mMappedIndexData); // frees the client-side copy allocated in genIndices
+    mMappedIndexData = nullptr;
 
-	mGLIndices = 0;
-	mMappedIndexData = NULL;
-	
 	sGLCount--;
 }
 
@@ -1183,21 +1158,20 @@ bool LLVertexBuffer::useVBOs() const
 
 //----------------------------------------------------------------------------
 
-bool expand_region(LLVertexBuffer::MappedRegion& region, S32 index, S32 count)
+// if no gap between region and given range [start, end] exists (overlapping or touching), expand region to cover given range and return true
+// otherwise leave region unchanged and return false
+bool expand_region(LLVertexBuffer::MappedRegion& region, S32 start, S32 end)
 {
-	S32 end = index+count;
-	S32 region_end = region.mIndex+region.mCount;
 	
-	if (end < region.mIndex ||
-		index > region_end)
+	if (end < region.mStart || // range lies entirely before the region
+		start > region.mEnd) // range lies entirely after the region
 	{ //gap exists, do not merge
 		return false;
 	}
 
-	S32 new_end = llmax(end, region_end);
-	S32 new_index = llmin(index, region.mIndex);
-	region.mIndex = new_index;
-	region.mCount = new_end-new_index;
+    region.mStart = llmin(region.mStart, start); // grow to the union of both ranges
+    region.mEnd = llmax(region.mEnd, end);
+
 	return true;
 }
 
@@ -1215,34 +1189,34 @@ U8* LLVertexBuffer::mapVertexBuffer(S32 type, S32 index, S32 count, bool map_ran
 	{
 		LL_ERRS() << "LLVertexBuffer::mapVertexBuffer() called on unallocated buffer." << LL_ENDL;
 	}
-		
-	if (useVBOs())
-	{
-		if (count == -1)
-		{
-			count = mNumVerts-index;
-		}
 
-		bool mapped = false;
-		//see if range is already mapped
+
+    if (useVBOs())
+    {
+        if (count == -1)
+        {
+            count = mNumVerts - index;
+        }
+
+        S32 start = mOffsets[type] + sTypeSize[type] * index;
+        S32 end = start + sTypeSize[type] * count;
+
+		bool flagged = false;
+		// flag region as mapped
 		for (U32 i = 0; i < mMappedVertexRegions.size(); ++i)
 		{
 			MappedRegion& region = mMappedVertexRegions[i];
-			if (region.mType == type)
-			{
-				if (expand_region(region, index, count))
-				{
-					mapped = true;
-					break;
-				}
-			}
+            if (expand_region(region, start, end))
+            {
+                flagged = true;
+                break;
+            }
 		}
 
-		if (!mapped)
+		if (!flagged)
 		{
-			//not already mapped, map new region
-			MappedRegion region(type, index, count);
-			mMappedVertexRegions.push_back(region);
+			//didn't expand an existing region, make a new one
+            mMappedVertexRegions.push_back({ start, end });
 		}
 
 		if (mVertexLocked && map_range)
@@ -1299,25 +1273,26 @@ U8* LLVertexBuffer::mapIndexBuffer(S32 index, S32 count, bool map_range)
 			count = mNumIndices-index;
 		}
 
-		bool mapped = false;
-		//see if range is already mapped
-		for (U32 i = 0; i < mMappedIndexRegions.size(); ++i)
-		{
-			MappedRegion& region = mMappedIndexRegions[i];
-			if (expand_region(region, index, count))
-			{
-				mapped = true;
-				break;
-			}
-		}
+        S32 start = sizeof(U16) * index;
+        S32 end = start + sizeof(U16) * count;
 
-		if (!mapped)
-		{
-			//not already mapped, map new region
-			MappedRegion region(TYPE_INDEX, index, count);
-			mMappedIndexRegions.push_back(region);
-		}
-		
+        bool flagged = false;
+        // flag region as mapped
+        for (U32 i = 0; i < mMappedIndexRegions.size(); ++i)
+        {
+            MappedRegion& region = mMappedIndexRegions[i];
+            if (expand_region(region, start, end))
+            {
+                flagged = true;
+                break;
+            }
+        }
+
+        if (!flagged)
+        {
+            //didn't expand an existing region, make a new one
+            mMappedIndexRegions.push_back({ start, end });
+        }
 
 		if (mIndexLocked && map_range)
 		{
@@ -1360,6 +1335,27 @@ U8* LLVertexBuffer::mapIndexBuffer(S32 index, S32 count, bool map_range)
     return mMappedIndexData + sizeof(U16)*index;
 }
 
+static void flush_vbo(GLenum target, S32 start, S32 end, void* data) // upload buffer bytes [start, end) via glBufferSubData; data points at the byte for offset start
+{
+    if (end != 0) // callers start with start == end == 0 meaning "nothing accumulated yet"
+    {
+        LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData");
+        LL_PROFILE_ZONE_NUM(start);
+        LL_PROFILE_ZONE_NUM(end);
+        LL_PROFILE_ZONE_NUM(end-start);
+
+        constexpr S32 block_size = 65536; // upload in chunks of at most 64 KiB
+
+        for (S32 i = start; i < end; i += block_size) // i is the byte offset within the GL buffer
+        {
+            LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block");
+            LL_PROFILE_GPU_ZONE("glBufferSubData");
+            S32 tend = llmin(i + block_size, end); // exclusive end of this chunk
+            glBufferSubData(target, i, tend - i, (U8*) data + (i-start)); // (i-start) rebases the buffer offset onto data
+        }
+    }
+}
+
 void LLVertexBuffer::unmapBuffer()
 {
 	if (!useVBOs())
@@ -1377,37 +1373,31 @@ void LLVertexBuffer::unmapBuffer()
 
 		if (!mMappedVertexRegions.empty())
 		{
-			stop_glerror();
+            S32 start = 0;
+            S32 end = 0;
+
 			for (U32 i = 0; i < mMappedVertexRegions.size(); ++i)
 			{
 				const MappedRegion& region = mMappedVertexRegions[i];
-				S32 offset = region.mIndex >= 0 ? mOffsets[region.mType]+sTypeSize[region.mType]*region.mIndex : 0;
-				S32 length = sTypeSize[region.mType]*region.mCount;
-				if (mSize >= length + offset)
-				{
-					glBufferSubData(GL_ARRAY_BUFFER, offset, length, (U8*)mMappedData + offset);
-				}
-				else
-				{
-					GLint size = 0;
-					glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, &size);
-					LL_WARNS() << "Attempted to map regions to a buffer that is too small, " 
-						<< "mapped size: " << mSize
-						<< ", gl buffer size: " << size
-						<< ", length: " << length
-						<< ", offset: " << offset
-						<< LL_ENDL;
-				}
-				stop_glerror();
+                if (region.mStart == end) // mEnd is exclusive (start + size*count), so a contiguous region begins exactly at end
+                { // extend the pending range instead of issuing a separate upload
+                    end = region.mEnd;
+                }
+                else
+                { // gap (or out-of-order region): upload what has accumulated, then start a new pending range
+                    flush_vbo(GL_ARRAY_BUFFER, start, end, (U8*)mMappedData + start);
+                    start = region.mStart;
+                    end = region.mEnd;
+                }
 			}
 
+            flush_vbo(GL_ARRAY_BUFFER, start, end, (U8*)mMappedData + start);
+
 			mMappedVertexRegions.clear();
 		}
 		else
 		{
-			stop_glerror();
-			glBufferSubData(GL_ARRAY_BUFFER, 0, getSize(), (U8*) mMappedData);
-			stop_glerror();
+            llassert(false); // this shouldn't happen -- a buffer must always be explicitly mapped
 		}
 		
 		mVertexLocked = false;
@@ -1421,36 +1411,31 @@ void LLVertexBuffer::unmapBuffer()
 		
 		if (!mMappedIndexRegions.empty())
 		{
-			for (U32 i = 0; i < mMappedIndexRegions.size(); ++i)
-			{
-				const MappedRegion& region = mMappedIndexRegions[i];
-				S32 offset = region.mIndex >= 0 ? sizeof(U16)*region.mIndex : 0;
-				S32 length = sizeof(U16)*region.mCount;
-				if (mIndicesSize >= length + offset)
-				{
-					glBufferSubData(GL_ELEMENT_ARRAY_BUFFER, offset, length, (U8*) mMappedIndexData+offset);
-				}
-				else
-				{
-					GLint size = 0;
-					glGetBufferParameteriv(GL_ELEMENT_ARRAY_BUFFER, GL_BUFFER_SIZE, &size);
-					LL_WARNS() << "Attempted to map regions to a buffer that is too small, " 
-						<< "mapped size: " << mIndicesSize
-						<< ", gl buffer size: " << size
-						<< ", length: " << length
-						<< ", offset: " << offset
-						<< LL_ENDL;
-				}
-				stop_glerror();
-			}
+            S32 start = 0;
+            S32 end = 0;
+
+            for (U32 i = 0; i < mMappedIndexRegions.size(); ++i)
+            {
+                const MappedRegion& region = mMappedIndexRegions[i];
+                if (region.mStart == end) // mEnd is exclusive (start + sizeof(U16)*count), so a contiguous region begins exactly at end
+                { // extend the pending range instead of issuing a separate upload
+                    end = region.mEnd;
+                }
+                else
+                { // gap (or out-of-order region): upload what has accumulated, then start a new pending range
+                    flush_vbo(GL_ELEMENT_ARRAY_BUFFER, start, end, (U8*)mMappedIndexData + start);
+                    start = region.mStart;
+                    end = region.mEnd;
+                }
+            }
+
+            flush_vbo(GL_ELEMENT_ARRAY_BUFFER, start, end, (U8*)mMappedIndexData + start);
 
 			mMappedIndexRegions.clear();
 		}
 		else
 		{
-			stop_glerror();
-			glBufferSubData(GL_ELEMENT_ARRAY_BUFFER, 0, getIndicesSize(), (U8*) mMappedIndexData);
-			stop_glerror();
+            llassert(false); // this shouldn't happen -- a buffer must always be explicitly mapped
 		}
 		
 		mIndexLocked = false;
@@ -1640,11 +1625,53 @@ bool LLVertexBuffer::bindGLIndicesFast()
     return false;
 }
 
-void LLVertexBuffer::flush()
+void LLVertexBuffer::flush(bool discard) // push pending client-side data to GL; discard=true replaces the GL buffer objects instead of updating them in place
 {
 	if (useVBOs())
 	{
-		unmapBuffer();
+        if (discard)
+        { // discard existing VBO data if the buffer must be updated
+            
+            if (!mMappedVertexRegions.empty()) // only act when something was mapped since the last flush
+            {
+                LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("flush discard vbo");
+                LL_PROFILE_ZONE_NUM(mSize);
+                release_buffer(mGLBuffer); // drop the old GL buffer object entirely
+                mGLBuffer = gen_buffer();
+                bindGLBuffer();
+                {
+                    LL_PROFILE_GPU_ZONE("glBufferData");
+                    glBufferData(GL_ARRAY_BUFFER, mSize, nullptr, mUsage); // allocate storage, then fill it in chunks below
+
+                    for (int i = 0; i < mSize; i += 65536) // re-upload the whole client copy, 64 KiB at a time
+                    {
+                        LL_PROFILE_GPU_ZONE("glBufferSubData");
+                        S32 end = llmin(i + 65536, mSize);
+                        S32 count = end - i;
+                        glBufferSubData(GL_ARRAY_BUFFER, i, count, mMappedData + i);
+                    }
+                }
+                mMappedVertexRegions.clear(); // everything has been uploaded
+            }
+            if (!mMappedIndexRegions.empty()) // same for the index buffer, uploaded in a single glBufferData call
+            {
+                LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("flush discard ibo");
+                LL_PROFILE_ZONE_NUM(mIndicesSize);
+                release_buffer(mGLIndices);
+                mGLIndices = gen_buffer();
+                bindGLIndices();
+                {
+                    LL_PROFILE_GPU_ZONE("glBufferData (ibo)");
+                    glBufferData(GL_ELEMENT_ARRAY_BUFFER, mIndicesSize, mMappedIndexData, mUsage);
+                }
+                mMappedIndexRegions.clear();
+            }
+        } // NOTE(review): this path bypasses unmapBuffer(), which clears mVertexLocked/mIndexLocked -- confirm that bookkeeping is not needed here
+        else
+        {
+            unmapBuffer(); // in-place update of only the mapped regions
+        }
+
 	}
 }
 
@@ -2045,10 +2072,4 @@ void LLVertexBuffer::setupVertexBufferFast(U32 data_mask)
     }
 	}
 
-LLVertexBuffer::MappedRegion::MappedRegion(S32 type, S32 index, S32 count)
-: mType(type), mIndex(index), mCount(count)
-{ 
-	mEnd = mIndex+mCount;	
-}	
-
 
diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h
index bb7460fb2a..74b951884d 100644
--- a/indra/llrender/llvertexbuffer.h
+++ b/indra/llrender/llvertexbuffer.h
@@ -51,66 +51,15 @@
 
 
 //============================================================================
-// gl name pools for dynamic and streaming buffers
-class LLVBOPool
-{
-public:
-	static U32 sBytesPooled;
-	static U32 sIndexBytesPooled;
-	
-	LLVBOPool(U32 vboUsage, U32 vboType);
-		
-	const U32 mUsage;
-	const U32 mType;
-
-	//size MUST be a power of 2
-	U8* allocate(U32& name, U32 size, bool for_seed = false);
-	
-	//size MUST be the size provided to allocate that returned the given name
-	void release(U32 name, U8* buffer, U32 size);
-	
-	//batch allocate buffers to be provided to the application on demand
-	void seedPool();
-
-	//destroy all records in mFreeList
-	void cleanup();
-
-	U32 genBuffer();
-	void deleteBuffer(U32 name);
-
-	class Record
-	{
-	public:
-		U32 mGLName;
-		U8* mClientData;
-	};
-
-	typedef std::list<Record> record_list_t;
-	std::vector<record_list_t> mFreeList;
-	std::vector<U32> mMissCount;
-    bool mMissCountDirty;   // flag any changes to mFreeList or mMissCount
-
-	//used to avoid calling glGenBuffers for every VBO creation
-	static U32 sNamePool[1024];
-	static U32 sNameIdx;
-};
-
-
-//============================================================================
 // base class 
 class LLPrivateMemoryPool;
 class LLVertexBuffer : public LLRefCount
 {
 public:
-	class MappedRegion
+	struct MappedRegion // byte range of the client-side buffer copy that was mapped and still needs uploading
 	{
-	public:
-		S32 mType;
-		S32 mIndex;
-		S32 mCount;
-		S32 mEnd;
-		
-		MappedRegion(S32 type, S32 index, S32 count);
+        S32 mStart; // byte offset of the first mapped byte within the buffer
+        S32 mEnd; // byte offset one past the last mapped byte (computed as start + element size * count)
 	};
 
 	LLVertexBuffer(const LLVertexBuffer& rhs)
@@ -125,12 +74,6 @@ public:
 		return *this;
 	}
 
-	static LLVBOPool sStreamVBOPool;
-	static LLVBOPool sDynamicVBOPool;
-	static LLVBOPool sDynamicCopyVBOPool;
-	static LLVBOPool sStreamIBOPool;
-	static LLVBOPool sDynamicIBOPool;
-
 	static std::list<U32> sAvailableVAOName;
 	static U32 sCurVAOName;
 
@@ -138,12 +81,10 @@ public:
 	static bool sUseVAO;
 	static bool	sPreferStreamDraw;
 
-	static void seedPools();
-
 	static U32 getVAOName();
 	static void releaseVAOName(U32 name);
 
-	static void initClass(bool use_vbo, bool no_vbo_mapping);
+	static void initClass(LLWindow* window);
 	static void cleanupClass();
 	static void setupClientArrays(U32 data_mask);
 	static void drawArrays(U32 mode, const std::vector<LLVector3>& pos);
@@ -240,7 +181,7 @@ public:
 	virtual void	setBuffer(U32 data_mask); 	// calls  setupVertexBuffer() if data_mask is not 0
     void	setBufferFast(U32 data_mask); 	// calls setupVertexBufferFast(), assumes data_mask is not 0 among other assumptions
 
-	void flush(); //flush pending data to GL memory
+    void flush(bool discard = false); //flush pending data to GL memory, if discard is true, discard previous VBO
 	// allocate buffer
 	bool	allocateBuffer(S32 nverts, S32 nindices, bool create);
 	virtual bool resizeBuffer(S32 newnverts, S32 newnindices);
diff --git a/indra/newview/llviewerdisplay.cpp b/indra/newview/llviewerdisplay.cpp
index 01fca47184..c6d2b476db 100644
--- a/indra/newview/llviewerdisplay.cpp
+++ b/indra/newview/llviewerdisplay.cpp
@@ -710,12 +710,6 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot)
 
 			if (!for_snapshot)
 			{
-				if (gFrameCount > 1)
-				{ //for some reason, ATI 4800 series will error out if you 
-				  //try to generate a shadow before the first frame is through
-					gPipeline.generateSunShadow(*LLViewerCamera::getInstance());
-				}
-
 				LLVertexBuffer::unbind();
 
 				LLGLState::checkStates();
@@ -936,8 +930,7 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot)
 			else
 			{
 				gPipeline.renderGeom(*LLViewerCamera::getInstance(), TRUE);
-			}
-			
+			}			
 			gGL.setColorMask(true, true);
 
 			//store this frame's modelview matrix for use
@@ -967,6 +960,14 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot)
         LLRenderTarget &rt = (gPipeline.sRenderDeferred ? gPipeline.mRT->deferredScreen : gPipeline.mRT->screen);
         rt.flush();
 
+
+        if (gFrameCount > 1 && !for_snapshot)
+        { //for some reason, ATI 4800 series will error out if you 
+          //try to generate a shadow before the first frame is through
+            gPipeline.generateSunShadow(*LLViewerCamera::getInstance());
+        }
+
+
         if (LLPipeline::sRenderDeferred)
         {
 			gPipeline.renderDeferredLighting();
diff --git a/indra/newview/llviewerwindow.cpp b/indra/newview/llviewerwindow.cpp
index bc4f00bd3f..5848cbfd9d 100644
--- a/indra/newview/llviewerwindow.cpp
+++ b/indra/newview/llviewerwindow.cpp
@@ -658,12 +658,6 @@ public:
 			
 			}
 
-			addText(xpos, ypos, llformat("%d MB Index Data (%d MB Pooled, %d KIndices)", LLVertexBuffer::sAllocatedIndexBytes/(1024*1024), LLVBOPool::sIndexBytesPooled/(1024*1024), LLVertexBuffer::sIndexCount/1024));
-			ypos += y_inc;
-
-			addText(xpos, ypos, llformat("%d MB Vertex Data (%d MB Pooled, %d KVerts)", LLVertexBuffer::sAllocatedBytes/(1024*1024), LLVBOPool::sBytesPooled/(1024*1024), LLVertexBuffer::sVertexCount/1024));
-			ypos += y_inc;
-
 			addText(xpos, ypos, llformat("%d Vertex Buffers", LLVertexBuffer::sGLCount));
 			ypos += y_inc;
 
@@ -1974,7 +1968,7 @@ LLViewerWindow::LLViewerWindow(const Params& p)
 	LL_DEBUGS("Window") << "Loading feature tables." << LL_ENDL;
 
 	// Initialize OpenGL Renderer
-	LLVertexBuffer::initClass(gSavedSettings.getBOOL("RenderVBOEnable"), gSavedSettings.getBOOL("RenderVBOMappingDisable"));
+	LLVertexBuffer::initClass(mWindow);
 	LL_INFOS("RenderInit") << "LLVertexBuffer initialization done." << LL_ENDL ;
 	gGL.init(true);
 
diff --git a/indra/newview/pipeline.cpp b/indra/newview/pipeline.cpp
index 5e585852f4..d56b31a372 100644
--- a/indra/newview/pipeline.cpp
+++ b/indra/newview/pipeline.cpp
@@ -2338,6 +2338,7 @@ static LLTrace::BlockTimerStatHandle FTM_CULL("Object Culling");
 void LLPipeline::updateCull(LLCamera& camera, LLCullResult& result)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_PIPELINE; //LL_RECORD_BLOCK_TIME(FTM_CULL);
+    LL_PROFILE_GPU_ZONE("updateCull"); // should always be zero GPU time, but drop a timer to flush stuff out
 
     bool water_clip = !sRenderTransparentWater;
 
@@ -2649,10 +2650,6 @@ void LLPipeline::updateGL()
 			LLGLUpdate::sGLQ.pop_front();
 		}
 	}
-
-	{ //seed VBO Pools
-		LLVertexBuffer::seedPools();
-	}
 }
 
 void LLPipeline::clearRebuildGroups()
@@ -3229,6 +3226,7 @@ void LLPipeline::markRebuild(LLDrawable *drawablep, LLDrawable::EDrawableFlags f
 void LLPipeline::stateSort(LLCamera& camera, LLCullResult &result)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_PIPELINE;
+    LL_PROFILE_GPU_ZONE("stateSort");
 
 	if (hasAnyRenderType(LLPipeline::RENDER_TYPE_AVATAR,
 					  LLPipeline::RENDER_TYPE_CONTROL_AV,
@@ -3837,6 +3835,7 @@ void LLPipeline::postSort(LLCamera &camera)
     // flush particle VB
     if (LLVOPartGroup::sVB)
     {
+        LL_PROFILE_GPU_ZONE("flush particle vb");
         LLVOPartGroup::sVB->flush();
     }
     else
@@ -3860,9 +3859,12 @@ void LLPipeline::postSort(LLCamera &camera)
     }*/
 
     // pack vertex buffers for groups that chose to delay their updates
-    for (LLSpatialGroup::sg_vector_t::iterator iter = mMeshDirtyGroup.begin(); iter != mMeshDirtyGroup.end(); ++iter)
     {
-        (*iter)->rebuildMesh();
+        LL_PROFILE_GPU_ZONE("rebuildMesh");
+        for (LLSpatialGroup::sg_vector_t::iterator iter = mMeshDirtyGroup.begin(); iter != mMeshDirtyGroup.end(); ++iter)
+        {
+            (*iter)->rebuildMesh();
+        }
     }
 
     /*if (use_transform_feedback)
@@ -7259,8 +7261,6 @@ void LLPipeline::doResetVertexBuffers(bool forced)
 	LLVOPartGroup::destroyGL();
     gGL.resetVertexBuffer();
 
-	SUBSYSTEM_CLEANUP(LLVertexBuffer);
-	
 	if (LLVertexBuffer::sGLCount != 0)
 	{
 		LL_WARNS() << "VBO wipe failed -- " << LLVertexBuffer::sGLCount << " buffers remaining." << LL_ENDL;
@@ -7280,7 +7280,6 @@ void LLPipeline::doResetVertexBuffers(bool forced)
 	sNoAlpha = gSavedSettings.getBOOL("RenderNoAlpha");
 	LLPipeline::sTextureBindTest = gSavedSettings.getBOOL("RenderDebugTextureBind");
 
-	LLVertexBuffer::initClass(LLVertexBuffer::sEnableVBOs, LLVertexBuffer::sDisableVBOMapping);
     gGL.initVertexBuffer();
 
     mDeferredVB = new LLVertexBuffer(DEFERRED_VB_MASK, 0);