diff options
-rw-r--r-- | indra/llrender/llrender.cpp | 108 | ||||
-rw-r--r-- | indra/llrender/llvertexbuffer.cpp | 845 | ||||
-rw-r--r-- | indra/llrender/llvertexbuffer.h | 73 | ||||
-rw-r--r-- | indra/newview/llviewerdisplay.cpp | 17 | ||||
-rw-r--r-- | indra/newview/llviewerwindow.cpp | 8 | ||||
-rw-r--r-- | indra/newview/pipeline.cpp | 17 |
6 files changed, 567 insertions, 501 deletions
diff --git a/indra/llrender/llrender.cpp b/indra/llrender/llrender.cpp index 8ab8d3151b..7c974934ae 100644 --- a/indra/llrender/llrender.cpp +++ b/indra/llrender/llrender.cpp @@ -35,6 +35,7 @@ #include "llrendertarget.h" #include "lltexture.h" #include "llshadermgr.h" +#include "llmd5.h" #if LL_WINDOWS extern void APIENTRY gl_debug_callback(GLenum source, @@ -66,6 +67,14 @@ LLVector2 LLRender::sUIGLScaleFactor = LLVector2(1.f, 1.f); static const U32 LL_NUM_TEXTURE_LAYERS = 32; static const U32 LL_NUM_LIGHT_UNITS = 8; +struct LLVBCache +{ + LLPointer<LLVertexBuffer> vb; + std::chrono::steady_clock::time_point touched; +}; + +static std::unordered_map<std::size_t, LLVBCache> sVBCache; + static const GLenum sGLTextureType[] = { GL_TEXTURE_2D, @@ -1635,24 +1644,105 @@ void LLRender::flush() if (mBuffer) { - if (mBuffer->useVBOs() && !mBuffer->isLocked()) - { //hack to only flush the part of the buffer that was updated (relies on stream draw using buffersubdata) - mBuffer->getVertexStrider(mVerticesp, 0, count); - mBuffer->getTexCoord0Strider(mTexcoordsp, 0, count); - mBuffer->getColorStrider(mColorsp, 0, count); + + LLMD5 hash; + U32 attribute_mask = LLGLSLShader::sCurBoundShaderPtr->mAttributeMask; + + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("vb cache hash"); + + hash.update((U8*)mVerticesp.get(), count * sizeof(LLVector4a)); + if (attribute_mask & LLVertexBuffer::MAP_TEXCOORD0) + { + hash.update((U8*)mTexcoordsp.get(), count * sizeof(LLVector2)); + } + + if (attribute_mask & LLVertexBuffer::MAP_COLOR) + { + hash.update((U8*)mColorsp.get(), count * sizeof(LLColor4U)); + } + + hash.finalize(); + } + + size_t vhash[2]; + hash.raw_digest((unsigned char*) vhash); + + // check the VB cache before making a new vertex buffer + // This is a giant hack to deal with (mostly) our terrible UI rendering code + // that was built on top of OpenGL immediate mode. Huge performance wins + // can be had by not uploading geometry to VRAM unless absolutely necessary. + // Most of our usage of the "immediate mode" style draw calls is actually + // sending the same geometry over and over again. + // To leverage this, we maintain a running hash of the vertex stream being + // built up before a flush, and then check that hash against a VB + // cache just before creating a vertex buffer in VRAM + auto& cache = sVBCache.find(vhash[0]); + + LLPointer<LLVertexBuffer> vb; + + if (cache != sVBCache.end()) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("vb cache hit"); + // cache hit, just use the cached buffer + vb = cache->second.vb; + cache->second.touched = std::chrono::steady_clock::now(); + } + else + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("vb cache miss"); + vb = new LLVertexBuffer(attribute_mask, GL_STATIC_DRAW); + vb->allocateBuffer(count, 0, true); + vb->setPositionData((LLVector4a*) mVerticesp.get()); + + if (attribute_mask & LLVertexBuffer::MAP_TEXCOORD0) + { + vb->setTexCoordData(mTexcoordsp.get()); + } + + if (attribute_mask & LLVertexBuffer::MAP_COLOR) + { + vb->setColorData(mColorsp.get()); + } + + vb->unbind(); + + sVBCache[vhash[0]] = { vb , std::chrono::steady_clock::now() }; + + static U32 miss_count = 0; + miss_count++; + if (miss_count > 1024) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("vb cache clean"); + miss_count = 0; + auto now = std::chrono::steady_clock::now(); + + using namespace std::chrono_literals; + // every 1024 misses, clean the cache of any VBs that haven't been touched in the last second + for (auto& iter = sVBCache.begin(); iter != sVBCache.end(); ) + { + if (now - iter->second.touched > 1s) + { + iter = sVBCache.erase(iter); + } + else + { + ++iter; + } + } + } } - mBuffer->flush(); - mBuffer->setBuffer(immediate_mask); + vb->setBuffer(immediate_mask); if (mMode == LLRender::QUADS && sGLCoreProfile) { - mBuffer->drawArrays(LLRender::TRIANGLES, 0, count); + vb->drawArrays(LLRender::TRIANGLES, 0, count); mQuadCycle = 1; } else { - mBuffer->drawArrays(mMode, 0, count); + vb->drawArrays(mMode, 0, count); } } else diff --git a/indra/llrender/llvertexbuffer.cpp b/indra/llrender/llvertexbuffer.cpp index 7b8f85acba..57be21cf6e 100644 --- a/indra/llrender/llvertexbuffer.cpp +++ b/indra/llrender/llvertexbuffer.cpp @@ -62,6 +62,14 @@ U32 wpo2(U32 i) return r; } +struct CompareMappedRegion +{ + bool operator()(const LLVertexBuffer::MappedRegion& lhs, const LLVertexBuffer::MappedRegion& rhs) + { + return lhs.mStart < rhs.mStart; + } +}; + const U32 LL_VBO_BLOCK_SIZE = 2048; const U32 LL_VBO_POOL_MAX_SEED_SIZE = 256*1024; @@ -81,266 +89,217 @@ U32 vbo_block_index(U32 size) const U32 LL_VBO_POOL_SEED_COUNT = vbo_block_index(LL_VBO_POOL_MAX_SEED_SIZE) + 1; +#define ENABLE_GL_WORK_QUEUE 0 + +#if ENABLE_GL_WORK_QUEUE + +#define THREAD_COUNT 1 //============================================================================ -//static -LLVBOPool LLVertexBuffer::sStreamVBOPool(GL_STREAM_DRAW, GL_ARRAY_BUFFER); -LLVBOPool LLVertexBuffer::sDynamicVBOPool(GL_DYNAMIC_DRAW, GL_ARRAY_BUFFER); -LLVBOPool LLVertexBuffer::sDynamicCopyVBOPool(GL_DYNAMIC_COPY, GL_ARRAY_BUFFER); -LLVBOPool LLVertexBuffer::sStreamIBOPool(GL_STREAM_DRAW, GL_ELEMENT_ARRAY_BUFFER); -LLVBOPool LLVertexBuffer::sDynamicIBOPool(GL_DYNAMIC_DRAW, GL_ELEMENT_ARRAY_BUFFER); +// High performance WorkQueue for usage in real-time rendering work +class GLWorkQueue +{ +public: + using Work = std::function<void()>; -U32 LLVBOPool::sBytesPooled = 0; -U32 LLVBOPool::sIndexBytesPooled = 0; -U32 LLVBOPool::sNameIdx = 0; -U32 LLVBOPool::sNamePool[1024]; + GLWorkQueue(); -std::list<U32> LLVertexBuffer::sAvailableVAOName; -U32 LLVertexBuffer::sCurVAOName = 1; + void post(const Work& value); -U32 LLVertexBuffer::sAllocatedIndexBytes = 0; -U32 LLVertexBuffer::sIndexCount = 0; + size_t size(); -U32 LLVertexBuffer::sBindCount = 0; -U32 LLVertexBuffer::sSetCount = 0; -S32 LLVertexBuffer::sCount = 0; -S32 LLVertexBuffer::sGLCount = 0; -S32 LLVertexBuffer::sMappedCount = 0; -bool LLVertexBuffer::sDisableVBOMapping = false; -bool LLVertexBuffer::sEnableVBOs = true; -U32 LLVertexBuffer::sGLRenderBuffer = 0; -U32 LLVertexBuffer::sGLRenderArray = 0; -U32 LLVertexBuffer::sGLRenderIndices = 0; -U32 LLVertexBuffer::sLastMask = 0; -bool LLVertexBuffer::sVBOActive = false; -bool LLVertexBuffer::sIBOActive = false; -U32 LLVertexBuffer::sAllocatedBytes = 0; -U32 LLVertexBuffer::sVertexCount = 0; -bool LLVertexBuffer::sMapped = false; -bool LLVertexBuffer::sUseStreamDraw = true; -bool LLVertexBuffer::sUseVAO = false; -bool LLVertexBuffer::sPreferStreamDraw = false; + bool done(); -U32 LLVBOPool::genBuffer() -{ - LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX + // Get the next element from the queue + Work pop(); - if (sNameIdx == 0) - { - glGenBuffers(1024, sNamePool); - sNameIdx = 1024; - } + void runOne(); - return sNamePool[--sNameIdx]; -} + bool runPending(); -void LLVBOPool::deleteBuffer(U32 name) -{ - LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX - if (gGLManager.mInited) - { - LLVertexBuffer::unbind(); + void runUntilClose(); - glBindBuffer(mType, name); - glBufferData(mType, 0, NULL, mUsage); - glBindBuffer(mType, 0); + void close(); - glDeleteBuffers(1, &name); - } -} + bool isClosed(); + void syncGL(); -LLVBOPool::LLVBOPool(U32 vboUsage, U32 vboType) -: mUsage(vboUsage), mType(vboType), mMissCountDirty(true) -{ - mFreeList.resize(LL_VBO_POOL_SEED_COUNT); - mMissCount.resize(LL_VBO_POOL_SEED_COUNT); - std::fill(mMissCount.begin(), mMissCount.end(), 0); -} +private: + std::mutex mMutex; + std::condition_variable mCondition; + std::queue<Work> mQueue; + bool mClosed = false; +}; -U8* LLVBOPool::allocate(U32& name, U32 size, bool for_seed) +GLWorkQueue::GLWorkQueue() { - LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX - llassert(vbo_block_size(size) == size); - - U8* ret = NULL; - U32 i = vbo_block_index(size); +} - if (mFreeList.size() <= i) - { - mFreeList.resize(i+1); - } +void GLWorkQueue::syncGL() +{ + /*if (mSync) + { + std::lock_guard<std::mutex> lock(mMutex); + glWaitSync(mSync, 0, GL_TIMEOUT_IGNORED); + mSync = 0; + }*/ +} - if (mFreeList[i].empty() || for_seed) - { - //make a new buffer - name = genBuffer(); +size_t GLWorkQueue::size() +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD; + std::lock_guard<std::mutex> lock(mMutex); + return mQueue.size(); +} - glBindBuffer(mType, name); +bool GLWorkQueue::done() +{ + return size() == 0 && isClosed(); +} - if (!for_seed && i < LL_VBO_POOL_SEED_COUNT) - { //record this miss - mMissCount[i]++; - mMissCountDirty = true; // signal to ::seedPool() - } +void GLWorkQueue::post(const GLWorkQueue::Work& value) +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD; + { + std::lock_guard<std::mutex> lock(mMutex); + mQueue.push(std::move(value)); + } - if (mType == GL_ARRAY_BUFFER) - { - LLVertexBuffer::sAllocatedBytes += size; - } - else - { - LLVertexBuffer::sAllocatedIndexBytes += size; - } + mCondition.notify_one(); +} - if (LLVertexBuffer::sDisableVBOMapping || mUsage != GL_DYNAMIC_DRAW) - { - glBufferData(mType, size, 0, mUsage); - if (mUsage != GL_DYNAMIC_COPY) - { //data will be provided by application - ret = (U8*) ll_aligned_malloc<64>(size); - if (!ret) - { - LL_ERRS() - << "Failed to allocate " << size << " bytes for LLVBOPool buffer " << name << "." << LL_NEWLINE - << "Free list size: " - << mFreeList.size() // this happens if we are out of memory so a solution might be to clear some from freelist - << " Allocated Bytes: " << LLVertexBuffer::sAllocatedBytes - << " Allocated Index Bytes: " << LLVertexBuffer::sAllocatedIndexBytes << " Pooled Bytes: " << sBytesPooled - << " Pooled Index Bytes: " << sIndexBytesPooled << LL_ENDL; - } - } - } - else - { //always use a true hint of static draw when allocating non-client-backed buffers - glBufferData(mType, size, 0, GL_STATIC_DRAW); - } +// Get the next element from the queue +GLWorkQueue::Work GLWorkQueue::pop() +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD; + // Lock the mutex + { + std::unique_lock<std::mutex> lock(mMutex); - glBindBuffer(mType, 0); + // Wait for a new element to become available or for the queue to close + { + mCondition.wait(lock, [=] { return !mQueue.empty() || mClosed; }); + } + } - if (for_seed) - { //put into pool for future use - llassert(mFreeList.size() > i); + Work ret; - Record rec; - rec.mGLName = name; - rec.mClientData = ret; - - if (mType == GL_ARRAY_BUFFER) - { - sBytesPooled += size; - } - else - { - sIndexBytesPooled += size; - } - mFreeList[i].push_back(rec); - mMissCountDirty = true; // signal to ::seedPool() - } - } - else - { - name = mFreeList[i].front().mGLName; - ret = mFreeList[i].front().mClientData; + { + std::lock_guard<std::mutex> lock(mMutex); - if (mType == GL_ARRAY_BUFFER) - { - sBytesPooled -= size; - } - else - { - sIndexBytesPooled -= size; - } + // Get the next element from the queue + if (mQueue.size() > 0) + { + ret = mQueue.front(); + mQueue.pop(); + } + else + { + ret = []() {}; + } + } - mFreeList[i].pop_front(); - mMissCountDirty = true; // signal to ::seedPool() - } + return ret; +} - return ret; +void GLWorkQueue::runOne() +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD; + Work w = pop(); + w(); + //mSync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); } -void LLVBOPool::release(U32 name, U8* buffer, U32 size) +void GLWorkQueue::runUntilClose() { - llassert(vbo_block_size(size) == size); + while (!isClosed()) + { + runOne(); + } +} - deleteBuffer(name); - ll_aligned_free_fallback((U8*) buffer); +void GLWorkQueue::close() +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD; + { + std::lock_guard<std::mutex> lock(mMutex); + mClosed = true; + } - if (mType == GL_ARRAY_BUFFER) - { - LLVertexBuffer::sAllocatedBytes -= size; - } - else - { - LLVertexBuffer::sAllocatedIndexBytes -= size; - } + mCondition.notify_all(); } -void LLVBOPool::seedPool() +bool GLWorkQueue::isClosed() { - LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX - if (mMissCountDirty) - { - U32 dummy_name = 0; - U32 size = LL_VBO_BLOCK_SIZE; - - for (U32 i = 0; i < LL_VBO_POOL_SEED_COUNT; i++) - { - if (mMissCount[i] > mFreeList[i].size()) - { - S32 count = mMissCount[i] - mFreeList[i].size(); - for (U32 j = 0; j < count; ++j) - { - allocate(dummy_name, size, true); - } - } - size += LL_VBO_BLOCK_SIZE; - } - mMissCountDirty = false; - } + LL_PROFILE_ZONE_SCOPED_CATEGORY_THREAD; + std::lock_guard<std::mutex> lock(mMutex); + return mClosed; } -void LLVBOPool::cleanup() +#include "llwindow.h" + +class LLGLWorkerThread : public LLThread { - U32 size = LL_VBO_BLOCK_SIZE; +public: + LLGLWorkerThread(const std::string& name, GLWorkQueue* queue, LLWindow* window) + : LLThread(name) + { + mWindow = window; + mContext = mWindow->createSharedContext(); + mQueue = queue; + } - for (U32 i = 0; i < mFreeList.size(); ++i) - { - record_list_t& l = mFreeList[i]; + void run() override + { + mWindow->makeContextCurrent(mContext); + gGL.init(false); + mQueue->runUntilClose(); + gGL.shutdown(); + mWindow->destroySharedContext(mContext); + } - while (!l.empty()) - { - Record& r = l.front(); + GLWorkQueue* mQueue; + LLWindow* mWindow; + void* mContext = nullptr; +}; - deleteBuffer(r.mGLName); - - if (r.mClientData) - { - ll_aligned_free<64>((void*) r.mClientData); - } - l.pop_front(); +static LLGLWorkerThread* sVBOThread[THREAD_COUNT]; +static GLWorkQueue* sQueue = nullptr; - if (mType == GL_ARRAY_BUFFER) - { - sBytesPooled -= size; - LLVertexBuffer::sAllocatedBytes -= size; - } - else - { - sIndexBytesPooled -= size; - LLVertexBuffer::sAllocatedIndexBytes -= size; - } - } +#endif - size += LL_VBO_BLOCK_SIZE; - } +//============================================================================ - //reset miss counts - std::fill(mMissCount.begin(), mMissCount.end(), 0); -} +//static +std::list<U32> LLVertexBuffer::sAvailableVAOName; +U32 LLVertexBuffer::sCurVAOName = 1; + +U32 LLVertexBuffer::sAllocatedIndexBytes = 0; +U32 LLVertexBuffer::sIndexCount = 0; + +U32 LLVertexBuffer::sBindCount = 0; +U32 LLVertexBuffer::sSetCount = 0; +S32 LLVertexBuffer::sCount = 0; +S32 LLVertexBuffer::sGLCount = 0; +S32 LLVertexBuffer::sMappedCount = 0; +bool LLVertexBuffer::sDisableVBOMapping = false; +bool LLVertexBuffer::sEnableVBOs = true; +U32 LLVertexBuffer::sGLRenderBuffer = 0; +U32 LLVertexBuffer::sGLRenderArray = 0; +U32 LLVertexBuffer::sGLRenderIndices = 0; +U32 LLVertexBuffer::sLastMask = 0; +bool LLVertexBuffer::sVBOActive = false; +bool LLVertexBuffer::sIBOActive = false; +U32 LLVertexBuffer::sAllocatedBytes = 0; +U32 LLVertexBuffer::sVertexCount = 0; +bool LLVertexBuffer::sMapped = false; +bool LLVertexBuffer::sUseStreamDraw = true; +bool LLVertexBuffer::sUseVAO = false; +bool LLVertexBuffer::sPreferStreamDraw = false; //NOTE: each component must be AT LEAST 4 bytes in size to avoid a performance penalty on AMD hardware @@ -420,17 +379,6 @@ void LLVertexBuffer::releaseVAOName(U32 name) //static -void LLVertexBuffer::seedPools() -{ - LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX - sStreamVBOPool.seedPool(); - sDynamicVBOPool.seedPool(); - sDynamicCopyVBOPool.seedPool(); - sStreamIBOPool.seedPool(); - sDynamicIBOPool.seedPool(); -} - -//static void LLVertexBuffer::setupClientArrays(U32 data_mask) { if (sLastMask != data_mask) @@ -473,7 +421,7 @@ void LLVertexBuffer::drawArrays(U32 mode, const std::vector<LLVector3>& pos) } gGL.end(); gGL.flush(); - } +} //static void LLVertexBuffer::drawElements(U32 mode, const LLVector4a* pos, const LLVector2* tc, S32 num_indices, const U16* indicesp) @@ -704,10 +652,20 @@ void LLVertexBuffer::drawArrays(U32 mode, U32 first, U32 count) const } //static -void LLVertexBuffer::initClass(bool use_vbo, bool no_vbo_mapping) +void LLVertexBuffer::initClass(LLWindow* window) { - sEnableVBOs = use_vbo; - sDisableVBOMapping = sEnableVBOs && no_vbo_mapping; + sEnableVBOs = true; + sDisableVBOMapping = true; + +#if ENABLE_GL_WORK_QUEUE + sQueue = new GLWorkQueue(); + + for (int i = 0; i < THREAD_COUNT; ++i) + { + sVBOThread[i] = new LLGLWorkerThread("VBO Worker", sQueue, window); + sVBOThread[i]->start(); + } +#endif } //static @@ -743,14 +701,19 @@ void LLVertexBuffer::cleanupClass() { unbind(); - sStreamIBOPool.cleanup(); - sDynamicIBOPool.cleanup(); - sStreamVBOPool.cleanup(); - sDynamicVBOPool.cleanup(); - sDynamicCopyVBOPool.cleanup(); - - llassert(0 == LLVBOPool::sBytesPooled); - llassert(0 == LLVBOPool::sIndexBytesPooled); +#if ENABLE_GL_WORK_QUEUE + sQueue->close(); + for (int i = 0; i < THREAD_COUNT; ++i) + { + sVBOThread[i]->shutdown(); + delete sVBOThread[i]; + sVBOThread[i] = nullptr; + } + + delete sQueue; + sQueue = nullptr; +#endif + //llassert(0 == sAllocatedBytes); //llassert(0 == sAllocatedIndexBytes); } @@ -781,21 +744,6 @@ S32 LLVertexBuffer::determineUsage(S32 usage) ret_usage = GL_STREAM_DRAW; } - if (ret_usage && ret_usage != GL_STREAM_DRAW) - { //only stream_draw and dynamic_draw are supported when using VBOs, dynamic draw is the default - if (ret_usage != GL_DYNAMIC_COPY) - { - if (sDisableVBOMapping) - { //always use stream draw if VBO mapping is disabled - ret_usage = GL_STREAM_DRAW; - } - else - { - ret_usage = GL_DYNAMIC_DRAW; - } - } - } - return ret_usage; } @@ -848,7 +796,7 @@ S32 LLVertexBuffer::calcOffsets(const U32& typemask, S32* offsets, S32 num_verti offsets[TYPE_TEXTURE_INDEX] = offsets[TYPE_VERTEX] + 12; - return offset+16; + return offset; } //static @@ -896,74 +844,101 @@ LLVertexBuffer::~LLVertexBuffer() //---------------------------------------------------------------------------- +// batch glGenBuffers +static GLuint gen_buffer() +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; + constexpr U32 pool_size = 4096; + + thread_local static GLuint sNamePool[pool_size]; + thread_local static U32 sIndex = 0; + + if (sIndex == 0) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("gen ibo"); + sIndex = pool_size; + glGenBuffers(pool_size, sNamePool); + } + + return sNamePool[--sIndex]; +} + +// batch glDeleteBuffers +static void release_buffer(U32 buff) +{ + LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; +#if 0 + + constexpr U32 pool_size = 4096; + + thread_local static GLuint sNamePool[pool_size]; + thread_local static U32 sIndex = 0; + + if (sIndex == pool_size) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("gen ibo"); + sIndex = 0; + glDeleteBuffers(pool_size, sNamePool); + } + + sNamePool[sIndex++] = buff; +#else + glDeleteBuffers(1, &buff); +#endif +} + void LLVertexBuffer::genBuffer(U32 size) { - mSize = vbo_block_size(size); + LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; - if (mUsage == GL_STREAM_DRAW) - { - mMappedData = sStreamVBOPool.allocate(mGLBuffer, mSize); - } - else if (mUsage == GL_DYNAMIC_DRAW) - { - mMappedData = sDynamicVBOPool.allocate(mGLBuffer, mSize); - } - else - { - mMappedData = sDynamicCopyVBOPool.allocate(mGLBuffer, mSize); - } - - - sGLCount++; + mSize = size; + mMappedData = (U8*) ll_aligned_malloc_16(size); + mGLBuffer = gen_buffer(); + + glBindBuffer(GL_ARRAY_BUFFER, mGLBuffer); + glBufferData(GL_ARRAY_BUFFER, mSize, nullptr, mUsage); + glBindBuffer(GL_ARRAY_BUFFER, 0); + + sGLCount++; } void LLVertexBuffer::genIndices(U32 size) { - mIndicesSize = vbo_block_size(size); + LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; + + mIndicesSize = size; + mMappedIndexData = (U8*) ll_aligned_malloc_16(size); + + mGLIndices = gen_buffer(); + + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, mGLIndices); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, mIndicesSize, nullptr, mUsage); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); - if (mUsage == GL_STREAM_DRAW) - { - mMappedIndexData = sStreamIBOPool.allocate(mGLIndices, mIndicesSize); - } - else - { - mMappedIndexData = sDynamicIBOPool.allocate(mGLIndices, mIndicesSize); - } - sGLCount++; } void LLVertexBuffer::releaseBuffer() { - if (mUsage == GL_STREAM_DRAW) - { - sStreamVBOPool.release(mGLBuffer, mMappedData, mSize); - } - else - { - sDynamicVBOPool.release(mGLBuffer, mMappedData, mSize); - } - - mGLBuffer = 0; - mMappedData = NULL; + LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; + release_buffer(mGLBuffer); + mGLBuffer = 0; + ll_aligned_free_16(mMappedData); + mMappedData = nullptr; + sGLCount--; } void LLVertexBuffer::releaseIndices() { - if (mUsage == GL_STREAM_DRAW) - { - sStreamIBOPool.release(mGLIndices, mMappedIndexData, mIndicesSize); - } - else - { - sDynamicIBOPool.release(mGLIndices, mMappedIndexData, mIndicesSize); - } + LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; + release_buffer(mGLIndices); + mGLIndices = 0; + + ll_aligned_free_16(mMappedIndexData); + mMappedIndexData = nullptr; - mGLIndices = 0; - mMappedIndexData = NULL; - sGLCount--; } @@ -1183,21 +1158,20 @@ bool LLVertexBuffer::useVBOs() const //---------------------------------------------------------------------------- -bool expand_region(LLVertexBuffer::MappedRegion& region, S32 index, S32 count) +// if no gap between region and given range exists, expand region to cover given range and return true +// otherwise return false +bool expand_region(LLVertexBuffer::MappedRegion& region, S32 start, S32 end) { - S32 end = index+count; - S32 region_end = region.mIndex+region.mCount; - if (end < region.mIndex || - index > region_end) + if (end < region.mStart || + start > region.mEnd) { //gap exists, do not merge return false; } - S32 new_end = llmax(end, region_end); - S32 new_index = llmin(index, region.mIndex); - region.mIndex = new_index; - region.mCount = new_end-new_index; + region.mStart = llmin(region.mStart, start); + region.mEnd = llmax(region.mEnd, end); + return true; } @@ -1206,7 +1180,6 @@ bool expand_region(LLVertexBuffer::MappedRegion& region, S32 index, S32 count) U8* LLVertexBuffer::mapVertexBuffer(S32 type, S32 index, S32 count, bool map_range) { LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; - bindGLBuffer(true); if (mFinal) { LL_ERRS() << "LLVertexBuffer::mapVeretxBuffer() called on a finalized buffer." << LL_ENDL; @@ -1215,34 +1188,34 @@ U8* LLVertexBuffer::mapVertexBuffer(S32 type, S32 index, S32 count, bool map_ran { LL_ERRS() << "LLVertexBuffer::mapVertexBuffer() called on unallocated buffer." << LL_ENDL; } - - if (useVBOs()) - { - if (count == -1) - { - count = mNumVerts-index; - } - bool mapped = false; - //see if range is already mapped + + if (useVBOs()) + { + if (count == -1) + { + count = mNumVerts - index; + } + + S32 start = mOffsets[type] + sTypeSize[type] * index; + S32 end = start + sTypeSize[type] * count; + + bool flagged = false; + // flag region as mapped for (U32 i = 0; i < mMappedVertexRegions.size(); ++i) { MappedRegion& region = mMappedVertexRegions[i]; - if (region.mType == type) - { - if (expand_region(region, index, count)) - { - mapped = true; - break; - } - } + if (expand_region(region, start, end)) + { + flagged = true; + break; + } } - if (!mapped) + if (!flagged) { - //not already mapped, map new region - MappedRegion region(type, index, count); - mMappedVertexRegions.push_back(region); + //didn't expand an existing region, make a new one + mMappedVertexRegions.push_back({ start, end }); } if (mVertexLocked && map_range) @@ -1282,7 +1255,6 @@ U8* LLVertexBuffer::mapVertexBuffer(S32 type, S32 index, S32 count, bool map_ran U8* LLVertexBuffer::mapIndexBuffer(S32 index, S32 count, bool map_range) { LL_PROFILE_ZONE_SCOPED_CATEGORY_VERTEX; - bindGLIndices(true); if (mFinal) { LL_ERRS() << "LLVertexBuffer::mapIndexBuffer() called on a finalized buffer." << LL_ENDL; @@ -1299,25 +1271,26 @@ U8* LLVertexBuffer::mapIndexBuffer(S32 index, S32 count, bool map_range) count = mNumIndices-index; } - bool mapped = false; - //see if range is already mapped - for (U32 i = 0; i < mMappedIndexRegions.size(); ++i) - { - MappedRegion& region = mMappedIndexRegions[i]; - if (expand_region(region, index, count)) - { - mapped = true; - break; - } - } + S32 start = sizeof(U16) * index; + S32 end = start + sizeof(U16) * count; - if (!mapped) - { - //not already mapped, map new region - MappedRegion region(TYPE_INDEX, index, count); - mMappedIndexRegions.push_back(region); - } - + bool flagged = false; + // flag region as mapped + for (U32 i = 0; i < mMappedIndexRegions.size(); ++i) + { + MappedRegion& region = mMappedIndexRegions[i]; + if (expand_region(region, start, end)) + { + flagged = true; + break; + } + } + + if (!flagged) + { + //didn't expand an existing region, make a new one + mMappedIndexRegions.push_back({ start, end }); + } if (mIndexLocked && map_range) { @@ -1360,6 +1333,27 @@ U8* LLVertexBuffer::mapIndexBuffer(S32 index, S32 count, bool map_range) return mMappedIndexData + sizeof(U16)*index; } +static void flush_vbo(GLenum target, S32 start, S32 end, void* data) +{ + if (end != 0) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData"); + LL_PROFILE_ZONE_NUM(start); + LL_PROFILE_ZONE_NUM(end); + LL_PROFILE_ZONE_NUM(end-start); + + constexpr S32 block_size = 65536; + + for (S32 i = start; i < end; i += block_size) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("glBufferSubData block"); + LL_PROFILE_GPU_ZONE("glBufferSubData"); + S32 tend = llmin(i + block_size, end); + glBufferSubData(target, i, tend - i, (U8*) data + (i-start)); + } + } +} + void LLVertexBuffer::unmapBuffer() { if (!useVBOs()) @@ -1377,37 +1371,31 @@ void LLVertexBuffer::unmapBuffer() if (!mMappedVertexRegions.empty()) { - stop_glerror(); + S32 start = 0; + S32 end = 0; + for (U32 i = 0; i < mMappedVertexRegions.size(); ++i) { const MappedRegion& region = mMappedVertexRegions[i]; - S32 offset = region.mIndex >= 0 ? mOffsets[region.mType]+sTypeSize[region.mType]*region.mIndex : 0; - S32 length = sTypeSize[region.mType]*region.mCount; - if (mSize >= length + offset) - { - glBufferSubData(GL_ARRAY_BUFFER, offset, length, (U8*)mMappedData + offset); - } - else - { - GLint size = 0; - glGetBufferParameteriv(GL_ARRAY_BUFFER, GL_BUFFER_SIZE, &size); - LL_WARNS() << "Attempted to map regions to a buffer that is too small, " - << "mapped size: " << mSize - << ", gl buffer size: " << size - << ", length: " << length - << ", offset: " << offset - << LL_ENDL; - } - stop_glerror(); + if (region.mStart == end + 1) + { + end = region.mEnd; + } + else + { + flush_vbo(GL_ARRAY_BUFFER, start, end, (U8*)mMappedData + start); + start = region.mStart; + end = region.mEnd; + } } + flush_vbo(GL_ARRAY_BUFFER, start, end, (U8*)mMappedData + start); + mMappedVertexRegions.clear(); } else { - stop_glerror(); - glBufferSubData(GL_ARRAY_BUFFER, 0, getSize(), (U8*) mMappedData); - stop_glerror(); + llassert(false); // this shouldn't happen -- a buffer must always be explicitly mapped } mVertexLocked = false; @@ -1421,36 +1409,31 @@ void LLVertexBuffer::unmapBuffer() if (!mMappedIndexRegions.empty()) { - for (U32 i = 0; i < mMappedIndexRegions.size(); ++i) - { - const MappedRegion& region = mMappedIndexRegions[i]; - S32 offset = region.mIndex >= 0 ? sizeof(U16)*region.mIndex : 0; - S32 length = sizeof(U16)*region.mCount; - if (mIndicesSize >= length + offset) - { - glBufferSubData(GL_ELEMENT_ARRAY_BUFFER, offset, length, (U8*) mMappedIndexData+offset); - } - else - { - GLint size = 0; - glGetBufferParameteriv(GL_ELEMENT_ARRAY_BUFFER, GL_BUFFER_SIZE, &size); - LL_WARNS() << "Attempted to map regions to a buffer that is too small, " - << "mapped size: " << mIndicesSize - << ", gl buffer size: " << size - << ", length: " << length - << ", offset: " << offset - << LL_ENDL; - } - stop_glerror(); - } + S32 start = 0; + S32 end = 0; + + for (U32 i = 0; i < mMappedIndexRegions.size(); ++i) + { + const MappedRegion& region = mMappedIndexRegions[i]; + if (region.mStart == end + 1) + { + end = region.mEnd; + } + else + { + flush_vbo(GL_ELEMENT_ARRAY_BUFFER, start, end, (U8*)mMappedIndexData + start); + start = region.mStart; + end = region.mEnd; + } + } + + flush_vbo(GL_ELEMENT_ARRAY_BUFFER, start, end, (U8*)mMappedIndexData + start); mMappedIndexRegions.clear(); } else { - stop_glerror(); - glBufferSubData(GL_ELEMENT_ARRAY_BUFFER, 0, getIndicesSize(), (U8*) mMappedIndexData); - stop_glerror(); + llassert(false); // this shouldn't happen -- a buffer must always be explicitly mapped } mIndexLocked = false; @@ -1640,11 +1623,53 @@ bool LLVertexBuffer::bindGLIndicesFast() return false; } -void LLVertexBuffer::flush() +void LLVertexBuffer::flush(bool discard) { if (useVBOs()) { - unmapBuffer(); + if (discard) + { // discard existing VBO data if the buffer must be updated + + if (!mMappedVertexRegions.empty()) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("flush discard vbo"); + LL_PROFILE_ZONE_NUM(mSize); + release_buffer(mGLBuffer); + mGLBuffer = gen_buffer(); + bindGLBuffer(); + { + LL_PROFILE_GPU_ZONE("glBufferData"); + glBufferData(GL_ARRAY_BUFFER, mSize, nullptr, mUsage); + + for (int i = 0; i < mSize; i += 65536) + { + LL_PROFILE_GPU_ZONE("glBufferSubData"); + S32 end = llmin(i + 65536, mSize); + S32 count = end - i; + glBufferSubData(GL_ARRAY_BUFFER, i, count, mMappedData + i); + } + } + mMappedVertexRegions.clear(); + } + if (!mMappedIndexRegions.empty()) + { + LL_PROFILE_ZONE_NAMED_CATEGORY_VERTEX("flush discard ibo"); + LL_PROFILE_ZONE_NUM(mIndicesSize); + release_buffer(mGLIndices); + mGLIndices = gen_buffer(); + bindGLIndices(); + { + LL_PROFILE_GPU_ZONE("glBufferData (ibo)"); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, mIndicesSize, mMappedIndexData, mUsage); + } + mMappedIndexRegions.clear(); + } + } + else + { + unmapBuffer(); + } + } } @@ -2043,12 +2068,24 @@ void LLVertexBuffer::setupVertexBufferFast(U32 data_mask) void* ptr = (void*)(base + mOffsets[TYPE_VERTEX]); glVertexAttribPointer(loc, 3, GL_FLOAT, GL_FALSE, LLVertexBuffer::sTypeSize[TYPE_VERTEX], ptr); } - } +} + +void LLVertexBuffer::setPositionData(const LLVector4a* data) +{ + bindGLBuffer(); + flush_vbo(GL_ARRAY_BUFFER, 0, sizeof(LLVector4a) * getNumVerts(), (U8*) data); +} -LLVertexBuffer::MappedRegion::MappedRegion(S32 type, S32 index, S32 count) -: mType(type), mIndex(index), mCount(count) -{ - mEnd = mIndex+mCount; -} +void LLVertexBuffer::setTexCoordData(const LLVector2* data) +{ + bindGLBuffer(); + flush_vbo(GL_ARRAY_BUFFER, mOffsets[TYPE_TEXCOORD0], mOffsets[TYPE_TEXCOORD0] + sTypeSize[TYPE_TEXCOORD0] * getNumVerts(), (U8*)data); +} + +void LLVertexBuffer::setColorData(const LLColor4U* data) +{ + bindGLBuffer(); + flush_vbo(GL_ARRAY_BUFFER, mOffsets[TYPE_COLOR], mOffsets[TYPE_COLOR] + sTypeSize[TYPE_COLOR] * getNumVerts(), (U8*) data); +} diff --git a/indra/llrender/llvertexbuffer.h b/indra/llrender/llvertexbuffer.h index bb7460fb2a..926d37b052 100644 --- a/indra/llrender/llvertexbuffer.h +++ b/indra/llrender/llvertexbuffer.h @@ -51,66 +51,15 @@ //============================================================================ -// gl name pools for dynamic and streaming buffers -class LLVBOPool -{ -public: - static U32 sBytesPooled; - static U32 sIndexBytesPooled; - - LLVBOPool(U32 vboUsage, U32 vboType); - - const U32 mUsage; - const U32 mType; - - //size MUST be a power of 2 - U8* allocate(U32& name, U32 size, bool for_seed = false); - - //size MUST be the size provided to allocate that returned the given name - void release(U32 name, U8* buffer, U32 size); - - //batch allocate buffers to be provided to the application on demand - void seedPool(); - - //destroy all records in mFreeList - void cleanup(); - - U32 genBuffer(); - void deleteBuffer(U32 name); - - class Record - { - public: - U32 mGLName; - U8* mClientData; - }; - - typedef std::list<Record> record_list_t; - std::vector<record_list_t> mFreeList; - std::vector<U32> mMissCount; - bool mMissCountDirty; // flag any changes to mFreeList or mMissCount - - //used to avoid calling glGenBuffers for every VBO creation - static U32 sNamePool[1024]; - static U32 sNameIdx; -}; - - -//============================================================================ // base class class LLPrivateMemoryPool; class LLVertexBuffer : public LLRefCount { public: - class MappedRegion + struct MappedRegion { - public: - S32 mType; - S32 mIndex; - S32 mCount; - S32 mEnd; - - MappedRegion(S32 type, S32 index, S32 count); + S32 mStart; + S32 mEnd; }; LLVertexBuffer(const LLVertexBuffer& rhs) @@ -125,12 +74,6 @@ public: return *this; } - static LLVBOPool sStreamVBOPool; - static LLVBOPool sDynamicVBOPool; - static LLVBOPool sDynamicCopyVBOPool; - static LLVBOPool sStreamIBOPool; - static LLVBOPool sDynamicIBOPool; - static std::list<U32> sAvailableVAOName; static U32 sCurVAOName; @@ -138,12 +81,10 @@ public: static bool sUseVAO; static bool sPreferStreamDraw; - static void seedPools(); - static U32 getVAOName(); static void releaseVAOName(U32 name); - static void initClass(bool use_vbo, bool no_vbo_mapping); + static void initClass(LLWindow* window); static void cleanupClass(); static void setupClientArrays(U32 data_mask); static void drawArrays(U32 mode, const std::vector<LLVector3>& pos); @@ -240,7 +181,7 @@ public: virtual void setBuffer(U32 data_mask); // calls setupVertexBuffer() if data_mask is not 0 void setBufferFast(U32 data_mask); // calls setupVertexBufferFast(), assumes data_mask is not 0 among other assumptions - void flush(); //flush pending data to GL memory + void flush(bool discard = false); //flush pending data to GL memory, if discard is true, discard previous VBO // allocate buffer bool allocateBuffer(S32 nverts, S32 nindices, bool create); virtual bool resizeBuffer(S32 newnverts, S32 newnindices); @@ -271,6 +212,10 @@ public: bool getMetallicRoughnessTexcoordStrider(LLStrider<LLVector2>& strider, S32 index=0, S32 count = -1, bool map_range = false); bool getEmissiveTexcoordStrider(LLStrider<LLVector2>& strider, S32 index=0, S32 count = -1, bool map_range = false); + void setPositionData(const LLVector4a* data); + void setTexCoordData(const LLVector2* data); + void setColorData(const LLColor4U* data); + bool useVBOs() const; bool isEmpty() const { return mEmpty; } diff --git a/indra/newview/llviewerdisplay.cpp b/indra/newview/llviewerdisplay.cpp index 01fca47184..c6d2b476db 100644 --- a/indra/newview/llviewerdisplay.cpp +++ b/indra/newview/llviewerdisplay.cpp @@ -710,12 +710,6 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot) if (!for_snapshot) { - if (gFrameCount > 1) - { //for some reason, ATI 4800 series will error out if you - //try to generate a shadow before the first frame is through - gPipeline.generateSunShadow(*LLViewerCamera::getInstance()); - } - LLVertexBuffer::unbind(); LLGLState::checkStates(); @@ -936,8 +930,7 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot) else { gPipeline.renderGeom(*LLViewerCamera::getInstance(), TRUE); - } - + } gGL.setColorMask(true, true); //store this frame's modelview matrix for use @@ -967,6 +960,14 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot) LLRenderTarget &rt = (gPipeline.sRenderDeferred ? gPipeline.mRT->deferredScreen : gPipeline.mRT->screen); rt.flush(); + + if (gFrameCount > 1 && !for_snapshot) + { //for some reason, ATI 4800 series will error out if you + //try to generate a shadow before the first frame is through + gPipeline.generateSunShadow(*LLViewerCamera::getInstance()); + } + + if (LLPipeline::sRenderDeferred) { gPipeline.renderDeferredLighting(); diff --git a/indra/newview/llviewerwindow.cpp b/indra/newview/llviewerwindow.cpp index bc4f00bd3f..5848cbfd9d 100644 --- a/indra/newview/llviewerwindow.cpp +++ b/indra/newview/llviewerwindow.cpp @@ -658,12 +658,6 @@ public: } - addText(xpos, ypos, llformat("%d MB Index Data (%d MB Pooled, %d KIndices)", LLVertexBuffer::sAllocatedIndexBytes/(1024*1024), LLVBOPool::sIndexBytesPooled/(1024*1024), LLVertexBuffer::sIndexCount/1024)); - ypos += y_inc; - - addText(xpos, ypos, llformat("%d MB Vertex Data (%d MB Pooled, %d KVerts)", LLVertexBuffer::sAllocatedBytes/(1024*1024), LLVBOPool::sBytesPooled/(1024*1024), LLVertexBuffer::sVertexCount/1024)); - ypos += y_inc; - addText(xpos, ypos, llformat("%d Vertex Buffers", LLVertexBuffer::sGLCount)); ypos += y_inc; @@ -1974,7 +1968,7 @@ LLViewerWindow::LLViewerWindow(const Params& p) LL_DEBUGS("Window") << "Loading feature tables." << LL_ENDL; // Initialize OpenGL Renderer - LLVertexBuffer::initClass(gSavedSettings.getBOOL("RenderVBOEnable"), gSavedSettings.getBOOL("RenderVBOMappingDisable")); + LLVertexBuffer::initClass(mWindow); LL_INFOS("RenderInit") << "LLVertexBuffer initialization done." << LL_ENDL ; gGL.init(true); diff --git a/indra/newview/pipeline.cpp b/indra/newview/pipeline.cpp index 832c7b8b43..6ac059df91 100644 --- a/indra/newview/pipeline.cpp +++ b/indra/newview/pipeline.cpp @@ -2336,6 +2336,7 @@ static LLTrace::BlockTimerStatHandle FTM_CULL("Object Culling"); void LLPipeline::updateCull(LLCamera& camera, LLCullResult& result) { LL_PROFILE_ZONE_SCOPED_CATEGORY_PIPELINE; //LL_RECORD_BLOCK_TIME(FTM_CULL); + LL_PROFILE_GPU_ZONE("updateCull"); // should always be zero GPU time, but drop a timer to flush stuff out bool water_clip = !sRenderTransparentWater; @@ -2647,10 +2648,6 @@ void LLPipeline::updateGL() LLGLUpdate::sGLQ.pop_front(); } } - - { //seed VBO Pools - LLVertexBuffer::seedPools(); - } } void LLPipeline::clearRebuildGroups() @@ -3227,6 +3224,7 @@ void LLPipeline::markRebuild(LLDrawable *drawablep, LLDrawable::EDrawableFlags f void LLPipeline::stateSort(LLCamera& camera, LLCullResult &result) { LL_PROFILE_ZONE_SCOPED_CATEGORY_PIPELINE; + LL_PROFILE_GPU_ZONE("stateSort"); if (hasAnyRenderType(LLPipeline::RENDER_TYPE_AVATAR, LLPipeline::RENDER_TYPE_CONTROL_AV, @@ -3835,6 +3833,7 @@ void LLPipeline::postSort(LLCamera &camera) // flush particle VB if (LLVOPartGroup::sVB) { + LL_PROFILE_GPU_ZONE("flush particle vb"); LLVOPartGroup::sVB->flush(); } else @@ -3858,9 +3857,12 @@ void LLPipeline::postSort(LLCamera &camera) }*/ // pack vertex buffers for groups that chose to delay their updates - for (LLSpatialGroup::sg_vector_t::iterator iter = mMeshDirtyGroup.begin(); iter != mMeshDirtyGroup.end(); ++iter) { - (*iter)->rebuildMesh(); + LL_PROFILE_GPU_ZONE("rebuildMesh"); + for (LLSpatialGroup::sg_vector_t::iterator iter = mMeshDirtyGroup.begin(); iter != mMeshDirtyGroup.end(); ++iter) + { + (*iter)->rebuildMesh(); + } } /*if (use_transform_feedback) @@ -7257,8 +7259,6 @@ void LLPipeline::doResetVertexBuffers(bool forced) LLVOPartGroup::destroyGL(); gGL.resetVertexBuffer(); - SUBSYSTEM_CLEANUP(LLVertexBuffer); - if (LLVertexBuffer::sGLCount != 0) { LL_WARNS() << "VBO wipe failed -- " << LLVertexBuffer::sGLCount << " buffers remaining." << LL_ENDL; @@ -7278,7 +7278,6 @@ void LLPipeline::doResetVertexBuffers(bool forced) sNoAlpha = gSavedSettings.getBOOL("RenderNoAlpha"); LLPipeline::sTextureBindTest = gSavedSettings.getBOOL("RenderDebugTextureBind"); - LLVertexBuffer::initClass(LLVertexBuffer::sEnableVBOs, LLVertexBuffer::sDisableVBOMapping); gGL.initVertexBuffer(); mDeferredVB = new LLVertexBuffer(DEFERRED_VB_MASK, 0); |