From b0645835595f3517223329ba62f46277d3e3a9dd Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Tue, 22 Oct 2024 11:34:46 -0400 Subject: Make llcoro::scheduler log coros that run too long between yields. Introduce LLCoros::CoroData::mHistogram, a map of cutoff times (bucket breakpoints) with counts of occurrences. The idea is that mHistogram counts how many times the real time taken by a particular coroutine resumption falls into one of those buckets. Initialize the map with guessed buckets; these are set in llcoros.cpp so they can be changed without requiring extensive rebuilds. scheduler::pick_next() now records the timestamp and fiber context just before the fiber manager resumes the next coroutine. If the next pick_next() call reveals that the previous resumption took longer than the minimum bucket breakpoint, it increments the appropriate bucket counter and logs the instance. LLCoros::toplevel() reports nonzero mHistogram entries on coroutine termination. --- indra/llcommon/coro_scheduler.cpp | 84 ++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 18 deletions(-) (limited to 'indra/llcommon/coro_scheduler.cpp') diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp index 2d8b6e1a97..93efac7c7e 100644 --- a/indra/llcommon/coro_scheduler.cpp +++ b/indra/llcommon/coro_scheduler.cpp @@ -20,6 +20,7 @@ #include // other Linden headers #include "llcallbacklist.h" +#include "llcoros.h" #include "lldate.h" #include "llerror.h" @@ -56,17 +57,56 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept boost::fibers::context* scheduler::pick_next() noexcept { + auto now = LLDate::now().secondsSinceEpoch(); // count calls to pick_next() ++mSwitches; // pick_next() is called when the previous fiber has suspended, and we // need to pick another. Did the previous pick_next() call pick the main - // fiber? If so, it's the main fiber that just suspended. - auto now = LLDate::now().secondsSinceEpoch(); - if (mMainRunning) + // fiber? (Or is this the first pick_next() call?) If so, it's the main + // fiber that just suspended. + if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID) { - mMainRunning = false; mMainLast = now; } + else + { + // How long did we spend in the fiber that just suspended? + // Don't bother with long runs of the main fiber, since (a) it happens + // pretty often and (b) it's moderately likely that we've reached here + // from the canonical yield at the top of mainloop, and what we'd want + // to know about is whatever the main fiber was doing in the + // *previous* iteration of mainloop. + F64 elapsed{ now - mResumeTime }; + LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) }; + // Find iterator to the first mHistogram key greater than elapsed. + auto past = data.mHistogram.upper_bound(elapsed); + // If the smallest key (mHistogram.begin()->first) is greater than + // elapsed, then we need not bother with this timeslice. + if (past != data.mHistogram.begin()) + { + // Here elapsed was greater than at least one key. Back off to the + // previous entry and increment that count. If it's end(), backing + // off gets us the last entry -- assuming mHistogram isn't empty. + llassert(! data.mHistogram.empty()); + ++(--past)->second; + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // make sure the queue exists + if (queue) + { + // If it proves difficult to track down *why* the fiber spent so + // much time, consider also binding and reporting + // boost::stacktrace::stacktrace(). + queue->post( + [name=std::move(data.getName()), elapsed] + { + LL_WARNS_ONCE("LLCoros.scheduler") + << "Coroutine " << name << " ran for " + << elapsed << " seconds" << LL_ENDL; + + }); + } + } + } boost::fibers::context* next; @@ -96,17 +136,9 @@ boost::fibers::context* scheduler::pick_next() noexcept // passage could be skipped. // Record this event for logging, but push it off to a thread pool to - // perform that work. Presumably std::weak_ptr::lock() is cheaper than - // WorkQueue::getInstance(). - LL::WorkQueue::ptr_t queue{ mQueue.lock() }; - // We probably started before the relevant WorkQueue was created. - if (! queue) - { - // Try again to locate the specified WorkQueue. - queue = LL::WorkQueue::getInstance(qname); - mQueue = queue; - } - // Both the lock() call and the getInstance() call might have failed. + // perform that work. + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // The work queue we're looking for might not exist right now. if (queue) { // Bind values. Do NOT bind 'this' to avoid cross-thread access! @@ -116,7 +148,6 @@ boost::fibers::context* scheduler::pick_next() noexcept // so we have no access. queue->post( [switches=mSwitches, start=mStart, elapsed, now] - () { U32 runtime(U32(now) - U32(start)); U32 minutes(runtime / 60u); @@ -150,12 +181,29 @@ boost::fibers::context* scheduler::pick_next() noexcept { // we're about to resume the main fiber: it's no longer "ready" mMainCtx = nullptr; - // instead, it's "running" - mMainRunning = true; } + mPrevCtx = next; + // remember when we resumed this fiber so our next call can measure how + // long the previous resumption was + mResumeTime = LLDate::now().secondsSinceEpoch(); return next; } +LL::WorkQueue::ptr_t scheduler::getWorkQueue() +{ + // Cache a weak_ptr to our target work queue, presuming that + // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance(). + LL::WorkQueue::ptr_t queue{ mQueue.lock() }; + // We probably started before the relevant WorkQueue was created. + if (! queue) + { + // Try again to locate the specified WorkQueue. + queue = LL::WorkQueue::getInstance(qname); + mQueue = queue; + } + return queue; +} + void scheduler::use() { boost::fibers::use_scheduling_algorithm(); -- cgit v1.2.3