diff options
author | Nat Goodspeed <nat@lindenlab.com> | 2024-10-22 11:34:46 -0400 |
---|---|---|
committer | Nat Goodspeed <nat@lindenlab.com> | 2024-10-22 11:34:46 -0400 |
commit | b0645835595f3517223329ba62f46277d3e3a9dd (patch) | |
tree | a2b523c3beff518dcbec55663185d6ff70ae5097 /indra/llcommon/coro_scheduler.cpp | |
parent | 8e737a5d45157426cb84927170a78baea2106813 (diff) |
Make llcoro::scheduler log coros that run too long between yields.
Introduce LLCoros::CoroData::mHistogram, a map of cutoff times (bucket
breakpoints) with counts of occurrences. The idea is that mHistogram counts
how many times the real time taken by a particular coroutine resumption falls
into one of those buckets. Initialize the map with guessed buckets; these are
set in llcoros.cpp so they can be changed without requiring extensive rebuilds.
scheduler::pick_next() now records the timestamp and fiber context just before
the fiber manager resumes the next coroutine. If the next pick_next() call
reveals that the previous resumption took longer than the minimum bucket
breakpoint, it increments the appropriate bucket counter and logs the instance.
LLCoros::toplevel() reports nonzero mHistogram entries on coroutine
termination.
Diffstat (limited to 'indra/llcommon/coro_scheduler.cpp')
-rw-r--r-- | indra/llcommon/coro_scheduler.cpp | 84 |
1 files changed, 66 insertions, 18 deletions
diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp index 2d8b6e1a97..93efac7c7e 100644 --- a/indra/llcommon/coro_scheduler.cpp +++ b/indra/llcommon/coro_scheduler.cpp @@ -20,6 +20,7 @@ #include <boost/fiber/operations.hpp> // other Linden headers #include "llcallbacklist.h" +#include "llcoros.h" #include "lldate.h" #include "llerror.h" @@ -56,17 +57,56 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept boost::fibers::context* scheduler::pick_next() noexcept { + auto now = LLDate::now().secondsSinceEpoch(); // count calls to pick_next() ++mSwitches; // pick_next() is called when the previous fiber has suspended, and we // need to pick another. Did the previous pick_next() call pick the main - // fiber? If so, it's the main fiber that just suspended. - auto now = LLDate::now().secondsSinceEpoch(); - if (mMainRunning) + // fiber? (Or is this the first pick_next() call?) If so, it's the main + // fiber that just suspended. + if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID) { - mMainRunning = false; mMainLast = now; } + else + { + // How long did we spend in the fiber that just suspended? + // Don't bother with long runs of the main fiber, since (a) it happens + // pretty often and (b) it's moderately likely that we've reached here + // from the canonical yield at the top of mainloop, and what we'd want + // to know about is whatever the main fiber was doing in the + // *previous* iteration of mainloop. + F64 elapsed{ now - mResumeTime }; + LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) }; + // Find iterator to the first mHistogram key greater than elapsed. + auto past = data.mHistogram.upper_bound(elapsed); + // If the smallest key (mHistogram.begin()->first) is greater than + // elapsed, then we need not bother with this timeslice. + if (past != data.mHistogram.begin()) + { + // Here elapsed was greater than at least one key. Back off to the + // previous entry and increment that count. If it's end(), backing + // off gets us the last entry -- assuming mHistogram isn't empty. + llassert(! data.mHistogram.empty()); + ++(--past)->second; + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // make sure the queue exists + if (queue) + { + // If it proves difficult to track down *why* the fiber spent so + // much time, consider also binding and reporting + // boost::stacktrace::stacktrace(). + queue->post( + [name=std::move(data.getName()), elapsed] + { + LL_WARNS_ONCE("LLCoros.scheduler") + << "Coroutine " << name << " ran for " + << elapsed << " seconds" << LL_ENDL; + + }); + } + } + } boost::fibers::context* next; @@ -96,17 +136,9 @@ boost::fibers::context* scheduler::pick_next() noexcept // passage could be skipped. // Record this event for logging, but push it off to a thread pool to - // perform that work. Presumably std::weak_ptr::lock() is cheaper than - // WorkQueue::getInstance(). - LL::WorkQueue::ptr_t queue{ mQueue.lock() }; - // We probably started before the relevant WorkQueue was created. - if (! queue) - { - // Try again to locate the specified WorkQueue. - queue = LL::WorkQueue::getInstance(qname); - mQueue = queue; - } - // Both the lock() call and the getInstance() call might have failed. + // perform that work. + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // The work queue we're looking for might not exist right now. if (queue) { // Bind values. Do NOT bind 'this' to avoid cross-thread access! @@ -116,7 +148,6 @@ boost::fibers::context* scheduler::pick_next() noexcept // so we have no access. queue->post( [switches=mSwitches, start=mStart, elapsed, now] - () { U32 runtime(U32(now) - U32(start)); U32 minutes(runtime / 60u); @@ -150,12 +181,29 @@ boost::fibers::context* scheduler::pick_next() noexcept { // we're about to resume the main fiber: it's no longer "ready" mMainCtx = nullptr; - // instead, it's "running" - mMainRunning = true; } + mPrevCtx = next; + // remember when we resumed this fiber so our next call can measure how + // long the previous resumption was + mResumeTime = LLDate::now().secondsSinceEpoch(); return next; } +LL::WorkQueue::ptr_t scheduler::getWorkQueue() +{ + // Cache a weak_ptr to our target work queue, presuming that + // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance(). + LL::WorkQueue::ptr_t queue{ mQueue.lock() }; + // We probably started before the relevant WorkQueue was created. + if (! queue) + { + // Try again to locate the specified WorkQueue. + queue = LL::WorkQueue::getInstance(qname); + mQueue = queue; + } + return queue; +} + void scheduler::use() { boost::fibers::use_scheduling_algorithm<scheduler>(); |