path: root/indra/llcommon/coro_scheduler.cpp
author	Nat Goodspeed <nat@lindenlab.com>	2024-10-22 11:34:46 -0400
committer	Nat Goodspeed <nat@lindenlab.com>	2024-10-22 11:34:46 -0400
commit	b0645835595f3517223329ba62f46277d3e3a9dd (patch)
tree	a2b523c3beff518dcbec55663185d6ff70ae5097 /indra/llcommon/coro_scheduler.cpp
parent	8e737a5d45157426cb84927170a78baea2106813 (diff)
Make llcoro::scheduler log coros that run too long between yields.
Introduce LLCoros::CoroData::mHistogram, a map of cutoff times (bucket breakpoints) with counts of occurrences. The idea is that mHistogram counts how many times the real time taken by a particular coroutine resumption falls into one of those buckets. Initialize the map with guessed buckets; these are set in llcoros.cpp so they can be changed without requiring extensive rebuilds.

scheduler::pick_next() now records the timestamp and fiber context just before the fiber manager resumes the next coroutine. If the next pick_next() call reveals that the previous resumption took longer than the minimum bucket breakpoint, it increments the appropriate bucket counter and logs the instance.

LLCoros::toplevel() reports nonzero mHistogram entries on coroutine termination.
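For illustration, the bucket map described above might be declared roughly as follows. This is a minimal sketch: the breakpoint values shown are placeholders, not the guesses actually set in llcoros.cpp.

    // Hypothetical shape of LLCoros::CoroData::mHistogram: an ordered map
    // from bucket breakpoint (seconds) to a count of resumptions whose
    // elapsed time reached that breakpoint. Breakpoint values are invented
    // here for illustration.
    std::map<F64, U32> mHistogram{
        { 0.004, 0 },   // resumptions of at least 4ms
        { 0.040, 0 },   // at least 40ms
        { 0.400, 0 }    // at least 400ms
    };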
Diffstat (limited to 'indra/llcommon/coro_scheduler.cpp')
-rw-r--r--	indra/llcommon/coro_scheduler.cpp	84
1 file changed, 66 insertions(+), 18 deletions(-)
diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp
index 2d8b6e1a97..93efac7c7e 100644
--- a/indra/llcommon/coro_scheduler.cpp
+++ b/indra/llcommon/coro_scheduler.cpp
@@ -20,6 +20,7 @@
 #include <boost/fiber/operations.hpp>
 // other Linden headers
 #include "llcallbacklist.h"
+#include "llcoros.h"
 #include "lldate.h"
 #include "llerror.h"
@@ -56,17 +57,56 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept
 boost::fibers::context* scheduler::pick_next() noexcept
 {
+    auto now = LLDate::now().secondsSinceEpoch();
     // count calls to pick_next()
     ++mSwitches;
     // pick_next() is called when the previous fiber has suspended, and we
     // need to pick another. Did the previous pick_next() call pick the main
-    // fiber? If so, it's the main fiber that just suspended.
-    auto now = LLDate::now().secondsSinceEpoch();
-    if (mMainRunning)
+    // fiber? (Or is this the first pick_next() call?) If so, it's the main
+    // fiber that just suspended.
+    if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID)
     {
-        mMainRunning = false;
         mMainLast = now;
     }
+    else
+    {
+        // How long did we spend in the fiber that just suspended?
+        // Don't bother with long runs of the main fiber, since (a) it happens
+        // pretty often and (b) it's moderately likely that we've reached here
+        // from the canonical yield at the top of mainloop, and what we'd want
+        // to know about is whatever the main fiber was doing in the
+        // *previous* iteration of mainloop.
+        F64 elapsed{ now - mResumeTime };
+        LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) };
+        // Find iterator to the first mHistogram key greater than elapsed.
+        auto past = data.mHistogram.upper_bound(elapsed);
+        // If the smallest key (mHistogram.begin()->first) is greater than
+        // elapsed, then we need not bother with this timeslice.
+        if (past != data.mHistogram.begin())
+        {
+            // Here elapsed was greater than at least one key. Back off to the
+            // previous entry and increment that count. If it's end(), backing
+            // off gets us the last entry -- assuming mHistogram isn't empty.
+            llassert(! data.mHistogram.empty());
+            ++(--past)->second;
+            LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+            // make sure the queue exists
+            if (queue)
+            {
+                // If it proves difficult to track down *why* the fiber spent so
+                // much time, consider also binding and reporting
+                // boost::stacktrace::stacktrace().
+                queue->post(
+                    [name=std::move(data.getName()), elapsed]
+                    {
+                        LL_WARNS_ONCE("LLCoros.scheduler")
+                            << "Coroutine " << name << " ran for "
+                            << elapsed << " seconds" << LL_ENDL;
+                    });
+            }
+        }
+    }
     boost::fibers::context* next;
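The upper_bound() arithmetic in the hunk above is compact, so here is the same bucket-selection logic in isolation, as a self-contained sketch using plain standard-library types; record() is a name invented for this example, not part of the viewer code.

    #include <cassert>
    #include <map>

    using F64 = double;
    using U32 = unsigned int;

    // Increment the histogram bucket for a timeslice of 'elapsed' seconds.
    void record(std::map<F64, U32>& histogram, F64 elapsed)
    {
        // upper_bound() returns the first entry whose key is strictly
        // greater than elapsed: begin() if elapsed is below every
        // breakpoint, end() if it is above them all.
        auto past = histogram.upper_bound(elapsed);
        if (past != histogram.begin())
        {
            // elapsed reached at least one breakpoint. Backing off one entry
            // lands on the largest key <= elapsed; this also handles
            // past == end(), which backs off to the last entry.
            assert(! histogram.empty());
            ++(--past)->second;
        }
        // else: elapsed is below the smallest breakpoint -- not worth
        // recording.
    }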
@@ -96,17 +136,9 @@ boost::fibers::context* scheduler::pick_next() noexcept
         // passage could be skipped.
         // Record this event for logging, but push it off to a thread pool to
-        // perform that work. Presumably std::weak_ptr::lock() is cheaper than
-        // WorkQueue::getInstance().
-        LL::WorkQueue::ptr_t queue{ mQueue.lock() };
-        // We probably started before the relevant WorkQueue was created.
-        if (! queue)
-        {
-            // Try again to locate the specified WorkQueue.
-            queue = LL::WorkQueue::getInstance(qname);
-            mQueue = queue;
-        }
-        // Both the lock() call and the getInstance() call might have failed.
+        // perform that work.
+        LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+        // The work queue we're looking for might not exist right now.
         if (queue)
         {
             // Bind values. Do NOT bind 'this' to avoid cross-thread access!
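getWorkQueue(), defined in the final hunk below, now owns the lookup-and-cache logic removed here. The underlying pattern, caching a weak_ptr so the common path skips a by-name registry lookup, is general; this standalone sketch uses a toy getInstance() in place of LL::WorkQueue::getInstance().

    #include <memory>
    #include <string>

    // Toy registry lookup standing in for LL::WorkQueue::getInstance();
    // in real code this might also return nullptr if no such queue exists.
    std::shared_ptr<int> getInstance(const std::string&)
    {
        static auto instance = std::make_shared<int>(0);
        return instance;
    }

    struct QueueCache
    {
        std::weak_ptr<int> mQueue;

        std::shared_ptr<int> get(const std::string& name)
        {
            // Fast path: lock() the cached weak_ptr, presumed cheaper than
            // a registry lookup by name.
            auto queue = mQueue.lock();
            if (! queue)
            {
                // Slow path: the target may not have existed when we first
                // looked; try the registry again and re-cache the result.
                queue = getInstance(name);
                mQueue = queue;
            }
            // Still possibly null if the target doesn't exist yet.
            return queue;
        }
    };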
@@ -116,7 +148,6 @@ boost::fibers::context* scheduler::pick_next() noexcept
             // so we have no access.
             queue->post(
                 [switches=mSwitches, start=mStart, elapsed, now]
-                ()
                 {
                     U32 runtime(U32(now) - U32(start));
                     U32 minutes(runtime / 60u);
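The capture list above follows the rule in the comment: the posted lambda carries copies of the values it reports, never 'this', because it will run on another thread with no synchronization against the scheduler. A standalone sketch of that pattern; the Scheduler and post() here are stand-ins invented for illustration.

    #include <functional>
    #include <iostream>

    // Stand-in for LL::WorkQueue::post(): runs the callable later,
    // potentially on another thread.
    void post(std::function<void()> work) { work(); }

    struct Scheduler
    {
        unsigned mSwitches{ 0 };

        void report()
        {
            // Snapshot the member by value at post time. Capturing 'this'
            // instead would let the worker read mSwitches while this thread
            // keeps mutating it.
            post([switches = mSwitches]
                 { std::cout << "switches=" << switches << '\n'; });
        }
    };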
@@ -150,12 +181,29 @@ boost::fibers::context* scheduler::pick_next() noexcept
     {
         // we're about to resume the main fiber: it's no longer "ready"
         mMainCtx = nullptr;
-        // instead, it's "running"
-        mMainRunning = true;
     }
+    mPrevCtx = next;
+    // remember when we resumed this fiber so our next call can measure how
+    // long the previous resumption was
+    mResumeTime = LLDate::now().secondsSinceEpoch();
     return next;
 }
+LL::WorkQueue::ptr_t scheduler::getWorkQueue()
+{
+    // Cache a weak_ptr to our target work queue, presuming that
+    // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance().
+    LL::WorkQueue::ptr_t queue{ mQueue.lock() };
+    // We probably started before the relevant WorkQueue was created.
+    if (! queue)
+    {
+        // Try again to locate the specified WorkQueue.
+        queue = LL::WorkQueue::getInstance(qname);
+        mQueue = queue;
+    }
+    return queue;
+}
+
 void scheduler::use()
 {
     boost::fibers::use_scheduling_algorithm<scheduler>();
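Taken together, each pick_next() call both closes the previous timeslice (reading mPrevCtx and mResumeTime at the top) and opens the next one (writing them just before returning the fiber to resume). A condensed, self-contained model of that handshake; Fiber and the names here are simplified stand-ins for the boost::fibers types, not the viewer's.

    #include <chrono>
    #include <cstdio>

    struct Fiber { const char* name; };

    struct SchedulerModel
    {
        Fiber* mPrevCtx{ nullptr };
        double mResumeTime{ 0.0 };

        static double now()
        {
            using namespace std::chrono;
            return duration<double>(
                steady_clock::now().time_since_epoch()).count();
        }

        Fiber* pickNext(Fiber* next)
        {
            // Close the previous timeslice: the fiber picked last time ran
            // from mResumeTime until this call.
            if (mPrevCtx)
            {
                std::printf("%s ran for %f seconds\n",
                            mPrevCtx->name, now() - mResumeTime);
            }
            // Open the next timeslice just before resuming 'next'.
            mPrevCtx = next;
            mResumeTime = now();
            return next;
        }
    };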