diff options
-rw-r--r-- | indra/llcommon/coro_scheduler.cpp | 84 | ||||
-rw-r--r-- | indra/llcommon/coro_scheduler.h | 15 | ||||
-rw-r--r-- | indra/llcommon/llcoros.cpp | 37 | ||||
-rw-r--r-- | indra/llcommon/llcoros.h | 8 |
4 files changed, 119 insertions, 25 deletions
diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp index 2d8b6e1a97..93efac7c7e 100644 --- a/indra/llcommon/coro_scheduler.cpp +++ b/indra/llcommon/coro_scheduler.cpp @@ -20,6 +20,7 @@ #include <boost/fiber/operations.hpp> // other Linden headers #include "llcallbacklist.h" +#include "llcoros.h" #include "lldate.h" #include "llerror.h" @@ -56,17 +57,56 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept boost::fibers::context* scheduler::pick_next() noexcept { + auto now = LLDate::now().secondsSinceEpoch(); // count calls to pick_next() ++mSwitches; // pick_next() is called when the previous fiber has suspended, and we // need to pick another. Did the previous pick_next() call pick the main - // fiber? If so, it's the main fiber that just suspended. - auto now = LLDate::now().secondsSinceEpoch(); - if (mMainRunning) + // fiber? (Or is this the first pick_next() call?) If so, it's the main + // fiber that just suspended. + if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID) { - mMainRunning = false; mMainLast = now; } + else + { + // How long did we spend in the fiber that just suspended? + // Don't bother with long runs of the main fiber, since (a) it happens + // pretty often and (b) it's moderately likely that we've reached here + // from the canonical yield at the top of mainloop, and what we'd want + // to know about is whatever the main fiber was doing in the + // *previous* iteration of mainloop. + F64 elapsed{ now - mResumeTime }; + LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) }; + // Find iterator to the first mHistogram key greater than elapsed. + auto past = data.mHistogram.upper_bound(elapsed); + // If the smallest key (mHistogram.begin()->first) is greater than + // elapsed, then we need not bother with this timeslice. + if (past != data.mHistogram.begin()) + { + // Here elapsed was greater than at least one key. Back off to the + // previous entry and increment that count. If it's end(), backing + // off gets us the last entry -- assuming mHistogram isn't empty. + llassert(! data.mHistogram.empty()); + ++(--past)->second; + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // make sure the queue exists + if (queue) + { + // If it proves difficult to track down *why* the fiber spent so + // much time, consider also binding and reporting + // boost::stacktrace::stacktrace(). + queue->post( + [name=std::move(data.getName()), elapsed] + { + LL_WARNS_ONCE("LLCoros.scheduler") + << "Coroutine " << name << " ran for " + << elapsed << " seconds" << LL_ENDL; + + }); + } + } + } boost::fibers::context* next; @@ -96,17 +136,9 @@ boost::fibers::context* scheduler::pick_next() noexcept // passage could be skipped. // Record this event for logging, but push it off to a thread pool to - // perform that work. Presumably std::weak_ptr::lock() is cheaper than - // WorkQueue::getInstance(). - LL::WorkQueue::ptr_t queue{ mQueue.lock() }; - // We probably started before the relevant WorkQueue was created. - if (! queue) - { - // Try again to locate the specified WorkQueue. - queue = LL::WorkQueue::getInstance(qname); - mQueue = queue; - } - // Both the lock() call and the getInstance() call might have failed. + // perform that work. + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // The work queue we're looking for might not exist right now. if (queue) { // Bind values. Do NOT bind 'this' to avoid cross-thread access! @@ -116,7 +148,6 @@ boost::fibers::context* scheduler::pick_next() noexcept // so we have no access. queue->post( [switches=mSwitches, start=mStart, elapsed, now] - () { U32 runtime(U32(now) - U32(start)); U32 minutes(runtime / 60u); @@ -150,12 +181,29 @@ boost::fibers::context* scheduler::pick_next() noexcept { // we're about to resume the main fiber: it's no longer "ready" mMainCtx = nullptr; - // instead, it's "running" - mMainRunning = true; } + mPrevCtx = next; + // remember when we resumed this fiber so our next call can measure how + // long the previous resumption was + mResumeTime = LLDate::now().secondsSinceEpoch(); return next; } +LL::WorkQueue::ptr_t scheduler::getWorkQueue() +{ + // Cache a weak_ptr to our target work queue, presuming that + // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance(). + LL::WorkQueue::ptr_t queue{ mQueue.lock() }; + // We probably started before the relevant WorkQueue was created. + if (! queue) + { + // Try again to locate the specified WorkQueue. + queue = LL::WorkQueue::getInstance(qname); + mQueue = queue; + } + return queue; +} + void scheduler::use() { boost::fibers::use_scheduling_algorithm<scheduler>(); diff --git a/indra/llcommon/coro_scheduler.h b/indra/llcommon/coro_scheduler.h index eee2d746b5..7af90685dc 100644 --- a/indra/llcommon/coro_scheduler.h +++ b/indra/llcommon/coro_scheduler.h @@ -47,17 +47,20 @@ public: static void use(); private: - // This is the fiber::id of the main fiber. We use this to discover - // whether the fiber passed to awakened() is in fact the main fiber. + LL::WorkQueue::ptr_t getWorkQueue(); + + // This is the fiber::id of the main fiber. boost::fibers::fiber::id mMainID; - // This context* is nullptr until awakened() notices that the main fiber - // has become ready, at which point it contains the main fiber's context*. + // This context* is nullptr while the main fiber is running or suspended, + // but is set to the main fiber's context each time the main fiber is ready. boost::fibers::context* mMainCtx{}; - // Set when pick_next() returns the main fiber. - bool mMainRunning{ false }; + // Remember the context returned by the previous pick_next() call. + boost::fibers::context* mPrevCtx{}; // If it's been at least this long since the last time the main fiber got // control, jump it to the head of the queue. F64 mTimeslice{ DEFAULT_TIMESLICE }; + // Time when we resumed the most recently running fiber + F64 mResumeTime{ 0 }; // Timestamp as of the last time we suspended the main fiber. F64 mMainLast{ 0 }; // Timestamp of start time diff --git a/indra/llcommon/llcoros.cpp b/indra/llcommon/llcoros.cpp index 0a3ca99b05..5a3cbd2ef1 100644 --- a/indra/llcommon/llcoros.cpp +++ b/indra/llcommon/llcoros.cpp @@ -57,6 +57,7 @@ #include "llsdutil.h" #include "lltimer.h" #include "stringize.h" +#include "scope_exit.h" #if LL_WINDOWS #include <excpt.h> @@ -342,6 +343,33 @@ void LLCoros::toplevel(std::string name, callable_t callable) // run the code the caller actually wants in the coroutine try { + LL::scope_exit report{ + [&corodata] + { + bool allzero = true; + for (const auto& [threshold, occurs] : corodata.mHistogram) + { + if (occurs) + { + allzero = false; + break; + } + } + if (! allzero) + { + LL_WARNS("LLCoros") << "coroutine " << corodata.mName; + const char* sep = " exceeded "; + for (const auto& [threshold, occurs] : corodata.mHistogram) + { + if (occurs) + { + LL_CONT << sep << threshold << " " << occurs << " times"; + sep = ", "; + } + } + LL_ENDL; + } + }}; LL::seh::catcher(callable); } catch (const Stop& exc) @@ -466,7 +494,14 @@ LLBoundListener LLCoros::getStopListener(const std::string& caller, LLCoros::CoroData::CoroData(const std::string& name): super(boost::this_fiber::get_id()), mName(name), - mCreationTime(LLTimer::getTotalSeconds()) + mCreationTime(LLTimer::getTotalSeconds()), + // Preset threshold times in mHistogram + mHistogram{ + {0.004, 0}, + {0.040, 0}, + {0.400, 0}, + {1.000, 0} + } { // we expect the empty string for the main coroutine if (name.empty()) diff --git a/indra/llcommon/llcoros.h b/indra/llcommon/llcoros.h index 662be9dcc9..1edcb7e387 100644 --- a/indra/llcommon/llcoros.h +++ b/indra/llcommon/llcoros.h @@ -37,6 +37,7 @@ #include <boost/fiber/fss.hpp> #include <exception> #include <functional> +#include <map> #include <queue> #include <string> #include <unordered_map> @@ -367,6 +368,13 @@ private: // setStatus() state std::string mStatus; F64 mCreationTime; // since epoch + // Histogram of how many times this coroutine's timeslice exceeds + // certain thresholds. mHistogram is pre-populated with those + // thresholds as keys. If k0 is one threshold key and k1 is the next, + // mHistogram[k0] is the number of times a coroutine timeslice tn ran + // (k0 <= tn < k1). A timeslice less than mHistogram.begin()->first is + // fine; we don't need to record those. + std::map<F64, U32> mHistogram; }; // Identify the current coroutine's CoroData. This local_ptr isn't static |