summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--indra/llcommon/coro_scheduler.cpp84
-rw-r--r--indra/llcommon/coro_scheduler.h15
-rw-r--r--indra/llcommon/llcoros.cpp37
-rw-r--r--indra/llcommon/llcoros.h8
4 files changed, 119 insertions, 25 deletions
diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp
index 2d8b6e1a97..93efac7c7e 100644
--- a/indra/llcommon/coro_scheduler.cpp
+++ b/indra/llcommon/coro_scheduler.cpp
@@ -20,6 +20,7 @@
#include <boost/fiber/operations.hpp>
// other Linden headers
#include "llcallbacklist.h"
+#include "llcoros.h"
#include "lldate.h"
#include "llerror.h"
@@ -56,17 +57,56 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept
boost::fibers::context* scheduler::pick_next() noexcept
{
+ auto now = LLDate::now().secondsSinceEpoch();
// count calls to pick_next()
++mSwitches;
// pick_next() is called when the previous fiber has suspended, and we
// need to pick another. Did the previous pick_next() call pick the main
- // fiber? If so, it's the main fiber that just suspended.
- auto now = LLDate::now().secondsSinceEpoch();
- if (mMainRunning)
+ // fiber? (Or is this the first pick_next() call?) If so, it's the main
+ // fiber that just suspended.
+ if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID)
{
- mMainRunning = false;
mMainLast = now;
}
+ else
+ {
+ // How long did we spend in the fiber that just suspended?
+ // Don't bother with long runs of the main fiber, since (a) it happens
+ // pretty often and (b) it's moderately likely that we've reached here
+ // from the canonical yield at the top of mainloop, and what we'd want
+ // to know about is whatever the main fiber was doing in the
+ // *previous* iteration of mainloop.
+ F64 elapsed{ now - mResumeTime };
+ LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) };
+ // Find iterator to the first mHistogram key greater than elapsed.
+ auto past = data.mHistogram.upper_bound(elapsed);
+ // If the smallest key (mHistogram.begin()->first) is greater than
+ // elapsed, then we need not bother with this timeslice.
+ if (past != data.mHistogram.begin())
+ {
+ // Here elapsed was greater than at least one key. Back off to the
+ // previous entry and increment that count. If it's end(), backing
+ // off gets us the last entry -- assuming mHistogram isn't empty.
+ llassert(! data.mHistogram.empty());
+ ++(--past)->second;
+ LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+ // make sure the queue exists
+ if (queue)
+ {
+ // If it proves difficult to track down *why* the fiber spent so
+ // much time, consider also binding and reporting
+ // boost::stacktrace::stacktrace().
+ queue->post(
+ [name=std::move(data.getName()), elapsed]
+ {
+ LL_WARNS_ONCE("LLCoros.scheduler")
+ << "Coroutine " << name << " ran for "
+ << elapsed << " seconds" << LL_ENDL;
+
+ });
+ }
+ }
+ }
boost::fibers::context* next;
@@ -96,17 +136,9 @@ boost::fibers::context* scheduler::pick_next() noexcept
// passage could be skipped.
// Record this event for logging, but push it off to a thread pool to
- // perform that work. Presumably std::weak_ptr::lock() is cheaper than
- // WorkQueue::getInstance().
- LL::WorkQueue::ptr_t queue{ mQueue.lock() };
- // We probably started before the relevant WorkQueue was created.
- if (! queue)
- {
- // Try again to locate the specified WorkQueue.
- queue = LL::WorkQueue::getInstance(qname);
- mQueue = queue;
- }
- // Both the lock() call and the getInstance() call might have failed.
+ // perform that work.
+ LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+ // The work queue we're looking for might not exist right now.
if (queue)
{
// Bind values. Do NOT bind 'this' to avoid cross-thread access!
@@ -116,7 +148,6 @@ boost::fibers::context* scheduler::pick_next() noexcept
// so we have no access.
queue->post(
[switches=mSwitches, start=mStart, elapsed, now]
- ()
{
U32 runtime(U32(now) - U32(start));
U32 minutes(runtime / 60u);
@@ -150,12 +181,29 @@ boost::fibers::context* scheduler::pick_next() noexcept
{
// we're about to resume the main fiber: it's no longer "ready"
mMainCtx = nullptr;
- // instead, it's "running"
- mMainRunning = true;
}
+ mPrevCtx = next;
+ // remember when we resumed this fiber so our next call can measure how
+ // long the previous resumption was
+ mResumeTime = LLDate::now().secondsSinceEpoch();
return next;
}
+LL::WorkQueue::ptr_t scheduler::getWorkQueue()
+{
+ // Cache a weak_ptr to our target work queue, presuming that
+ // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance().
+ LL::WorkQueue::ptr_t queue{ mQueue.lock() };
+ // We probably started before the relevant WorkQueue was created.
+ if (! queue)
+ {
+ // Try again to locate the specified WorkQueue.
+ queue = LL::WorkQueue::getInstance(qname);
+ mQueue = queue;
+ }
+ return queue;
+}
+
void scheduler::use()
{
boost::fibers::use_scheduling_algorithm<scheduler>();
diff --git a/indra/llcommon/coro_scheduler.h b/indra/llcommon/coro_scheduler.h
index eee2d746b5..7af90685dc 100644
--- a/indra/llcommon/coro_scheduler.h
+++ b/indra/llcommon/coro_scheduler.h
@@ -47,17 +47,20 @@ public:
static void use();
private:
- // This is the fiber::id of the main fiber. We use this to discover
- // whether the fiber passed to awakened() is in fact the main fiber.
+ LL::WorkQueue::ptr_t getWorkQueue();
+
+ // This is the fiber::id of the main fiber.
boost::fibers::fiber::id mMainID;
- // This context* is nullptr until awakened() notices that the main fiber
- // has become ready, at which point it contains the main fiber's context*.
+ // This context* is nullptr while the main fiber is running or suspended,
+ // but is set to the main fiber's context each time the main fiber is ready.
boost::fibers::context* mMainCtx{};
- // Set when pick_next() returns the main fiber.
- bool mMainRunning{ false };
+ // Remember the context returned by the previous pick_next() call.
+ boost::fibers::context* mPrevCtx{};
// If it's been at least this long since the last time the main fiber got
// control, jump it to the head of the queue.
F64 mTimeslice{ DEFAULT_TIMESLICE };
+ // Time when we resumed the most recently running fiber
+ F64 mResumeTime{ 0 };
// Timestamp as of the last time we suspended the main fiber.
F64 mMainLast{ 0 };
// Timestamp of start time
diff --git a/indra/llcommon/llcoros.cpp b/indra/llcommon/llcoros.cpp
index 0a3ca99b05..5a3cbd2ef1 100644
--- a/indra/llcommon/llcoros.cpp
+++ b/indra/llcommon/llcoros.cpp
@@ -57,6 +57,7 @@
#include "llsdutil.h"
#include "lltimer.h"
#include "stringize.h"
+#include "scope_exit.h"
#if LL_WINDOWS
#include <excpt.h>
@@ -342,6 +343,33 @@ void LLCoros::toplevel(std::string name, callable_t callable)
// run the code the caller actually wants in the coroutine
try
{
+ LL::scope_exit report{
+ [&corodata]
+ {
+ bool allzero = true;
+ for (const auto& [threshold, occurs] : corodata.mHistogram)
+ {
+ if (occurs)
+ {
+ allzero = false;
+ break;
+ }
+ }
+ if (! allzero)
+ {
+ LL_WARNS("LLCoros") << "coroutine " << corodata.mName;
+ const char* sep = " exceeded ";
+ for (const auto& [threshold, occurs] : corodata.mHistogram)
+ {
+ if (occurs)
+ {
+ LL_CONT << sep << threshold << " " << occurs << " times";
+ sep = ", ";
+ }
+ }
+ LL_ENDL;
+ }
+ }};
LL::seh::catcher(callable);
}
catch (const Stop& exc)
@@ -466,7 +494,14 @@ LLBoundListener LLCoros::getStopListener(const std::string& caller,
LLCoros::CoroData::CoroData(const std::string& name):
super(boost::this_fiber::get_id()),
mName(name),
- mCreationTime(LLTimer::getTotalSeconds())
+ mCreationTime(LLTimer::getTotalSeconds()),
+ // Preset threshold times in mHistogram
+ mHistogram{
+ {0.004, 0},
+ {0.040, 0},
+ {0.400, 0},
+ {1.000, 0}
+ }
{
// we expect the empty string for the main coroutine
if (name.empty())
diff --git a/indra/llcommon/llcoros.h b/indra/llcommon/llcoros.h
index 662be9dcc9..1edcb7e387 100644
--- a/indra/llcommon/llcoros.h
+++ b/indra/llcommon/llcoros.h
@@ -37,6 +37,7 @@
#include <boost/fiber/fss.hpp>
#include <exception>
#include <functional>
+#include <map>
#include <queue>
#include <string>
#include <unordered_map>
@@ -367,6 +368,13 @@ private:
// setStatus() state
std::string mStatus;
F64 mCreationTime; // since epoch
+ // Histogram of how many times this coroutine's timeslice exceeds
+ // certain thresholds. mHistogram is pre-populated with those
+ // thresholds as keys. If k0 is one threshold key and k1 is the next,
+ // mHistogram[k0] is the number of times a coroutine timeslice tn ran
+ // (k0 <= tn < k1). A timeslice less than mHistogram.begin()->first is
+ // fine; we don't need to record those.
+ std::map<F64, U32> mHistogram;
};
// Identify the current coroutine's CoroData. This local_ptr isn't static