summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornat-goodspeed <nat@lindenlab.com>2024-10-22 13:17:07 -0400
committerGitHub <noreply@github.com>2024-10-22 13:17:07 -0400
commit394f7b37f2ec05c7cfb32c350432886f1c493c85 (patch)
tree35cee467cdf358379f9f8dcd0cc0a62a3b4b7c79
parentede5af1b18b47acea039e91c48dceb136dfbb337 (diff)
parent77f9ab96779c55a20a5ccd8b9e906227b3e8e5c8 (diff)
Merge pull request #2911 from secondlife/nat/warn-timeslice
Make `llcoro::scheduler` log coros that run too long between yields.
-rw-r--r--indra/llcommon/coro_scheduler.cpp83
-rw-r--r--indra/llcommon/coro_scheduler.h15
-rw-r--r--indra/llcommon/llcoros.cpp156
-rw-r--r--indra/llcommon/llcoros.h105
-rw-r--r--indra/llcommon/workqueue.cpp4
-rw-r--r--indra/llmessage/llcoproceduremanager.cpp24
-rw-r--r--indra/test/sync.h6
7 files changed, 271 insertions, 122 deletions
diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp
index 2d8b6e1a97..b6117fa6a1 100644
--- a/indra/llcommon/coro_scheduler.cpp
+++ b/indra/llcommon/coro_scheduler.cpp
@@ -20,6 +20,7 @@
#include <boost/fiber/operations.hpp>
// other Linden headers
#include "llcallbacklist.h"
+#include "llcoros.h"
#include "lldate.h"
#include "llerror.h"
@@ -56,17 +57,55 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept
boost::fibers::context* scheduler::pick_next() noexcept
{
+ auto now = LLDate::now().secondsSinceEpoch();
// count calls to pick_next()
++mSwitches;
// pick_next() is called when the previous fiber has suspended, and we
// need to pick another. Did the previous pick_next() call pick the main
- // fiber? If so, it's the main fiber that just suspended.
- auto now = LLDate::now().secondsSinceEpoch();
- if (mMainRunning)
+ // fiber? (Or is this the first pick_next() call?) If so, it's the main
+ // fiber that just suspended.
+ if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID)
{
- mMainRunning = false;
mMainLast = now;
}
+ else
+ {
+ // How long did we spend in the fiber that just suspended?
+ // Don't bother with long runs of the main fiber, since (a) it happens
+ // pretty often and (b) it's moderately likely that we've reached here
+ // from the canonical yield at the top of mainloop, and what we'd want
+ // to know about is whatever the main fiber was doing in the
+ // *previous* iteration of mainloop.
+ F64 elapsed{ now - mResumeTime };
+ LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) };
+ // Find iterator to the first mHistogram key greater than elapsed.
+ auto past = data.mHistogram.upper_bound(elapsed);
+ // If the smallest key (mHistogram.begin()->first) is greater than
+ // elapsed, then we need not bother with this timeslice.
+ if (past != data.mHistogram.begin())
+ {
+ // Here elapsed was greater than at least one key. Back off to the
+ // previous entry and increment that count. If it's end(), backing
+ // off gets us the last entry -- assuming mHistogram isn't empty.
+ llassert(! data.mHistogram.empty());
+ ++(--past)->second;
+ LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+ // make sure the queue exists
+ if (queue)
+ {
+ // If it proves difficult to track down *why* the fiber spent so
+ // much time, consider also binding and reporting
+ // boost::stacktrace::stacktrace().
+ queue->post(
+ [name=data.getName(), elapsed]
+ {
+ LL_WARNS_ONCE("LLCoros.scheduler")
+ << "Coroutine " << name << " ran for "
+ << elapsed << " seconds" << LL_ENDL;
+ });
+ }
+ }
+ }
boost::fibers::context* next;
@@ -96,17 +135,9 @@ boost::fibers::context* scheduler::pick_next() noexcept
// passage could be skipped.
// Record this event for logging, but push it off to a thread pool to
- // perform that work. Presumably std::weak_ptr::lock() is cheaper than
- // WorkQueue::getInstance().
- LL::WorkQueue::ptr_t queue{ mQueue.lock() };
- // We probably started before the relevant WorkQueue was created.
- if (! queue)
- {
- // Try again to locate the specified WorkQueue.
- queue = LL::WorkQueue::getInstance(qname);
- mQueue = queue;
- }
- // Both the lock() call and the getInstance() call might have failed.
+ // perform that work.
+ LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+ // The work queue we're looking for might not exist right now.
if (queue)
{
// Bind values. Do NOT bind 'this' to avoid cross-thread access!
@@ -116,7 +147,6 @@ boost::fibers::context* scheduler::pick_next() noexcept
// so we have no access.
queue->post(
[switches=mSwitches, start=mStart, elapsed, now]
- ()
{
U32 runtime(U32(now) - U32(start));
U32 minutes(runtime / 60u);
@@ -150,12 +180,29 @@ boost::fibers::context* scheduler::pick_next() noexcept
{
// we're about to resume the main fiber: it's no longer "ready"
mMainCtx = nullptr;
- // instead, it's "running"
- mMainRunning = true;
}
+ mPrevCtx = next;
+ // remember when we resumed this fiber so our next call can measure how
+ // long the previous resumption was
+ mResumeTime = LLDate::now().secondsSinceEpoch();
return next;
}
+LL::WorkQueue::ptr_t scheduler::getWorkQueue()
+{
+ // Cache a weak_ptr to our target work queue, presuming that
+ // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance().
+ LL::WorkQueue::ptr_t queue{ mQueue.lock() };
+ // We probably started before the relevant WorkQueue was created.
+ if (! queue)
+ {
+ // Try again to locate the specified WorkQueue.
+ queue = LL::WorkQueue::getInstance(qname);
+ mQueue = queue;
+ }
+ return queue;
+}
+
void scheduler::use()
{
boost::fibers::use_scheduling_algorithm<scheduler>();
diff --git a/indra/llcommon/coro_scheduler.h b/indra/llcommon/coro_scheduler.h
index eee2d746b5..7af90685dc 100644
--- a/indra/llcommon/coro_scheduler.h
+++ b/indra/llcommon/coro_scheduler.h
@@ -47,17 +47,20 @@ public:
static void use();
private:
- // This is the fiber::id of the main fiber. We use this to discover
- // whether the fiber passed to awakened() is in fact the main fiber.
+ LL::WorkQueue::ptr_t getWorkQueue();
+
+ // This is the fiber::id of the main fiber.
boost::fibers::fiber::id mMainID;
- // This context* is nullptr until awakened() notices that the main fiber
- // has become ready, at which point it contains the main fiber's context*.
+ // This context* is nullptr while the main fiber is running or suspended,
+ // but is set to the main fiber's context each time the main fiber is ready.
boost::fibers::context* mMainCtx{};
- // Set when pick_next() returns the main fiber.
- bool mMainRunning{ false };
+ // Remember the context returned by the previous pick_next() call.
+ boost::fibers::context* mPrevCtx{};
// If it's been at least this long since the last time the main fiber got
// control, jump it to the head of the queue.
F64 mTimeslice{ DEFAULT_TIMESLICE };
+ // Time when we resumed the most recently running fiber
+ F64 mResumeTime{ 0 };
// Timestamp as of the last time we suspended the main fiber.
F64 mMainLast{ 0 };
// Timestamp of start time
diff --git a/indra/llcommon/llcoros.cpp b/indra/llcommon/llcoros.cpp
index 1ae5c87a00..5a3cbd2ef1 100644
--- a/indra/llcommon/llcoros.cpp
+++ b/indra/llcommon/llcoros.cpp
@@ -57,16 +57,19 @@
#include "llsdutil.h"
#include "lltimer.h"
#include "stringize.h"
+#include "scope_exit.h"
#if LL_WINDOWS
#include <excpt.h>
#endif
+thread_local std::unordered_map<std::string, int> LLCoros::mPrefixMap;
+thread_local std::unordered_map<std::string, LLCoros::id> LLCoros::mNameMap;
+
// static
bool LLCoros::on_main_coro()
{
- return (!LLCoros::instanceExists() ||
- LLCoros::getName().empty());
+ return (!instanceExists() || get_CoroData().isMain);
}
// static
@@ -76,11 +79,11 @@ bool LLCoros::on_main_thread_main_coro()
}
// static
-LLCoros::CoroData& LLCoros::get_CoroData(const std::string&)
+LLCoros::CoroData& LLCoros::get_CoroData()
{
CoroData* current{ nullptr };
// be careful about attempted accesses in the final throes of app shutdown
- if (! wasDeleted())
+ if (instanceExists())
{
current = instance().mCurrent.get();
}
@@ -89,16 +92,26 @@ LLCoros::CoroData& LLCoros::get_CoroData(const std::string&)
// canonical values.
if (! current)
{
- static std::atomic<int> which_thread(0);
- // Use alternate CoroData constructor.
- static thread_local CoroData sMain(which_thread++);
// We need not reset() the local_ptr to this instance; we'll simply
// find it again every time we discover that current is null.
- current = &sMain;
+ current = &main_CoroData();
}
return *current;
}
+LLCoros::CoroData& LLCoros::get_CoroData(id id)
+{
+ auto found = CoroData::getInstance(id);
+ return found? *found : main_CoroData();
+}
+
+LLCoros::CoroData& LLCoros::main_CoroData()
+{
+ // tell CoroData we're "main"
+ static thread_local CoroData sMain("");
+ return sMain;
+}
+
//static
LLCoros::coro::id LLCoros::get_self()
{
@@ -108,28 +121,28 @@ LLCoros::coro::id LLCoros::get_self()
//static
void LLCoros::set_consuming(bool consuming)
{
- auto& data(get_CoroData("set_consuming()"));
+ auto& data(get_CoroData());
// DO NOT call this on the main() coroutine.
- llassert_always(! data.mName.empty());
+ llassert_always(! data.isMain);
data.mConsuming = consuming;
}
//static
bool LLCoros::get_consuming()
{
- return get_CoroData("get_consuming()").mConsuming;
+ return get_CoroData().mConsuming;
}
// static
void LLCoros::setStatus(const std::string& status)
{
- get_CoroData("setStatus()").mStatus = status;
+ get_CoroData().mStatus = status;
}
// static
std::string LLCoros::getStatus()
{
- return get_CoroData("getStatus()").mStatus;
+ return get_CoroData().mStatus;
}
LLCoros::LLCoros():
@@ -186,9 +199,8 @@ void LLCoros::cleanupSingleton()
std::string LLCoros::generateDistinctName(const std::string& prefix) const
{
- static int unique = 0;
-
- // Allowing empty name would make getName()'s not-found return ambiguous.
+ // Empty name would trigger CoroData's constructor's special case for the
+ // main coroutine.
if (prefix.empty())
{
LL_ERRS("LLCoros") << "LLCoros::launch(): pass non-empty name string" << LL_ENDL;
@@ -196,9 +208,11 @@ std::string LLCoros::generateDistinctName(const std::string& prefix) const
// If the specified name isn't already in the map, just use that.
std::string name(prefix);
+ // maintain a distinct int suffix for each prefix
+ int& unique = mPrefixMap[prefix];
- // Until we find an unused name, append a numeric suffix for uniqueness.
- while (CoroData::getInstance(name))
+ // Until we find an unused name, append int suffix for uniqueness.
+ while (mNameMap.find(name) != mNameMap.end())
{
name = stringize(prefix, unique++);
}
@@ -207,9 +221,16 @@ std::string LLCoros::generateDistinctName(const std::string& prefix) const
bool LLCoros::killreq(const std::string& name)
{
- auto found = CoroData::getInstance(name);
+ auto foundName = mNameMap.find(name);
+ if (foundName == mNameMap.end())
+ {
+ // couldn't find that name in map
+ return false;
+ }
+ auto found = CoroData::getInstance(foundName->second);
if (! found)
{
+ // found name, but CoroData with that ID key no longer exists
return false;
}
// Next time the subject coroutine calls checkStop(), make it terminate.
@@ -224,14 +245,13 @@ bool LLCoros::killreq(const std::string& name)
//static
std::string LLCoros::getName()
{
- return get_CoroData("getName()").mName;
+ return get_CoroData().getName();
}
-//static
-std::string LLCoros::logname()
+// static
+std::string LLCoros::getName(id id)
{
- auto& data(get_CoroData("logname()"));
- return data.mName.empty()? data.getKey() : data.mName;
+ return get_CoroData(id).getName();
}
void LLCoros::saveException(const std::string& name, std::exception_ptr exc)
@@ -264,7 +284,7 @@ void LLCoros::printActiveCoroutines(const std::string& when)
{
LL_INFOS("LLCoros") << "-------------- List of active coroutines ------------";
F64 time = LLTimer::getTotalSeconds();
- for (auto& cd : CoroData::instance_snapshot())
+ for (const auto& cd : CoroData::instance_snapshot())
{
F64 life_time = time - cd.mCreationTime;
LL_CONT << LL_NEWLINE
@@ -323,6 +343,33 @@ void LLCoros::toplevel(std::string name, callable_t callable)
// run the code the caller actually wants in the coroutine
try
{
+ LL::scope_exit report{
+ [&corodata]
+ {
+ bool allzero = true;
+ for (const auto& [threshold, occurs] : corodata.mHistogram)
+ {
+ if (occurs)
+ {
+ allzero = false;
+ break;
+ }
+ }
+ if (! allzero)
+ {
+ LL_WARNS("LLCoros") << "coroutine " << corodata.mName;
+ const char* sep = " exceeded ";
+ for (const auto& [threshold, occurs] : corodata.mHistogram)
+ {
+ if (occurs)
+ {
+ LL_CONT << sep << threshold << " " << occurs << " times";
+ sep = ", ";
+ }
+ }
+ LL_ENDL;
+ }
+ }};
LL::seh::catcher(callable);
}
catch (const Stop& exc)
@@ -364,8 +411,8 @@ void LLCoros::checkStop(callable_t cleanup)
// do this AFTER the check above, because get_CoroData() depends on the
// local_ptr in our instance().
- auto& data(get_CoroData("checkStop()"));
- if (data.mName.empty())
+ auto& data(get_CoroData());
+ if (data.isMain)
{
// Our Stop exception and its subclasses are intended to stop loitering
// coroutines. Don't throw it from the main coroutine.
@@ -385,7 +432,7 @@ void LLCoros::checkStop(callable_t cleanup)
{
// Someone wants to kill this coroutine
cleanup();
- LLTHROW(Killed(stringize("coroutine ", data.mName, " killed by ", data.mKilledBy)));
+ LLTHROW(Killed(stringize("coroutine ", data.getName(), " killed by ", data.mKilledBy)));
}
}
@@ -445,20 +492,51 @@ LLBoundListener LLCoros::getStopListener(const std::string& caller,
}
LLCoros::CoroData::CoroData(const std::string& name):
- LLInstanceTracker<CoroData, std::string>(name),
+ super(boost::this_fiber::get_id()),
mName(name),
- mCreationTime(LLTimer::getTotalSeconds())
+ mCreationTime(LLTimer::getTotalSeconds()),
+ // Preset threshold times in mHistogram
+ mHistogram{
+ {0.004, 0},
+ {0.040, 0},
+ {0.400, 0},
+ {1.000, 0}
+ }
{
+ // we expect the empty string for the main coroutine
+ if (name.empty())
+ {
+ isMain = true;
+ if (on_main_thread())
+ {
+ // main coroutine on main thread
+ mName = "main";
+ }
+ else
+ {
+ // main coroutine on some other thread
+ static std::atomic<int> main_no{ 0 };
+ mName = stringize("main", ++main_no);
+ }
+ }
+ // maintain LLCoros::mNameMap
+ LLCoros::mNameMap.emplace(mName, getKey());
+}
+
+LLCoros::CoroData::~CoroData()
+{
+ // Don't try to erase the static main CoroData from our static
+ // thread_local mNameMap; that could run into destruction order problems.
+ if (! isMain)
+ {
+ LLCoros::mNameMap.erase(mName);
+ }
}
-LLCoros::CoroData::CoroData(int n):
- // This constructor is used for the thread_local instance belonging to the
- // default coroutine on each thread. We must give each one a different
- // LLInstanceTracker key because LLInstanceTracker's map spans all
- // threads, but we want the default coroutine on each thread to have the
- // empty string as its visible name because some consumers test for that.
- LLInstanceTracker<CoroData, std::string>("main" + stringize(n)),
- mName(),
- mCreationTime(LLTimer::getTotalSeconds())
+std::string LLCoros::CoroData::getName() const
{
+ if (mStatus.empty())
+ return mName;
+ else
+ return stringize(mName, " (", mStatus, ")");
}
diff --git a/indra/llcommon/llcoros.h b/indra/llcommon/llcoros.h
index 0291d7f1d9..1edcb7e387 100644
--- a/indra/llcommon/llcoros.h
+++ b/indra/llcommon/llcoros.h
@@ -37,41 +37,38 @@
#include <boost/fiber/fss.hpp>
#include <exception>
#include <functional>
+#include <map>
#include <queue>
#include <string>
+#include <unordered_map>
+
+namespace llcoro
+{
+class scheduler;
+}
/**
- * Registry of named Boost.Coroutine instances
- *
- * The Boost.Coroutine library supports the general case of a coroutine
- * accepting arbitrary parameters and yielding multiple (sets of) results. For
- * such use cases, it's natural for the invoking code to retain the coroutine
- * instance: the consumer repeatedly calls into the coroutine, perhaps passing
- * new parameter values, prompting it to yield its next result.
- *
- * Our typical coroutine usage is different, though. For us, coroutines
- * provide an alternative to the @c Responder pattern. Our typical coroutine
- * has @c void return, invoked in fire-and-forget mode: the handler for some
- * user gesture launches the coroutine and promptly returns to the main loop.
- * The coroutine initiates some action that will take multiple frames (e.g. a
- * capability request), waits for its result, processes it and silently steals
- * away.
+ * Registry of named Boost.Fiber instances
*
- * This usage poses two (related) problems:
+ * When the viewer first introduced the semi-independent execution agents now
+ * called fibers, the term "fiber" had not yet become current, and the only
+ * available libraries used the term "coroutine" instead. Within the viewer we
+ * continue to use the term "coroutines," though at present they are actually
+ * boost::fibers::fiber instances.
*
- * # Who should own the coroutine instance? If it's simply local to the
- * handler code that launches it, return from the handler will destroy the
- * coroutine object, terminating the coroutine.
- * # Once the coroutine terminates, in whatever way, who's responsible for
- * cleaning up the coroutine object?
+ * Coroutines provide an alternative to the @c Responder pattern. Our typical
+ * coroutine has @c void return, invoked in fire-and-forget mode: the handler
+ * for some user gesture launches the coroutine and promptly returns to the
+ * main loop. The coroutine initiates some action that will take multiple
+ * frames (e.g. a capability request), waits for its result, processes it and
+ * silently steals away.
*
* LLCoros is a Singleton collection of currently-active coroutine instances.
* Each has a name. You ask LLCoros to launch a new coroutine with a suggested
* name prefix; from your prefix it generates a distinct name, registers the
* new coroutine and returns the actual name.
*
- * The name
- * can provide diagnostic info: we can look up the name of the
+ * The name can provide diagnostic info: we can look up the name of the
* currently-running coroutine.
*/
class LL_COMMON_API LLCoros: public LLSingleton<LLCoros>
@@ -91,12 +88,8 @@ public:
// llassert(LLCoros::on_main_thread_main_coro())
static bool on_main_thread_main_coro();
- /// The viewer's use of the term "coroutine" became deeply embedded before
- /// the industry term "fiber" emerged to distinguish userland threads from
- /// simpler, more transient kinds of coroutines. Semantically they've
- /// always been fibers. But at this point in history, we're pretty much
- /// stuck with the term "coroutine."
typedef boost::fibers::fiber coro;
+ typedef coro::id id;
/// Canonical callable type
typedef std::function<void()> callable_t;
@@ -150,13 +143,16 @@ public:
/**
* From within a coroutine, look up the (tweaked) name string by which
- * this coroutine is registered. Returns the empty string if not found
- * (e.g. if the coroutine was launched by hand rather than using
- * LLCoros::launch()).
+ * this coroutine is registered.
*/
static std::string getName();
/**
+ * Given an id, return the name of that coroutine.
+ */
+ static std::string getName(id);
+
+ /**
* rethrow() is called by the thread's main fiber to propagate an
* exception from any coroutine into the main fiber, where it can engage
* the normal unhandled-exception machinery, up to and including crash
@@ -170,13 +166,6 @@ public:
void rethrow();
/**
- * This variation returns a name suitable for log messages: the explicit
- * name for an explicitly-launched coroutine, or "mainN" for the default
- * coroutine on a thread.
- */
- static std::string logname();
-
- /**
* For delayed initialization. To be clear, this will only affect
* coroutines launched @em after this point. The underlying facility
* provides no way to alter the stack size of any running coroutine.
@@ -187,7 +176,7 @@ public:
void printActiveCoroutines(const std::string& when=std::string());
/// get the current coro::id for those who really really care
- static coro::id get_self();
+ static id get_self();
/**
* Most coroutines, most of the time, don't "consume" the events for which
@@ -236,6 +225,7 @@ public:
setStatus(status);
}
TempStatus(const TempStatus&) = delete;
+ TempStatus& operator=(const TempStatus&) = delete;
~TempStatus()
{
setStatus(mOldStatus);
@@ -331,10 +321,14 @@ public:
using local_ptr = boost::fibers::fiber_specific_ptr<T>;
private:
+ friend class llcoro::scheduler;
+
std::string generateDistinctName(const std::string& prefix) const;
void toplevel(std::string name, callable_t callable);
struct CoroData;
- static CoroData& get_CoroData(const std::string& caller);
+ static CoroData& get_CoroData();
+ static CoroData& get_CoroData(id);
+ static CoroData& main_CoroData();
void saveException(const std::string& name, std::exception_ptr exc);
LLTempBoundListener mConn;
@@ -355,13 +349,18 @@ private:
S32 mStackSize;
// coroutine-local storage, as it were: one per coro we track
- struct CoroData: public LLInstanceTracker<CoroData, std::string>
+ struct CoroData: public LLInstanceTracker<CoroData, id>
{
+ using super = LLInstanceTracker<CoroData, id>;
+
CoroData(const std::string& name);
- CoroData(int n);
+ ~CoroData();
+ std::string getName() const;
+
+ bool isMain{ false };
// tweaked name of the current coroutine
- const std::string mName;
+ std::string mName;
// set_consuming() state -- don't consume events unless specifically directed
bool mConsuming{ false };
// killed by which coroutine
@@ -369,20 +368,24 @@ private:
// setStatus() state
std::string mStatus;
F64 mCreationTime; // since epoch
+ // Histogram of how many times this coroutine's timeslice exceeds
+ // certain thresholds. mHistogram is pre-populated with those
+ // thresholds as keys. If k0 is one threshold key and k1 is the next,
+ // mHistogram[k0] is the number of times a coroutine timeslice tn ran
+ // (k0 <= tn < k1). A timeslice less than mHistogram.begin()->first is
+ // fine; we don't need to record those.
+ std::map<F64, U32> mHistogram;
};
// Identify the current coroutine's CoroData. This local_ptr isn't static
// because it's a member of an LLSingleton, and we rely on it being
// cleaned up in proper dependency order.
local_ptr<CoroData> mCurrent;
-};
-namespace llcoro
-{
-
-inline
-std::string logname() { return LLCoros::logname(); }
-
-} // llcoro
+ // ensure name uniqueness
+ static thread_local std::unordered_map<std::string, int> mPrefixMap;
+ // lookup by name
+ static thread_local std::unordered_map<std::string, id> mNameMap;
+};
#endif /* ! defined(LL_LLCOROS_H) */
diff --git a/indra/llcommon/workqueue.cpp b/indra/llcommon/workqueue.cpp
index 9138c862f9..8b7b97a1f9 100644
--- a/indra/llcommon/workqueue.cpp
+++ b/indra/llcommon/workqueue.cpp
@@ -127,9 +127,7 @@ void LL::WorkQueueBase::error(const std::string& msg)
void LL::WorkQueueBase::checkCoroutine(const std::string& method)
{
- // By convention, the default coroutine on each thread has an empty name
- // string. See also LLCoros::logname().
- if (LLCoros::getName().empty())
+ if (LLCoros::on_main_coro())
{
LLTHROW(Error("Do not call " + method + " from a thread's default coroutine"));
}
diff --git a/indra/llmessage/llcoproceduremanager.cpp b/indra/llmessage/llcoproceduremanager.cpp
index 13972ad399..5539ca7b86 100644
--- a/indra/llmessage/llcoproceduremanager.cpp
+++ b/indra/llmessage/llcoproceduremanager.cpp
@@ -403,6 +403,7 @@ void LLCoprocedurePool::coprocedureInvokerCoro(
CoprocQueuePtr pendingCoprocs,
LLCoreHttpUtil::HttpCoroutineAdapter::ptr_t httpAdapter)
{
+ std::string prevtask;
for (;;)
{
// It is VERY IMPORTANT that we instantiate a new ptr_t just before
@@ -424,10 +425,25 @@ void LLCoprocedurePool::coprocedureInvokerCoro(
// destroyed during pop_wait_for().
QueuedCoproc::ptr_t coproc;
boost::fibers::channel_op_status status;
+ // Each time control reaches our custom coroutine scheduler, we check
+ // how long the previous coroutine ran before yielding, and report
+ // coroutines longer than a certain cutoff. But these coprocedure pool
+ // coroutines are generic; the only way we know what work they're
+ // doing is the task 'status' set by LLCoros::setStatus(). But what if
+ // the coroutine runs the task to completion and returns to waiting?
+ // It does no good to report that "waiting" ran long. So each time we
+ // enter "waiting" status, also report the *previous* task name.
+ std::string waiting = "waiting", newstatus;
+ if (prevtask.empty())
{
- LLCoros::TempStatus st("waiting for work for 10s");
- status = pendingCoprocs->pop_wait_for(coproc, std::chrono::seconds(10));
+ newstatus = waiting;
}
+ else
+ {
+ newstatus = stringize("done ", prevtask, "; ", waiting);
+ }
+ LLCoros::setStatus(newstatus);
+ status = pendingCoprocs->pop_wait_for(coproc, std::chrono::seconds(10));
if (status == boost::fibers::channel_op_status::closed)
{
break;
@@ -436,6 +452,7 @@ void LLCoprocedurePool::coprocedureInvokerCoro(
if(status == boost::fibers::channel_op_status::timeout)
{
LL_DEBUGS_ONCE("CoProcMgr") << "pool '" << mPoolName << "' waiting." << LL_ENDL;
+ prevtask.clear();
continue;
}
// we actually popped an item
@@ -446,6 +463,9 @@ void LLCoprocedurePool::coprocedureInvokerCoro(
try
{
+ // set "status" of pool coroutine to the name of the coproc task
+ prevtask = coproc->mName;
+ LLCoros::setStatus(prevtask);
coproc->mProc(httpAdapter, coproc->mId);
}
catch (const LLCoros::Stop &e)
diff --git a/indra/test/sync.h b/indra/test/sync.h
index 82eef1e5f5..abeb4e17a8 100644
--- a/indra/test/sync.h
+++ b/indra/test/sync.h
@@ -69,7 +69,7 @@ public:
// misleading, as it will be emitted after waiting threads have
// already awakened. But emitting the log message within the lock
// would seem to hold the lock longer than we really ought.
- LL_DEBUGS() << llcoro::logname() << " bump(" << n << ") -> " << updated << LL_ENDL;
+ LL_DEBUGS() << LLCoros::getName() << " bump(" << n << ") -> " << updated << LL_ENDL;
}
/**
@@ -82,7 +82,7 @@ public:
*/
void set(int n)
{
- LL_DEBUGS() << llcoro::logname() << " set(" << n << ")" << LL_ENDL;
+ LL_DEBUGS() << LLCoros::getName() << " set(" << n << ")" << LL_ENDL;
mCond.set_all(n);
}
@@ -101,7 +101,7 @@ public:
private:
void yield_until(const char* func, int arg, int until)
{
- std::string name(llcoro::logname());
+ std::string name(LLCoros::getName());
LL_DEBUGS() << name << " yield_until(" << until << ") suspending" << LL_ENDL;
if (! mCond.wait_for_equal(mTimeout, until))
{