From a76209d099c3a18967d26e5b091f83e9ae9fd6a1 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Tue, 22 Oct 2024 09:44:43 -0400 Subject: Eliminate double names for coros + "empty name is main" convention. Instead, introduce bool CoroData::isMain and test that. Use "main" for the name of the main coroutine. That eliminates the logname() method, also the llcoro::logname() free function. It also obviates the alternate CoroData constructor. Use boost::fibers::fiber::id as the LLInstanceTracker key for CoroData, instead of the coroutine name. Introduce get_CoroData(id), also getName(id). Extract static CoroData for the main coroutine to main_CoroData() so both get_CoroData() overloads can use it. Ditch unused get_CoroData(string) parameter: now get_CoroData(). Introduce LLCoros::mNameMap for lookup by name (e.g. killreq()). CoroData's constructor puts an entry into mNameMap; the destructor removes it. Since mNameMap is thread_local (unlike an LLInstanceTracker key), that theoretically permits duplicate coroutine names on different threads. Introduce mPrefixMap to help generate distinct coroutine names, instead of a single scalar. Introduce CoroData::getName(), and use it in both LLCoros::getName() overloads. CoroData::getName() appends mStatus if it's not empty. This is useful for disambiguating generic pool coroutines based on the current task. --- indra/llcommon/llcoros.cpp | 119 +++++++++++++++++++++++++++++-------------- indra/llcommon/llcoros.h | 97 +++++++++++++++++------------------ indra/llcommon/workqueue.cpp | 4 +- indra/test/sync.h | 6 +-- 4 files changed, 131 insertions(+), 95 deletions(-) diff --git a/indra/llcommon/llcoros.cpp b/indra/llcommon/llcoros.cpp index 1ae5c87a00..0a3ca99b05 100644 --- a/indra/llcommon/llcoros.cpp +++ b/indra/llcommon/llcoros.cpp @@ -62,11 +62,13 @@ #include #endif +thread_local std::unordered_map LLCoros::mPrefixMap; +thread_local std::unordered_map LLCoros::mNameMap; + // static bool LLCoros::on_main_coro() { - return (!LLCoros::instanceExists() || - LLCoros::getName().empty()); + return (!instanceExists() || get_CoroData().isMain); } // static @@ -76,11 +78,11 @@ bool LLCoros::on_main_thread_main_coro() } // static -LLCoros::CoroData& LLCoros::get_CoroData(const std::string&) +LLCoros::CoroData& LLCoros::get_CoroData() { CoroData* current{ nullptr }; // be careful about attempted accesses in the final throes of app shutdown - if (! wasDeleted()) + if (instanceExists()) { current = instance().mCurrent.get(); } @@ -89,16 +91,26 @@ LLCoros::CoroData& LLCoros::get_CoroData(const std::string&) // canonical values. if (! current) { - static std::atomic which_thread(0); - // Use alternate CoroData constructor. - static thread_local CoroData sMain(which_thread++); // We need not reset() the local_ptr to this instance; we'll simply // find it again every time we discover that current is null. - current = &sMain; + current = &main_CoroData(); } return *current; } +LLCoros::CoroData& LLCoros::get_CoroData(id id) +{ + auto found = CoroData::getInstance(id); + return found? *found : main_CoroData(); +} + +LLCoros::CoroData& LLCoros::main_CoroData() +{ + // tell CoroData we're "main" + static thread_local CoroData sMain(""); + return sMain; +} + //static LLCoros::coro::id LLCoros::get_self() { @@ -108,28 +120,28 @@ LLCoros::coro::id LLCoros::get_self() //static void LLCoros::set_consuming(bool consuming) { - auto& data(get_CoroData("set_consuming()")); + auto& data(get_CoroData()); // DO NOT call this on the main() coroutine. - llassert_always(! data.mName.empty()); + llassert_always(! data.isMain); data.mConsuming = consuming; } //static bool LLCoros::get_consuming() { - return get_CoroData("get_consuming()").mConsuming; + return get_CoroData().mConsuming; } // static void LLCoros::setStatus(const std::string& status) { - get_CoroData("setStatus()").mStatus = status; + get_CoroData().mStatus = status; } // static std::string LLCoros::getStatus() { - return get_CoroData("getStatus()").mStatus; + return get_CoroData().mStatus; } LLCoros::LLCoros(): @@ -186,9 +198,8 @@ void LLCoros::cleanupSingleton() std::string LLCoros::generateDistinctName(const std::string& prefix) const { - static int unique = 0; - - // Allowing empty name would make getName()'s not-found return ambiguous. + // Empty name would trigger CoroData's constructor's special case for the + // main coroutine. if (prefix.empty()) { LL_ERRS("LLCoros") << "LLCoros::launch(): pass non-empty name string" << LL_ENDL; @@ -196,9 +207,11 @@ std::string LLCoros::generateDistinctName(const std::string& prefix) const // If the specified name isn't already in the map, just use that. std::string name(prefix); + // maintain a distinct int suffix for each prefix + int& unique = mPrefixMap[prefix]; - // Until we find an unused name, append a numeric suffix for uniqueness. - while (CoroData::getInstance(name)) + // Until we find an unused name, append int suffix for uniqueness. + while (mNameMap.find(name) != mNameMap.end()) { name = stringize(prefix, unique++); } @@ -207,9 +220,16 @@ std::string LLCoros::generateDistinctName(const std::string& prefix) const bool LLCoros::killreq(const std::string& name) { - auto found = CoroData::getInstance(name); + auto foundName = mNameMap.find(name); + if (foundName == mNameMap.end()) + { + // couldn't find that name in map + return false; + } + auto found = CoroData::getInstance(foundName->second); if (! found) { + // found name, but CoroData with that ID key no longer exists return false; } // Next time the subject coroutine calls checkStop(), make it terminate. @@ -224,14 +244,13 @@ bool LLCoros::killreq(const std::string& name) //static std::string LLCoros::getName() { - return get_CoroData("getName()").mName; + return get_CoroData().getName(); } -//static -std::string LLCoros::logname() +// static +std::string LLCoros::getName(id id) { - auto& data(get_CoroData("logname()")); - return data.mName.empty()? data.getKey() : data.mName; + return get_CoroData(id).getName(); } void LLCoros::saveException(const std::string& name, std::exception_ptr exc) @@ -264,7 +283,7 @@ void LLCoros::printActiveCoroutines(const std::string& when) { LL_INFOS("LLCoros") << "-------------- List of active coroutines ------------"; F64 time = LLTimer::getTotalSeconds(); - for (auto& cd : CoroData::instance_snapshot()) + for (const auto& cd : CoroData::instance_snapshot()) { F64 life_time = time - cd.mCreationTime; LL_CONT << LL_NEWLINE @@ -364,8 +383,8 @@ void LLCoros::checkStop(callable_t cleanup) // do this AFTER the check above, because get_CoroData() depends on the // local_ptr in our instance(). - auto& data(get_CoroData("checkStop()")); - if (data.mName.empty()) + auto& data(get_CoroData()); + if (data.isMain) { // Our Stop exception and its subclasses are intended to stop loitering // coroutines. Don't throw it from the main coroutine. @@ -385,7 +404,7 @@ void LLCoros::checkStop(callable_t cleanup) { // Someone wants to kill this coroutine cleanup(); - LLTHROW(Killed(stringize("coroutine ", data.mName, " killed by ", data.mKilledBy))); + LLTHROW(Killed(stringize("coroutine ", data.getName(), " killed by ", data.mKilledBy))); } } @@ -445,20 +464,44 @@ LLBoundListener LLCoros::getStopListener(const std::string& caller, } LLCoros::CoroData::CoroData(const std::string& name): - LLInstanceTracker(name), + super(boost::this_fiber::get_id()), mName(name), mCreationTime(LLTimer::getTotalSeconds()) { + // we expect the empty string for the main coroutine + if (name.empty()) + { + isMain = true; + if (on_main_thread()) + { + // main coroutine on main thread + mName = "main"; + } + else + { + // main coroutine on some other thread + static std::atomic main_no{ 0 }; + mName = stringize("main", ++main_no); + } + } + // maintain LLCoros::mNameMap + LLCoros::mNameMap.emplace(mName, getKey()); } -LLCoros::CoroData::CoroData(int n): - // This constructor is used for the thread_local instance belonging to the - // default coroutine on each thread. We must give each one a different - // LLInstanceTracker key because LLInstanceTracker's map spans all - // threads, but we want the default coroutine on each thread to have the - // empty string as its visible name because some consumers test for that. - LLInstanceTracker("main" + stringize(n)), - mName(), - mCreationTime(LLTimer::getTotalSeconds()) +LLCoros::CoroData::~CoroData() +{ + // Don't try to erase the static main CoroData from our static + // thread_local mNameMap; that could run into destruction order problems. + if (! isMain) + { + LLCoros::mNameMap.erase(mName); + } +} + +std::string LLCoros::CoroData::getName() const { + if (mStatus.empty()) + return mName; + else + return stringize(mName, " (", mStatus, ")"); } diff --git a/indra/llcommon/llcoros.h b/indra/llcommon/llcoros.h index 0291d7f1d9..662be9dcc9 100644 --- a/indra/llcommon/llcoros.h +++ b/indra/llcommon/llcoros.h @@ -39,39 +39,35 @@ #include #include #include +#include + +namespace llcoro +{ +class scheduler; +} /** - * Registry of named Boost.Coroutine instances - * - * The Boost.Coroutine library supports the general case of a coroutine - * accepting arbitrary parameters and yielding multiple (sets of) results. For - * such use cases, it's natural for the invoking code to retain the coroutine - * instance: the consumer repeatedly calls into the coroutine, perhaps passing - * new parameter values, prompting it to yield its next result. - * - * Our typical coroutine usage is different, though. For us, coroutines - * provide an alternative to the @c Responder pattern. Our typical coroutine - * has @c void return, invoked in fire-and-forget mode: the handler for some - * user gesture launches the coroutine and promptly returns to the main loop. - * The coroutine initiates some action that will take multiple frames (e.g. a - * capability request), waits for its result, processes it and silently steals - * away. + * Registry of named Boost.Fiber instances * - * This usage poses two (related) problems: + * When the viewer first introduced the semi-independent execution agents now + * called fibers, the term "fiber" had not yet become current, and the only + * available libraries used the term "coroutine" instead. Within the viewer we + * continue to use the term "coroutines," though at present they are actually + * boost::fibers::fiber instances. * - * # Who should own the coroutine instance? If it's simply local to the - * handler code that launches it, return from the handler will destroy the - * coroutine object, terminating the coroutine. - * # Once the coroutine terminates, in whatever way, who's responsible for - * cleaning up the coroutine object? + * Coroutines provide an alternative to the @c Responder pattern. Our typical + * coroutine has @c void return, invoked in fire-and-forget mode: the handler + * for some user gesture launches the coroutine and promptly returns to the + * main loop. The coroutine initiates some action that will take multiple + * frames (e.g. a capability request), waits for its result, processes it and + * silently steals away. * * LLCoros is a Singleton collection of currently-active coroutine instances. * Each has a name. You ask LLCoros to launch a new coroutine with a suggested * name prefix; from your prefix it generates a distinct name, registers the * new coroutine and returns the actual name. * - * The name - * can provide diagnostic info: we can look up the name of the + * The name can provide diagnostic info: we can look up the name of the * currently-running coroutine. */ class LL_COMMON_API LLCoros: public LLSingleton @@ -91,12 +87,8 @@ public: // llassert(LLCoros::on_main_thread_main_coro()) static bool on_main_thread_main_coro(); - /// The viewer's use of the term "coroutine" became deeply embedded before - /// the industry term "fiber" emerged to distinguish userland threads from - /// simpler, more transient kinds of coroutines. Semantically they've - /// always been fibers. But at this point in history, we're pretty much - /// stuck with the term "coroutine." typedef boost::fibers::fiber coro; + typedef coro::id id; /// Canonical callable type typedef std::function callable_t; @@ -150,12 +142,15 @@ public: /** * From within a coroutine, look up the (tweaked) name string by which - * this coroutine is registered. Returns the empty string if not found - * (e.g. if the coroutine was launched by hand rather than using - * LLCoros::launch()). + * this coroutine is registered. */ static std::string getName(); + /** + * Given an id, return the name of that coroutine. + */ + static std::string getName(id); + /** * rethrow() is called by the thread's main fiber to propagate an * exception from any coroutine into the main fiber, where it can engage @@ -169,13 +164,6 @@ public: */ void rethrow(); - /** - * This variation returns a name suitable for log messages: the explicit - * name for an explicitly-launched coroutine, or "mainN" for the default - * coroutine on a thread. - */ - static std::string logname(); - /** * For delayed initialization. To be clear, this will only affect * coroutines launched @em after this point. The underlying facility @@ -187,7 +175,7 @@ public: void printActiveCoroutines(const std::string& when=std::string()); /// get the current coro::id for those who really really care - static coro::id get_self(); + static id get_self(); /** * Most coroutines, most of the time, don't "consume" the events for which @@ -236,6 +224,7 @@ public: setStatus(status); } TempStatus(const TempStatus&) = delete; + TempStatus& operator=(const TempStatus&) = delete; ~TempStatus() { setStatus(mOldStatus); @@ -331,10 +320,14 @@ public: using local_ptr = boost::fibers::fiber_specific_ptr; private: + friend class llcoro::scheduler; + std::string generateDistinctName(const std::string& prefix) const; void toplevel(std::string name, callable_t callable); struct CoroData; - static CoroData& get_CoroData(const std::string& caller); + static CoroData& get_CoroData(); + static CoroData& get_CoroData(id); + static CoroData& main_CoroData(); void saveException(const std::string& name, std::exception_ptr exc); LLTempBoundListener mConn; @@ -355,13 +348,18 @@ private: S32 mStackSize; // coroutine-local storage, as it were: one per coro we track - struct CoroData: public LLInstanceTracker + struct CoroData: public LLInstanceTracker { + using super = LLInstanceTracker; + CoroData(const std::string& name); - CoroData(int n); + ~CoroData(); + std::string getName() const; + + bool isMain{ false }; // tweaked name of the current coroutine - const std::string mName; + std::string mName; // set_consuming() state -- don't consume events unless specifically directed bool mConsuming{ false }; // killed by which coroutine @@ -375,14 +373,11 @@ private: // because it's a member of an LLSingleton, and we rely on it being // cleaned up in proper dependency order. local_ptr mCurrent; -}; -namespace llcoro -{ - -inline -std::string logname() { return LLCoros::logname(); } - -} // llcoro + // ensure name uniqueness + static thread_local std::unordered_map mPrefixMap; + // lookup by name + static thread_local std::unordered_map mNameMap; +}; #endif /* ! defined(LL_LLCOROS_H) */ diff --git a/indra/llcommon/workqueue.cpp b/indra/llcommon/workqueue.cpp index 9138c862f9..8b7b97a1f9 100644 --- a/indra/llcommon/workqueue.cpp +++ b/indra/llcommon/workqueue.cpp @@ -127,9 +127,7 @@ void LL::WorkQueueBase::error(const std::string& msg) void LL::WorkQueueBase::checkCoroutine(const std::string& method) { - // By convention, the default coroutine on each thread has an empty name - // string. See also LLCoros::logname(). - if (LLCoros::getName().empty()) + if (LLCoros::on_main_coro()) { LLTHROW(Error("Do not call " + method + " from a thread's default coroutine")); } diff --git a/indra/test/sync.h b/indra/test/sync.h index 82eef1e5f5..abeb4e17a8 100644 --- a/indra/test/sync.h +++ b/indra/test/sync.h @@ -69,7 +69,7 @@ public: // misleading, as it will be emitted after waiting threads have // already awakened. But emitting the log message within the lock // would seem to hold the lock longer than we really ought. - LL_DEBUGS() << llcoro::logname() << " bump(" << n << ") -> " << updated << LL_ENDL; + LL_DEBUGS() << LLCoros::getName() << " bump(" << n << ") -> " << updated << LL_ENDL; } /** @@ -82,7 +82,7 @@ public: */ void set(int n) { - LL_DEBUGS() << llcoro::logname() << " set(" << n << ")" << LL_ENDL; + LL_DEBUGS() << LLCoros::getName() << " set(" << n << ")" << LL_ENDL; mCond.set_all(n); } @@ -101,7 +101,7 @@ public: private: void yield_until(const char* func, int arg, int until) { - std::string name(llcoro::logname()); + std::string name(LLCoros::getName()); LL_DEBUGS() << name << " yield_until(" << until << ") suspending" << LL_ENDL; if (! mCond.wait_for_equal(mTimeout, until)) { -- cgit v1.2.3 From 8e737a5d45157426cb84927170a78baea2106813 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Tue, 22 Oct 2024 09:52:36 -0400 Subject: LLCoprocedurePool::coprocedureInvokerCoro() sets status to task name LLCoprocedurePool coroutines are generic; its coroutine name alone doesn't tell us much. We can only know what one is doing by its current task name. Use LLCoros::setStatus() to associate the task name with the coroutine. --- indra/llmessage/llcoproceduremanager.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/indra/llmessage/llcoproceduremanager.cpp b/indra/llmessage/llcoproceduremanager.cpp index 13972ad399..5539ca7b86 100644 --- a/indra/llmessage/llcoproceduremanager.cpp +++ b/indra/llmessage/llcoproceduremanager.cpp @@ -403,6 +403,7 @@ void LLCoprocedurePool::coprocedureInvokerCoro( CoprocQueuePtr pendingCoprocs, LLCoreHttpUtil::HttpCoroutineAdapter::ptr_t httpAdapter) { + std::string prevtask; for (;;) { // It is VERY IMPORTANT that we instantiate a new ptr_t just before @@ -424,10 +425,25 @@ void LLCoprocedurePool::coprocedureInvokerCoro( // destroyed during pop_wait_for(). QueuedCoproc::ptr_t coproc; boost::fibers::channel_op_status status; + // Each time control reaches our custom coroutine scheduler, we check + // how long the previous coroutine ran before yielding, and report + // coroutines longer than a certain cutoff. But these coprocedure pool + // coroutines are generic; the only way we know what work they're + // doing is the task 'status' set by LLCoros::setStatus(). But what if + // the coroutine runs the task to completion and returns to waiting? + // It does no good to report that "waiting" ran long. So each time we + // enter "waiting" status, also report the *previous* task name. + std::string waiting = "waiting", newstatus; + if (prevtask.empty()) { - LLCoros::TempStatus st("waiting for work for 10s"); - status = pendingCoprocs->pop_wait_for(coproc, std::chrono::seconds(10)); + newstatus = waiting; } + else + { + newstatus = stringize("done ", prevtask, "; ", waiting); + } + LLCoros::setStatus(newstatus); + status = pendingCoprocs->pop_wait_for(coproc, std::chrono::seconds(10)); if (status == boost::fibers::channel_op_status::closed) { break; @@ -436,6 +452,7 @@ void LLCoprocedurePool::coprocedureInvokerCoro( if(status == boost::fibers::channel_op_status::timeout) { LL_DEBUGS_ONCE("CoProcMgr") << "pool '" << mPoolName << "' waiting." << LL_ENDL; + prevtask.clear(); continue; } // we actually popped an item @@ -446,6 +463,9 @@ void LLCoprocedurePool::coprocedureInvokerCoro( try { + // set "status" of pool coroutine to the name of the coproc task + prevtask = coproc->mName; + LLCoros::setStatus(prevtask); coproc->mProc(httpAdapter, coproc->mId); } catch (const LLCoros::Stop &e) -- cgit v1.2.3 From b0645835595f3517223329ba62f46277d3e3a9dd Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Tue, 22 Oct 2024 11:34:46 -0400 Subject: Make llcoro::scheduler log coros that run too long between yields. Introduce LLCoros::CoroData::mHistogram, a map of cutoff times (bucket breakpoints) with counts of occurrences. The idea is that mHistogram counts how many times the real time taken by a particular coroutine resumption falls into one of those buckets. Initialize the map with guessed buckets; these are set in llcoros.cpp so they can be changed without requiring extensive rebuilds. scheduler::pick_next() now records the timestamp and fiber context just before the fiber manager resumes the next coroutine. If the next pick_next() call reveals that the previous resumption took longer than the minimum bucket breakpoint, it increments the appropriate bucket counter and logs the instance. LLCoros::toplevel() reports nonzero mHistogram entries on coroutine termination. --- indra/llcommon/coro_scheduler.cpp | 84 ++++++++++++++++++++++++++++++--------- indra/llcommon/coro_scheduler.h | 15 ++++--- indra/llcommon/llcoros.cpp | 37 ++++++++++++++++- indra/llcommon/llcoros.h | 8 ++++ 4 files changed, 119 insertions(+), 25 deletions(-) diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp index 2d8b6e1a97..93efac7c7e 100644 --- a/indra/llcommon/coro_scheduler.cpp +++ b/indra/llcommon/coro_scheduler.cpp @@ -20,6 +20,7 @@ #include // other Linden headers #include "llcallbacklist.h" +#include "llcoros.h" #include "lldate.h" #include "llerror.h" @@ -56,17 +57,56 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept boost::fibers::context* scheduler::pick_next() noexcept { + auto now = LLDate::now().secondsSinceEpoch(); // count calls to pick_next() ++mSwitches; // pick_next() is called when the previous fiber has suspended, and we // need to pick another. Did the previous pick_next() call pick the main - // fiber? If so, it's the main fiber that just suspended. - auto now = LLDate::now().secondsSinceEpoch(); - if (mMainRunning) + // fiber? (Or is this the first pick_next() call?) If so, it's the main + // fiber that just suspended. + if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID) { - mMainRunning = false; mMainLast = now; } + else + { + // How long did we spend in the fiber that just suspended? + // Don't bother with long runs of the main fiber, since (a) it happens + // pretty often and (b) it's moderately likely that we've reached here + // from the canonical yield at the top of mainloop, and what we'd want + // to know about is whatever the main fiber was doing in the + // *previous* iteration of mainloop. + F64 elapsed{ now - mResumeTime }; + LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) }; + // Find iterator to the first mHistogram key greater than elapsed. + auto past = data.mHistogram.upper_bound(elapsed); + // If the smallest key (mHistogram.begin()->first) is greater than + // elapsed, then we need not bother with this timeslice. + if (past != data.mHistogram.begin()) + { + // Here elapsed was greater than at least one key. Back off to the + // previous entry and increment that count. If it's end(), backing + // off gets us the last entry -- assuming mHistogram isn't empty. + llassert(! data.mHistogram.empty()); + ++(--past)->second; + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // make sure the queue exists + if (queue) + { + // If it proves difficult to track down *why* the fiber spent so + // much time, consider also binding and reporting + // boost::stacktrace::stacktrace(). + queue->post( + [name=std::move(data.getName()), elapsed] + { + LL_WARNS_ONCE("LLCoros.scheduler") + << "Coroutine " << name << " ran for " + << elapsed << " seconds" << LL_ENDL; + + }); + } + } + } boost::fibers::context* next; @@ -96,17 +136,9 @@ boost::fibers::context* scheduler::pick_next() noexcept // passage could be skipped. // Record this event for logging, but push it off to a thread pool to - // perform that work. Presumably std::weak_ptr::lock() is cheaper than - // WorkQueue::getInstance(). - LL::WorkQueue::ptr_t queue{ mQueue.lock() }; - // We probably started before the relevant WorkQueue was created. - if (! queue) - { - // Try again to locate the specified WorkQueue. - queue = LL::WorkQueue::getInstance(qname); - mQueue = queue; - } - // Both the lock() call and the getInstance() call might have failed. + // perform that work. + LL::WorkQueue::ptr_t queue{ getWorkQueue() }; + // The work queue we're looking for might not exist right now. if (queue) { // Bind values. Do NOT bind 'this' to avoid cross-thread access! @@ -116,7 +148,6 @@ boost::fibers::context* scheduler::pick_next() noexcept // so we have no access. queue->post( [switches=mSwitches, start=mStart, elapsed, now] - () { U32 runtime(U32(now) - U32(start)); U32 minutes(runtime / 60u); @@ -150,12 +181,29 @@ boost::fibers::context* scheduler::pick_next() noexcept { // we're about to resume the main fiber: it's no longer "ready" mMainCtx = nullptr; - // instead, it's "running" - mMainRunning = true; } + mPrevCtx = next; + // remember when we resumed this fiber so our next call can measure how + // long the previous resumption was + mResumeTime = LLDate::now().secondsSinceEpoch(); return next; } +LL::WorkQueue::ptr_t scheduler::getWorkQueue() +{ + // Cache a weak_ptr to our target work queue, presuming that + // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance(). + LL::WorkQueue::ptr_t queue{ mQueue.lock() }; + // We probably started before the relevant WorkQueue was created. + if (! queue) + { + // Try again to locate the specified WorkQueue. + queue = LL::WorkQueue::getInstance(qname); + mQueue = queue; + } + return queue; +} + void scheduler::use() { boost::fibers::use_scheduling_algorithm(); diff --git a/indra/llcommon/coro_scheduler.h b/indra/llcommon/coro_scheduler.h index eee2d746b5..7af90685dc 100644 --- a/indra/llcommon/coro_scheduler.h +++ b/indra/llcommon/coro_scheduler.h @@ -47,17 +47,20 @@ public: static void use(); private: - // This is the fiber::id of the main fiber. We use this to discover - // whether the fiber passed to awakened() is in fact the main fiber. + LL::WorkQueue::ptr_t getWorkQueue(); + + // This is the fiber::id of the main fiber. boost::fibers::fiber::id mMainID; - // This context* is nullptr until awakened() notices that the main fiber - // has become ready, at which point it contains the main fiber's context*. + // This context* is nullptr while the main fiber is running or suspended, + // but is set to the main fiber's context each time the main fiber is ready. boost::fibers::context* mMainCtx{}; - // Set when pick_next() returns the main fiber. - bool mMainRunning{ false }; + // Remember the context returned by the previous pick_next() call. + boost::fibers::context* mPrevCtx{}; // If it's been at least this long since the last time the main fiber got // control, jump it to the head of the queue. F64 mTimeslice{ DEFAULT_TIMESLICE }; + // Time when we resumed the most recently running fiber + F64 mResumeTime{ 0 }; // Timestamp as of the last time we suspended the main fiber. F64 mMainLast{ 0 }; // Timestamp of start time diff --git a/indra/llcommon/llcoros.cpp b/indra/llcommon/llcoros.cpp index 0a3ca99b05..5a3cbd2ef1 100644 --- a/indra/llcommon/llcoros.cpp +++ b/indra/llcommon/llcoros.cpp @@ -57,6 +57,7 @@ #include "llsdutil.h" #include "lltimer.h" #include "stringize.h" +#include "scope_exit.h" #if LL_WINDOWS #include @@ -342,6 +343,33 @@ void LLCoros::toplevel(std::string name, callable_t callable) // run the code the caller actually wants in the coroutine try { + LL::scope_exit report{ + [&corodata] + { + bool allzero = true; + for (const auto& [threshold, occurs] : corodata.mHistogram) + { + if (occurs) + { + allzero = false; + break; + } + } + if (! allzero) + { + LL_WARNS("LLCoros") << "coroutine " << corodata.mName; + const char* sep = " exceeded "; + for (const auto& [threshold, occurs] : corodata.mHistogram) + { + if (occurs) + { + LL_CONT << sep << threshold << " " << occurs << " times"; + sep = ", "; + } + } + LL_ENDL; + } + }}; LL::seh::catcher(callable); } catch (const Stop& exc) @@ -466,7 +494,14 @@ LLBoundListener LLCoros::getStopListener(const std::string& caller, LLCoros::CoroData::CoroData(const std::string& name): super(boost::this_fiber::get_id()), mName(name), - mCreationTime(LLTimer::getTotalSeconds()) + mCreationTime(LLTimer::getTotalSeconds()), + // Preset threshold times in mHistogram + mHistogram{ + {0.004, 0}, + {0.040, 0}, + {0.400, 0}, + {1.000, 0} + } { // we expect the empty string for the main coroutine if (name.empty()) diff --git a/indra/llcommon/llcoros.h b/indra/llcommon/llcoros.h index 662be9dcc9..1edcb7e387 100644 --- a/indra/llcommon/llcoros.h +++ b/indra/llcommon/llcoros.h @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,13 @@ private: // setStatus() state std::string mStatus; F64 mCreationTime; // since epoch + // Histogram of how many times this coroutine's timeslice exceeds + // certain thresholds. mHistogram is pre-populated with those + // thresholds as keys. If k0 is one threshold key and k1 is the next, + // mHistogram[k0] is the number of times a coroutine timeslice tn ran + // (k0 <= tn < k1). A timeslice less than mHistogram.begin()->first is + // fine; we don't need to record those. + std::map mHistogram; }; // Identify the current coroutine's CoroData. This local_ptr isn't static -- cgit v1.2.3 From 7f094aa20418b3c46a0c2c1d0730b0a14c058a6f Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Tue, 22 Oct 2024 12:00:26 -0400 Subject: Eliminate meaningless blank line --- indra/llcommon/coro_scheduler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp index 93efac7c7e..7b900a13f6 100644 --- a/indra/llcommon/coro_scheduler.cpp +++ b/indra/llcommon/coro_scheduler.cpp @@ -102,7 +102,6 @@ boost::fibers::context* scheduler::pick_next() noexcept LL_WARNS_ONCE("LLCoros.scheduler") << "Coroutine " << name << " ran for " << elapsed << " seconds" << LL_ENDL; - }); } } -- cgit v1.2.3 From 77f9ab96779c55a20a5ccd8b9e906227b3e8e5c8 Mon Sep 17 00:00:00 2001 From: Nat Goodspeed Date: Tue, 22 Oct 2024 12:04:59 -0400 Subject: Remove move --- indra/llcommon/coro_scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp index 7b900a13f6..b6117fa6a1 100644 --- a/indra/llcommon/coro_scheduler.cpp +++ b/indra/llcommon/coro_scheduler.cpp @@ -97,7 +97,7 @@ boost::fibers::context* scheduler::pick_next() noexcept // much time, consider also binding and reporting // boost::stacktrace::stacktrace(). queue->post( - [name=std::move(data.getName()), elapsed] + [name=data.getName(), elapsed] { LL_WARNS_ONCE("LLCoros.scheduler") << "Coroutine " << name << " ran for " -- cgit v1.2.3