From b0645835595f3517223329ba62f46277d3e3a9dd Mon Sep 17 00:00:00 2001
From: Nat Goodspeed <nat@lindenlab.com>
Date: Tue, 22 Oct 2024 11:34:46 -0400
Subject: Make llcoro::scheduler log coros that run too long between yields.

Introduce LLCoros::CoroData::mHistogram, a map of cutoff times (bucket
breakpoints) with counts of occurrences. The idea is that mHistogram counts
how many times the real time taken by a particular coroutine resumption falls
into one of those buckets. Initialize the map with guessed buckets; these are
set in llcoros.cpp so they can be changed without requiring extensive rebuilds.

scheduler::pick_next() now records the timestamp and fiber context just before
the fiber manager resumes the next coroutine. If the next pick_next() call
reveals that the previous resumption took longer than the minimum bucket
breakpoint, it increments the appropriate bucket counter and logs the instance.

LLCoros::toplevel() reports nonzero mHistogram entries on coroutine
termination.
---
 indra/llcommon/coro_scheduler.cpp | 84 ++++++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 18 deletions(-)

(limited to 'indra/llcommon/coro_scheduler.cpp')

diff --git a/indra/llcommon/coro_scheduler.cpp b/indra/llcommon/coro_scheduler.cpp
index 2d8b6e1a97..93efac7c7e 100644
--- a/indra/llcommon/coro_scheduler.cpp
+++ b/indra/llcommon/coro_scheduler.cpp
@@ -20,6 +20,7 @@
 #include <boost/fiber/operations.hpp>
 // other Linden headers
 #include "llcallbacklist.h"
+#include "llcoros.h"
 #include "lldate.h"
 #include "llerror.h"
 
@@ -56,17 +57,56 @@ void scheduler::awakened( boost::fibers::context* ctx) noexcept
 
 boost::fibers::context* scheduler::pick_next() noexcept
 {
+    auto now = LLDate::now().secondsSinceEpoch();
     // count calls to pick_next()
     ++mSwitches;
     // pick_next() is called when the previous fiber has suspended, and we
     // need to pick another. Did the previous pick_next() call pick the main
-    // fiber? If so, it's the main fiber that just suspended.
-    auto now = LLDate::now().secondsSinceEpoch();
-    if (mMainRunning)
+    // fiber? (Or is this the first pick_next() call?) If so, it's the main
+    // fiber that just suspended.
+    if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID)
     {
-        mMainRunning = false;
         mMainLast = now;
     }
+    else
+    {
+        // How long did we spend in the fiber that just suspended?
+        // Don't bother with long runs of the main fiber, since (a) it happens
+        // pretty often and (b) it's moderately likely that we've reached here
+        // from the canonical yield at the top of mainloop, and what we'd want
+        // to know about is whatever the main fiber was doing in the
+        // *previous* iteration of mainloop.
+        F64 elapsed{ now - mResumeTime };
+        LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) };
+        // Find iterator to the first mHistogram key greater than elapsed.
+        auto past = data.mHistogram.upper_bound(elapsed);
+        // If the smallest key (mHistogram.begin()->first) is greater than
+        // elapsed, then we need not bother with this timeslice.
+        if (past != data.mHistogram.begin())
+        {
+            // Here elapsed was greater than at least one key. Back off to the
+            // previous entry and increment that count. If it's end(), backing
+            // off gets us the last entry -- assuming mHistogram isn't empty.
+            llassert(! data.mHistogram.empty());
+            ++(--past)->second;
+            LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+            // make sure the queue exists
+            if (queue)
+            {
+                // If it proves difficult to track down *why* the fiber spent so
+                // much time, consider also binding and reporting
+                // boost::stacktrace::stacktrace().
+                queue->post(
+                    [name=std::move(data.getName()), elapsed]
+                    {
+                        LL_WARNS_ONCE("LLCoros.scheduler")
+                            << "Coroutine " << name << " ran for "
+                            << elapsed << " seconds" << LL_ENDL;
+                        
+                    });
+            }
+        }
+    }
 
     boost::fibers::context* next;
 
@@ -96,17 +136,9 @@ boost::fibers::context* scheduler::pick_next() noexcept
         // passage could be skipped.
 
         // Record this event for logging, but push it off to a thread pool to
-        // perform that work. Presumably std::weak_ptr::lock() is cheaper than
-        // WorkQueue::getInstance().
-        LL::WorkQueue::ptr_t queue{ mQueue.lock() };
-        // We probably started before the relevant WorkQueue was created.
-        if (! queue)
-        {
-            // Try again to locate the specified WorkQueue.
-            queue = LL::WorkQueue::getInstance(qname);
-            mQueue = queue;
-        }
-        // Both the lock() call and the getInstance() call might have failed.
+        // perform that work.
+        LL::WorkQueue::ptr_t queue{ getWorkQueue() };
+        // The work queue we're looking for might not exist right now.
         if (queue)
         {
             // Bind values. Do NOT bind 'this' to avoid cross-thread access!
@@ -116,7 +148,6 @@ boost::fibers::context* scheduler::pick_next() noexcept
             // so we have no access.
             queue->post(
                 [switches=mSwitches, start=mStart, elapsed, now]
-                ()
                 {
                     U32 runtime(U32(now) - U32(start));
                     U32 minutes(runtime / 60u);
@@ -150,12 +181,29 @@ boost::fibers::context* scheduler::pick_next() noexcept
     {
         // we're about to resume the main fiber: it's no longer "ready"
         mMainCtx = nullptr;
-        // instead, it's "running"
-        mMainRunning = true;
     }
+    mPrevCtx = next;
+    // remember when we resumed this fiber so our next call can measure how
+    // long the previous resumption was
+    mResumeTime = LLDate::now().secondsSinceEpoch();
     return next;
 }
 
+LL::WorkQueue::ptr_t scheduler::getWorkQueue()
+{
+    // Cache a weak_ptr to our target work queue, presuming that
+    // std::weak_ptr::lock() is cheaper than WorkQueue::getInstance().
+    LL::WorkQueue::ptr_t queue{ mQueue.lock() };
+    // We probably started before the relevant WorkQueue was created.
+    if (! queue)
+    {
+        // Try again to locate the specified WorkQueue.
+        queue = LL::WorkQueue::getInstance(qname);
+        mQueue = queue;
+    }
+    return queue;
+}
+
 void scheduler::use()
 {
     boost::fibers::use_scheduling_algorithm<scheduler>();
-- 
cgit v1.2.3