indra/llcommon/coro_scheduler.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211

/**
 * @file   coro_scheduler.cpp
 * @author Nat Goodspeed
 * @date   2024-08-05
 * @brief  Implementation for llcoro::scheduler.
 *
 * $LicenseInfo:firstyear=2024&license=viewerlgpl$
 * Copyright (c) 2024, Linden Research, Inc.
 * $/LicenseInfo$
 */

// Precompiled header
#include "linden_common.h"
// associated header
#include "coro_scheduler.h"
// STL headers
// std headers
#include <iomanip>
// external library headers
#include <boost/fiber/operations.hpp>
// other Linden headers
#include "llcallbacklist.h"
#include "llcoros.h"
#include "lldate.h"
#include "llerror.h"

namespace llcoro
{

// Out-of-class definition of the scheduler's default timeslice; it simply
// mirrors the value published by LL::Timers.
const F64 scheduler::DEFAULT_TIMESLICE = LL::Timers::DEFAULT_TIMESLICE;

// Name of the WorkQueue to which this file posts its deferred logging work.
const std::string qname{ "General" };

scheduler::scheduler():
    // Since use_scheduling_algorithm() must be called before any other
    // Boost.Fibers operations, we can assume that the calling fiber is in
    // fact the main fiber.
    mMainID(boost::this_fiber::get_id()),
    mStart(LLDate::now().secondsSinceEpoch()),
    mQueue(LL::WorkQueue::getInstance(qname))
{}

// Notification that fiber 'ctx' has become ready to run. The main fiber's
// context pointer is remembered in mMainCtx; every fiber, main or not, is
// then handed to the base-class implementation.
void scheduler::awakened( boost::fibers::context* ctx) noexcept
{
    const bool main_is_ready = (mMainID == ctx->get_id());
    if (main_is_ready)
    {
        // The main fiber just came ready: stash its context pointer. It
        // should not already be recorded as ready.
        llassert(! mMainCtx);
        mMainCtx = ctx;
    }
    // The main fiber is queued by round_robin::awakened() like any other.
    // Provided the other fibers yield promptly, ordinary round_robin
    // processing will reach the main fiber without our intervention.
    super::awakened(ctx);
}

/**
 * Choose the fiber to resume next.
 *
 * Normally delegates to super::pick_next(). If the main fiber is ready
 * (mMainCtx is set) and more than mTimeslice seconds have passed since
 * mMainLast, the main fiber is unlinked from the ready list and returned
 * instead, jumping the queue.
 *
 * Bookkeeping performed on every call:
 * - increments mSwitches;
 * - if the previously picked fiber was the main fiber (or there was none),
 *   sets mMainLast to the current time; otherwise measures how long the
 *   previous fiber ran, updates that coroutine's mHistogram and may post a
 *   warning to the work queue;
 * - clears mMainCtx when the main fiber is the one being returned;
 * - stores the returned context in mPrevCtx and the current time in
 *   mResumeTime.
 *
 * @return the context to resume; this is whatever super::pick_next()
 *         returned in the non-override case, which the code below treats as
 *         possibly null.
 *
 * NOTE(review): this function is declared noexcept, yet it calls
 * LLCoros::get_CoroData() and WorkQueue::post(); if either can throw on this
 * path the process would terminate -- confirm against their declarations.
 */
boost::fibers::context* scheduler::pick_next() noexcept
{
    auto now = LLDate::now().secondsSinceEpoch();
    // count calls to pick_next()
    ++mSwitches;
    // pick_next() is called when the previous fiber has suspended, and we
    // need to pick another. Did the previous pick_next() call pick the main
    // fiber? (Or is this the first pick_next() call?) If so, it's the main
    // fiber that just suspended.
    if ((! mPrevCtx) || mPrevCtx->get_id() == mMainID)
    {
        mMainLast = now;
    }
    else
    {
        // How long did we spend in the fiber that just suspended?
        // Don't bother with long runs of the main fiber, since (a) it happens
        // pretty often and (b) it's moderately likely that we've reached here
        // from the canonical yield at the top of mainloop, and what we'd want
        // to know about is whatever the main fiber was doing in the
        // *previous* iteration of mainloop.
        F64 elapsed{ now - mResumeTime };
        LLCoros::CoroData& data{ LLCoros::get_CoroData(mPrevCtx->get_id()) };
        // Find iterator to the first mHistogram key greater than elapsed.
        auto past = data.mHistogram.upper_bound(elapsed);
        // If the smallest key (mHistogram.begin()->first) is greater than
        // elapsed, then we need not bother with this timeslice.
        if (past != data.mHistogram.begin())
        {
            // Here elapsed was greater than at least one key. Back off to the
            // previous entry and increment that count. If it's end(), backing
            // off gets us the last entry -- assuming mHistogram isn't empty.
            llassert(! data.mHistogram.empty());
            ++(--past)->second;
            LL::WorkQueue::ptr_t queue{ getWorkQueue() };
            // make sure the queue exists
            if (queue)
            {
                // If it proves difficult to track down *why* the fiber spent so
                // much time, consider also binding and reporting
                // boost::stacktrace::stacktrace().
                // The lambda captures the coroutine name and elapsed time by
                // value, so it does not touch 'data' or 'this' when it runs.
                queue->post(
                    [name=data.getName(), elapsed]
                    {
                        LL_WARNS_ONCE("LLCoros.scheduler")
                            << "Coroutine " << name << " ran for "
                            << elapsed << " seconds" << LL_ENDL;
                    });
            }
        }
    }

    // Assigned on both branches of the if/else below.
    boost::fibers::context* next;

    // When the main fiber is ready, and it's been more than mTimeslice since
    // the main fiber last ran, it's time to intervene.
    // Note: this 'elapsed' is time since the main fiber last ran (mMainLast),
    // distinct from the per-fiber 'elapsed' measured in the else-branch above.
    F64 elapsed(now - mMainLast);
    if (mMainCtx && elapsed > mTimeslice)
    {
        // We claim that the main fiber is not only stored in mMainCtx, but is
        // also queued (somewhere) in our ready list.
        llassert(mMainCtx->ready_is_linked());
        // The usefulness of a doubly-linked list is that, given only a
        // pointer to an item, we can unlink it.
        mMainCtx->ready_unlink();
        // Instead of delegating to round_robin::pick_next() to pop the head
        // of the queue, override by returning mMainCtx.
        next = mMainCtx;

        /*------------------------- logging stuff --------------------------*/
        // Unless this log tag is enabled, don't even bother posting.
        // NOTE(review): the LL_DEBUGS(...) here and the bare LL_ENDL further
        // down are used as a bracketing pair around the posting code;
        // presumably the macros expand to a conditional block -- confirm
        // against llerror.h before restructuring.
        LL_DEBUGS("LLCoros.scheduler") << " ";
        // This feature is inherently hard to verify. The logging in the
        // lambda below seems useful, but also seems like a lot of overhead
        // for a coroutine context switch. Try posting the logging lambda to a
        // ThreadPool to offload that overhead. However, if this is still
        // taking an unreasonable amount of context-switch time, this whole
        // passage could be skipped.

        // Record this event for logging, but push it off to a thread pool to
        // perform that work.
        LL::WorkQueue::ptr_t queue{ getWorkQueue() };
        // The work queue we're looking for might not exist right now.
        if (queue)
        {
            // Bind values. Do NOT bind 'this' to avoid cross-thread access!
            // It would be interesting to know from what queue position we
            // unlinked the main fiber, out of how many in the ready list.
            // Unfortunately round_robin::rqueue_ is private, not protected,
            // so we have no access.
            queue->post(
                [switches=mSwitches, start=mStart, elapsed, now]
                {
                    // Whole seconds between scheduler construction (start)
                    // and this pick_next() call (now), shown as minutes:seconds.
                    U32 runtime(U32(now) - U32(start));
                    U32 minutes(runtime / 60u);
                    U32 seconds(runtime % 60u);
                    // use stringize to avoid lasting side effects to the
                    // logging ostream
                    LL_DEBUGS("LLCoros.scheduler")
                        << "At time "
                        << stringize(minutes, ":", std::setw(2), std::setfill('0'), seconds)
                        << " (" << switches << " switches), coroutines took "
                        << stringize(std::setprecision(4), elapsed)
                        << " sec, main coroutine jumped queue"
                        << LL_ENDL;
                });
        }
        // Closes the LL_DEBUGS("LLCoros.scheduler") opened above.
        LL_ENDL;
        /*----------------------- end logging stuff ------------------------*/
    }
    else
    {
        // Either the main fiber isn't yet ready, or it hasn't yet been
        // mTimeslice seconds since the last time the main fiber ran. Business
        // as usual.
        next = super::pick_next();
    }

    // super::pick_next() could also have returned the main fiber, which is
    // why this is a separate test instead of being folded into the override
    // case above.
    if (next && next->get_id() == mMainID)
    {
        // we're about to resume the main fiber: it's no longer "ready"
        mMainCtx = nullptr;
    }
    // Remember what we picked so the next call can tell whether it was the
    // main fiber that just suspended.
    mPrevCtx = next;
    // remember when we resumed this fiber so our next call can measure how
    // long the previous resumption was
    // (A fresh timestamp is taken here rather than reusing 'now'.)
    mResumeTime = LLDate::now().secondsSinceEpoch();
    return next;
}

// Return a strong reference to the WorkQueue named by qname, or an empty
// pointer if no such queue can be found. The result of a fresh lookup is
// stored in mQueue so subsequent calls can skip the lookup.
LL::WorkQueue::ptr_t scheduler::getWorkQueue()
{
    // Fast path: the weak reference we hold still resolves. We keep a
    // weak_ptr on the presumption that std::weak_ptr::lock() costs less than
    // WorkQueue::getInstance().
    if (LL::WorkQueue::ptr_t cached = mQueue.lock())
    {
        return cached;
    }

    // Slow path: most likely this scheduler started up before the WorkQueue
    // in question was created, so look it up by name once more and remember
    // whatever we get.
    LL::WorkQueue::ptr_t found = LL::WorkQueue::getInstance(qname);
    mQueue = found;
    return found;
}

void scheduler::use()
{
    boost::fibers::use_scheduling_algorithm<scheduler>();
}

} // namespace llcoro