#4685 EventQueueGet shouldn't use llcorehttp's backoff-retry logic

- Event poll already has its own backoff logic with 10 retries; layering llcorehttp's retries on top of it makes no sense.
- Better logging coverage
- Better in-code documentation
- Add timing-based error detection
author: Andrey Kleshchev <117672381+akleshchev@users.noreply.github.com> 2025-09-17 18:24:35 +0300
committer: GitHub <noreply@github.com> 2025-09-17 18:24:35 +0300
commit: 42b09b236bae797303af94fc003466ca649259a2 (patch)
tree: 6c7195f4aebe5e038a7835de57234150c00c8931
parent: e33b5c5e6c8db1759e34db43ea9defc5455ab6c7 (diff)
2 files changed, 70 insertions, 19 deletions
diff --git a/indra/newview/lleventpoll.cpp b/indra/newview/lleventpoll.cpp
index c6fea1ba82..86c58a3497 100644
--- a/indra/newview/lleventpoll.cpp
+++ b/indra/newview/lleventpoll.cpp
@@ -54,13 +54,6 @@ namespace Details
         void stop();
 
     private:
-        // We will wait RETRY_SECONDS + (errorCount * RETRY_SECONDS_INC) before retrying after an error.
-        // This means we attempt to recover relatively quickly but back off giving more time to recover
-        // until we finally give up after MAX_EVENT_POLL_HTTP_ERRORS attempts.
-        static const F32                EVENT_POLL_ERROR_RETRY_SECONDS;
-        static const F32                EVENT_POLL_ERROR_RETRY_SECONDS_INC;
-        static const S32                MAX_EVENT_POLL_HTTP_ERRORS;
-
         void                            eventPollCoro(std::string url);
 
         void                            handleMessage(const LLSD &content);
@@ -76,9 +69,13 @@ namespace Details
     };
 
 
-    const F32 LLEventPollImpl::EVENT_POLL_ERROR_RETRY_SECONDS = 15.f; // ~ half of a normal timeout.
-    const F32 LLEventPollImpl::EVENT_POLL_ERROR_RETRY_SECONDS_INC = 5.f; // ~ half of a normal timeout.
-    const S32 LLEventPollImpl::MAX_EVENT_POLL_HTTP_ERRORS = 10; // ~5 minutes, by the above rules.
+    // We will wait RETRY_SECONDS + (errorCount * RETRY_SECONDS_INC) before retrying after an error.
+    // This means we attempt to recover relatively quickly but back off giving more time to recover
+    // until we finally give up after MAX_EVENT_POLL_HTTP_ERRORS attempts.
+    constexpr F32 EVENT_POLL_ERROR_RETRY_SECONDS = 15.f; // ~ half of a normal timeout.
+    constexpr F32 EVENT_POLL_ERROR_RETRY_SECONDS_INC = 5.f; // Extra delay added to the retry wait per counted error.
+    constexpr S32 MAX_EVENT_POLL_HTTP_ERRORS = 10; // ~5 minutes, by the above rules.
+    constexpr F64 MIN_SECONDS_PASSED = 10.0; // Minimum time we expect the server to hold the request.
 
     int LLEventPollImpl::sNextCounter = 1;
 
@@ -151,11 +148,17 @@ namespace Details
         LLSD acknowledge;
         int errorCount = 0;
         int counter = mCounter; // saved on the stack for logging.
+        LLTimer message_time;
 
         LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> entering coroutine." << LL_ENDL;
 
         mAdapter = httpAdapter;
 
+        // This is a loop with its own waitToRetry implementation,
+        // so disable retries.
+        LLCore::HttpOptions::ptr_t httpOpts(new LLCore::HttpOptions);
+        httpOpts->setRetries(0);
+
         LL::WorkQueue::ptr_t main_queue = nullptr;
 
         // HACK -- grab the mainloop workqueue to move execution of the handler
@@ -172,11 +175,13 @@ namespace Details
             request["ack"] = acknowledge;
             request["done"] = mDone;
 
+            message_time.reset();
+
 //          LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> request = "
 //              << LLSDXMLStreamer(request) << LL_ENDL;
 
             LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> posting and yielding." << LL_ENDL;
-            LLSD result = httpAdapter->postAndSuspend(mHttpRequest, url, request);
+            LLSD result = httpAdapter->postAndSuspend(mHttpRequest, url, request, httpOpts);
 
 //          LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> result = "
 //              << LLSDXMLStreamer(result) << LL_ENDL;
@@ -194,11 +199,30 @@ namespace Details
 
             if (!status)
             {
-                if (status == LLCore::HttpStatus(LLCore::HttpStatus::EXT_CURL_EASY, CURLE_OPERATION_TIMEDOUT))
-                {   // A standard timeout response we get this when there are no events.
-                    LL_DEBUGS("LLEventPollImpl") << "All is very quiet on target server. It may have gone idle?" << LL_ENDL;
-                    errorCount = 0;
-                    continue;
+                if (status == LLCore::HttpStatus(LLCore::HttpStatus::EXT_CURL_EASY, CURLE_OPERATION_TIMEDOUT) // A standard timeout, no events.
+                    || status == LLCore::HttpStatus(HTTP_BAD_GATEWAY) // An expected 'No events' case.
+                    || status == LLCore::HttpStatus(HTTP_INTERNAL_ERROR)
+                    || status == LLCore::HttpStatus(HTTP_SERVICE_UNAVAILABLE)
+                    || status == LLCore::HttpStatus(HTTP_GATEWAY_TIME_OUT))
+                {
+                    if (message_time.getElapsedSeconds() < MIN_SECONDS_PASSED)
+                    {
+                        // Server is supposed to hold request for 20 to 30 seconds.
+                        // If it didn't hold the request at least for 10s, treat as an error.
+                        LL_WARNS("LLEventPollImpl") << "Response arrived too early, status: " << status.toTerseString()
+                            << ", time passed: " << message_time.getElapsedSeconds() << LL_ENDL;
+                    }
+                    else
+                    {
+                        // Timeout, expected and means 'no events'. Request is to be re-issued immediately.
+                        // Current definition of a timeout is any of :
+                        // - libcurl easy 28 status code
+                        // - Linden 499 special http status code
+                        // - RFC - standard 502 - 504 http status codes
+                        LL_DEBUGS("LLEventPollImpl") << "No events, from: " << mSenderIp <<" status: " << (S32)status.getStatus() << LL_ENDL;
+                        errorCount = 0;
+                        continue;
+                    }
                 }
                 else if ((status == LLCore::HttpStatus(LLCore::HttpStatus::LLCORE, LLCore::HE_OP_CANCELED)) ||
                         (status == LLCore::HttpStatus(HTTP_NOT_FOUND)))
@@ -206,13 +230,13 @@ namespace Details
                     // some cases the server gets ahead of the viewer and will
                     // return a 404 error (Not Found) before the cancel event
                     // comes back in the queue
-                    LL_WARNS("LLEventPollImpl") << "Canceling coroutine" << LL_ENDL;
+                    LL_WARNS("LLEventPollImpl") << "<" << counter << "> Canceling coroutine, status: " << status.toTerseString() << LL_ENDL;
                     break;
                 }
                 else if (!status.isHttpStatus())
                 {
                     /// Some LLCore or LIBCurl error was returned.  This is unlikely to be recoverable
-                    LL_WARNS("LLEventPollImpl") << "Critical error from poll request returned from libraries.  Canceling coroutine." << LL_ENDL;
+                    LL_WARNS("LLEventPollImpl") << "<" << counter << "> Critical error from poll request returned from libraries.  Canceling coroutine." << LL_ENDL;
                     break;
                 }
                 LL_WARNS("LLEventPollImpl") << "<" << counter << "> Error result from LLCoreHttpUtil::HttpCoroHandler. Code "
@@ -255,6 +279,10 @@ namespace Details
                         LL_WARNS("LLEventPollImpl") << "< " << counter << "> Forcing disconnect due to stalled main region event poll." << LL_ENDL;
                         LLAppViewer::instance()->forceDisconnect(LLTrans::getString("AgentLostConnection"));
                     }
+                    else
+                    {
+                        LL_WARNS("LLEventPollImpl") << "< " << counter << "> Stopping event poll for " << mSenderIp << " due to failures." << LL_ENDL;
+                    }
                     break;
                 }
             }
diff --git a/indra/newview/lleventpoll.h b/indra/newview/lleventpoll.h
index bb407b3799..ea186aa803 100644
--- a/indra/newview/lleventpoll.h
+++ b/indra/newview/lleventpoll.h
@@ -40,7 +40,30 @@ namespace Details
 
 
 class LLEventPoll
-    ///< implements the viewer side of server-to-viewer pushed events.
+    ///< Implements the viewer side of server-to-viewer pushed events.
+    ///
+    /// This class implements the sole consumer of the EventQueueGet capability
+    /// and delivers data, including llsd-encoded llmessage payloads, from
+    /// simulator to viewer.
+    ///
+    /// https://wiki.secondlife.com/wiki/EventQueueGet
+    /// The wiki page is neither complete nor entirely correct. Request timeouts
+    /// became the de facto method of returning an empty event set to the viewer.
+    /// But the timeout behavior was never defined. It was simply whatever
+    /// behavior a given grid implementation implemented.
+    ///
+    /// In SL's case, the path may include reverse proxies, http caches, http and
+    /// socks proxies, transparent hijacking, and other horrors. A pitfall for
+    /// implementors.
+    ///
+    /// Current definition of a timeout is any of:
+    /// - libcurl easy 28 status code
+    /// - Linden 499 special http status code
+    /// - RFC-standard 502 - 504 http status codes
+    /// If requests fail too quickly with the above errors, they are treated
+    /// as actual errors and not as an empty payload. These count towards a
+    /// final error declaration and can lead to disconnection from a simulator
+    /// or the entire grid.
 {
 public:
     LLEventPoll(const std::string& pollURL, const LLHost& sender);
author	Andrey Kleshchev <117672381+akleshchev@users.noreply.github.com>	2025-09-17 18:24:35 +0300
committer	GitHub <noreply@github.com>	2025-09-17 18:24:35 +0300
commit	42b09b236bae797303af94fc003466ca649259a2 (patch)
tree	6c7195f4aebe5e038a7835de57234150c00c8931
parent	e33b5c5e6c8db1759e34db43ea9defc5455ab6c7 (diff)