On Windows, defend test.cpp against structured exceptions too.

Since August 2023, we've seen occasional GitHub Windows build test runs terminate with 0xC00000FD: stack overflow. We've usually responded by bumping up the default coroutine stack size. On closer examination, it's always llleap_test.cpp that blows up that way -- and llleap_test.cpp doesn't appear to use coroutines at all. So apparently we've been consuming more address space for ALL viewer coroutines without actually addressing the problem. Reset the default coroutine stack size to where it was before we started bumping it up in response to these llleap_test.cpp stack overflow failures. Note that LLCoros already catches and reports Windows structured exceptions, underscoring that the observed stack overflow is not from within a coroutine. While at it, restore the Windows llleap_test.cpp data volume to match Posix. We think the problem that led to reducing that data volume was an APR bug, which we hope has been fixed. Equip test.cpp, the test driver program for all our TUT unit and integration tests, with a Windows structured exception handler. Try to treat a Windows structured exception as a test failure -- instead of silently terminating with 0xC00000FD. Moreover, when a structured exception occurs, output a stack trace so we can try to track it down.
author: Nat Goodspeed <nat@lindenlab.com> 2024-05-16 13:30:14 -0400
committer: Nat Goodspeed <nat@lindenlab.com> 2024-05-16 13:30:14 -0400
commit: 1b6e2ef62cec9608d160ea25d99080f0e2964ee5 (patch)
tree: e8ad6e077301e6d9d36fc74f74303d18278eb790
parent: da469588244d490673ec08d186526d8832dcbd62 (diff)
3 files changed, 107 insertions, 24 deletions
diff --git a/indra/llcommon/llcoros.cpp b/indra/llcommon/llcoros.cpp
index aa8eca7d90..a70e3d9ae7 100644
--- a/indra/llcommon/llcoros.cpp
+++ b/indra/llcommon/llcoros.cpp
@@ -123,7 +123,7 @@ LLCoros::LLCoros():
     // Previously we used
     // boost::context::guarded_stack_allocator::default_stacksize();
     // empirically this is insufficient.
-    mStackSize(1024*1024),
+    mStackSize(512*1024),
     // mCurrent does NOT own the current CoroData instance -- it simply
     // points to it. So initialize it with a no-op deleter.
     mCurrent{ [](CoroData*){} }
@@ -155,7 +155,7 @@ void LLCoros::cleanupSingleton()
         // don't use llcoro::suspend() because that module depends
         // on this one
         // This will yield current(main) thread and will let active
-        // corutines run once
+        // coroutines run once
         boost::this_fiber::yield();
     }
     printActiveCoroutines("after pumping");
diff --git a/indra/llcommon/tests/llleap_test.cpp b/indra/llcommon/tests/llleap_test.cpp
index fa48bcdefd..3fb25b4cef 100644
--- a/indra/llcommon/tests/llleap_test.cpp
+++ b/indra/llcommon/tests/llleap_test.cpp
@@ -35,7 +35,7 @@
 // causes Windows abdominal pain such that it later fails code-signing in some
 // mysterious way. Entirely suppressing these LLLeap tests pushes the failure
 // rate MUCH lower. Can we re-enable them with a smaller data size on Windows?
-const size_t BUFFERED_LENGTH =  100*1024;
+const size_t BUFFERED_LENGTH = 1023*1024;
 
 #else // not Windows
 const size_t BUFFERED_LENGTH = 1023*1024; // try wrangling just under a megabyte of data
diff --git a/indra/test/test.cpp b/indra/test/test.cpp
index 61a4eb07c5..0b2abbc650 100644
--- a/indra/test/test.cpp
+++ b/indra/test/test.cpp
@@ -68,10 +68,7 @@
 #pragma warning (pop)
 #endif
 
-#include <boost/scoped_ptr.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/make_shared.hpp>
-#include <boost/foreach.hpp>
+#include <boost/stacktrace.hpp>
 
 #include <fstream>
 
@@ -181,10 +178,6 @@ public:
     LLTestCallback(bool verbose_mode, std::ostream *stream,
                    std::shared_ptr<LLReplayLog> replayer) :
         mVerboseMode(verbose_mode),
-        mTotalTests(0),
-        mPassedTests(0),
-        mFailedTests(0),
-        mSkippedTests(0),
         // By default, capture a shared_ptr to std::cout, with a no-op "deleter"
         // so that destroying the shared_ptr makes no attempt to delete std::cout.
         mStream(std::shared_ptr<std::ostream>(&std::cout, [](std::ostream*){})),
@@ -220,6 +213,8 @@ public:
     virtual void group_started(const std::string& name) {
         LL_INFOS("TestRunner")<<"Unit test group_started name=" << name << LL_ENDL;
         *mStream << "Unit test group_started name=" << name << std::endl;
+        mGroup = name;
+        mGroupTests = 0;
         super::group_started(name);
     }
 
@@ -232,6 +227,7 @@ public:
     virtual void test_completed(const tut::test_result& tr)
     {
         ++mTotalTests;
+        ++mGroupTests;
 
         // If this test failed, dump requested log messages BEFORE stating the
         // test result.
@@ -319,12 +315,15 @@ public:
         super::run_completed();
     }
 
+    std::string mGroup;
+    int mGroupTests{ 0 };
+
 protected:
-    bool mVerboseMode;
-    int mTotalTests;
-    int mPassedTests;
-    int mFailedTests;
-    int mSkippedTests;
+    bool mVerboseMode{ false };
+    int mTotalTests{ 0 };
+    int mPassedTests{ 0 };
+    int mFailedTests{ 0 };
+    int mSkippedTests{ 0 };
     std::shared_ptr<std::ostream> mStream;
     std::shared_ptr<LLReplayLog> mReplayer;
 };
@@ -520,6 +519,57 @@ void wouldHaveCrashed(const std::string& message)
 
 static LLTrace::ThreadRecorder* sMasterThreadRecorder = NULL;
 
+// Exception type describing a Windows structured exception reported as a test failure. This is used in platform-generic code (main()) -- define outside #if LL_WINDOWS
+struct Windows_SEH_exception: public std::runtime_error
+{
+    Windows_SEH_exception(const std::string& what): std::runtime_error(what) {} // 'what' becomes the runtime_error message
+};
+
+#if LL_WINDOWS
+
+static const U32 STATUS_MSC_EXCEPTION = 0xE06D7363; // structured-exception code that marks a C++ exception; compiler specific
+
+U32 seh_filter(U32 code, struct _EXCEPTION_POINTERS*) // __except filter used by seh_catcher(); the exception-pointers argument is ignored
+{
+    if (code == STATUS_MSC_EXCEPTION)
+    {
+        // C++ exception, go on -- but TUT is supposed to have caught those already?!
+        return EXCEPTION_CONTINUE_SEARCH; // decline it: do not enter seh_catcher()'s handler block
+    }
+    else
+    {
+        // This is a non-C++ exception, e.g. hardware check.
+        // By the time the handler gets control, the stack has been unwound,
+        // so report the stack trace now at filter() time.
+        std::cerr << boost::stacktrace::stacktrace() << std::endl; // NOTE(review): for a stack overflow (0xC00000FD) presumably little stack remains here -- confirm this call is safe
+        // pass control into the handler block
+        return EXCEPTION_EXECUTE_HANDLER;
+    }
+}
+
+template <typename CALLABLE0, typename CALLABLE1>
+void seh_catcher(CALLABLE0&& trycode, CALLABLE1&& handler) // run trycode(); if it raises a structured exception that seh_filter() accepts, call handler(exception code)
+{
+    __try
+    {
+        trycode();
+    }
+    __except (seh_filter(GetExceptionCode(), GetExceptionInformation())) // filter prints a stack trace and lets C++ exceptions continue past this block
+    {
+        handler(GetExceptionCode()); // NOTE(review): no _resetstkoflw() call after a stack overflow -- confirm the handler and later code need little stack
+    }
+}
+
+#else  // not LL_WINDOWS
+
+template <typename CALLABLE0, typename CALLABLE1>
+void seh_catcher(CALLABLE0&& trycode, CALLABLE1&&) // non-Windows stand-in with the same call shape: the handler argument is accepted but never called
+{
+    trycode(); // nothing is caught here; whatever trycode() throws propagates to the caller
+}
+
+#endif // not LL_WINDOWS
+
 int main(int argc, char **argv)
 {
     // The following line must be executed to initialize Google Mock
@@ -658,14 +708,47 @@ int main(int argc, char **argv)
     // a chained_callback subclass must be linked with previous
     mycallback->link();
 
-    if(test_group.empty())
-    {
-        tut::runner.get().run_tests();
-    }
-    else
-    {
-        tut::runner.get().run_tests(test_group);
-    }
+    seh_catcher(
+        // __try
+        [test_group]
+        {
+            if(test_group.empty())
+            {
+                tut::runner.get().run_tests();
+            }
+            else
+            {
+                tut::runner.get().run_tests(test_group);
+            }
+        },
+        // __except
+        [mycallback](U32 code)
+        {
+            static std::map<U32, const char*> codes = {
+                { 0xC0000005, "Access Violation" },
+                { 0xC00000FD, "Stack Overflow" },
+                // ... continue filling in as desired
+            };
+
+            auto found{ codes.find(code) };
+            const char* name = ((found == codes.end())? "unknown" : found->second);
+            auto msg{ stringize("test threw ", std::hex, code, " (", name, ")") };
+
+            // Instead of bombing the whole test run, report this as a test
+            // failure. Arguably, catching structured exceptions should be
+            // hacked into TUT itself.
+            mycallback->test_completed(tut::test_result(
+                mycallback->mGroup,
+                mycallback->mGroupTests+1, // test within group
+                "unknown",                 // test name
+                tut::test_result::ex,      // result: exception
+                // we don't have to throw this exception subclass to use it to
+                // populate the test_result struct
+                Windows_SEH_exception(msg)));
+            // we've left the TUT framework -- finish up by hand
+            mycallback->group_completed(mycallback->mGroup);
+            mycallback->run_completed();
+        });
 
     bool success = (mycallback->getFailedTests() == 0);
author	Nat Goodspeed <nat@lindenlab.com>	2024-05-16 13:30:14 -0400
committer	Nat Goodspeed <nat@lindenlab.com>	2024-05-16 13:30:14 -0400
commit	1b6e2ef62cec9608d160ea25d99080f0e2964ee5 (patch)
tree	e8ad6e077301e6d9d36fc74f74303d18278eb790
parent	da469588244d490673ec08d186526d8832dcbd62 (diff)