On Windows, defend test.cpp against structured exceptions too.

Since August 2023, we've seen occasional GitHub Windows build test runs terminate with 0xC00000FD: stack overflow. We've usually responded by bumping up the default coroutine stack size. On closer examination, it's always llleap_test.cpp that blows up that way -- and llleap_test.cpp doesn't appear to use coroutines at all. So apparently we've been consuming more address space for ALL viewer coroutines without actually addressing the problem. Reset the default coroutine stack size to where it was before we started bumping it up in response to these llleap_test.cpp stack overflow failures. Note that LLCoros already catches and reports Windows structured exceptions, underscoring that the observed stack overflow is not from within a coroutine. While at it, restore the Windows llleap_test.cpp data volume to match Posix. We think the problem that led to reducing that data volume was an APR bug, which we hope has been fixed. Equip test.cpp, the test driver program for all our TUT unit and integration tests, with a Windows structured exception handler. Try to treat a Windows structured exception as a test failure -- instead of silently terminating with 0xC00000FD. Moreover, when a structured exception occurs, output a stack trace so we can try to track it down.
author: Nat Goodspeed <nat@lindenlab.com> 2024-05-16 13:30:14 -0400
committer: Nat Goodspeed <nat@lindenlab.com> 2024-05-16 13:30:14 -0400
commit: 1b6e2ef62cec9608d160ea25d99080f0e2964ee5 (patch)
tree: e8ad6e077301e6d9d36fc74f74303d18278eb790 /indra/test
parent: da469588244d490673ec08d186526d8832dcbd62 (diff)
1 files changed, 104 insertions, 21 deletions
diff --git a/indra/test/test.cpp b/indra/test/test.cpp
index 61a4eb07c5..0b2abbc650 100644
--- a/indra/test/test.cpp
+++ b/indra/test/test.cpp
@@ -68,10 +68,7 @@
 #pragma warning (pop)
 #endif
 
-#include <boost/scoped_ptr.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/make_shared.hpp>
-#include <boost/foreach.hpp>
+#include <boost/stacktrace.hpp>
 
 #include <fstream>
 
@@ -181,10 +178,6 @@ public:
     LLTestCallback(bool verbose_mode, std::ostream *stream,
                    std::shared_ptr<LLReplayLog> replayer) :
         mVerboseMode(verbose_mode),
-        mTotalTests(0),
-        mPassedTests(0),
-        mFailedTests(0),
-        mSkippedTests(0),
         // By default, capture a shared_ptr to std::cout, with a no-op "deleter"
         // so that destroying the shared_ptr makes no attempt to delete std::cout.
         mStream(std::shared_ptr<std::ostream>(&std::cout, [](std::ostream*){})),
@@ -220,6 +213,8 @@ public:
     virtual void group_started(const std::string& name) {
         LL_INFOS("TestRunner")<<"Unit test group_started name=" << name << LL_ENDL;
         *mStream << "Unit test group_started name=" << name << std::endl;
+        mGroup = name;
+        mGroupTests = 0;
         super::group_started(name);
     }
 
@@ -232,6 +227,7 @@ public:
     virtual void test_completed(const tut::test_result& tr)
     {
         ++mTotalTests;
+        ++mGroupTests;
 
         // If this test failed, dump requested log messages BEFORE stating the
         // test result.
@@ -319,12 +315,15 @@ public:
         super::run_completed();
     }
 
+    std::string mGroup;
+    int mGroupTests{ 0 };
+
 protected:
-    bool mVerboseMode;
-    int mTotalTests;
-    int mPassedTests;
-    int mFailedTests;
-    int mSkippedTests;
+    bool mVerboseMode{ false };
+    int mTotalTests{ 0 };
+    int mPassedTests{ 0 };
+    int mFailedTests{ 0 };
+    int mSkippedTests{ 0 };
     std::shared_ptr<std::ostream> mStream;
     std::shared_ptr<LLReplayLog> mReplayer;
 };
@@ -520,6 +519,57 @@ void wouldHaveCrashed(const std::string& message)
 
 static LLTrace::ThreadRecorder* sMasterThreadRecorder = NULL;
 
+// this is used in platform-generic code -- define outside #if LL_WINDOWS
+struct Windows_SEH_exception: public std::runtime_error
+{
+    Windows_SEH_exception(const std::string& what): std::runtime_error(what) {}
+};
+
+#if LL_WINDOWS
+
+static const U32 STATUS_MSC_EXCEPTION = 0xE06D7363; // compiler specific
+
+U32 seh_filter(U32 code, struct _EXCEPTION_POINTERS*)
+{
+    if (code == STATUS_MSC_EXCEPTION)
+    {
+        // C++ exception, go on -- but TUT is supposed to have caught those already?!
+        return EXCEPTION_CONTINUE_SEARCH;
+    }
+    else
+    {
+        // This is a non-C++ exception, e.g. hardware check.
+        // By the time the handler gets control, the stack has been unwound,
+        // so report the stack trace now at filter() time.
+        std::cerr << boost::stacktrace::stacktrace() << std::endl;
+        // pass control into the handler block
+        return EXCEPTION_EXECUTE_HANDLER;
+    }
+}
+
+template <typename CALLABLE0, typename CALLABLE1>
+void seh_catcher(CALLABLE0&& trycode, CALLABLE1&& handler)
+{
+    __try
+    {
+        trycode();
+    }
+    __except (seh_filter(GetExceptionCode(), GetExceptionInformation()))
+    {
+        handler(GetExceptionCode());
+    }
+}
+
+#else  // not LL_WINDOWS
+
+template <typename CALLABLE0, typename CALLABLE1>
+void seh_catcher(CALLABLE0&& trycode, CALLABLE1&&)
+{
+    trycode();
+}
+
+#endif // not LL_WINDOWS
+
 int main(int argc, char **argv)
 {
     // The following line must be executed to initialize Google Mock
@@ -658,14 +708,47 @@ int main(int argc, char **argv)
     // a chained_callback subclass must be linked with previous
     mycallback->link();
 
-    if(test_group.empty())
-    {
-        tut::runner.get().run_tests();
-    }
-    else
-    {
-        tut::runner.get().run_tests(test_group);
-    }
+    seh_catcher(
+        // __try
+        [test_group]
+        {
+            if(test_group.empty())
+            {
+                tut::runner.get().run_tests();
+            }
+            else
+            {
+                tut::runner.get().run_tests(test_group);
+            }
+        },
+        // __except
+        [mycallback](U32 code)
+        {
+            static std::map<U32, const char*> codes = {
+                { 0xC0000005, "Access Violation" },
+                { 0xC00000FD, "Stack Overflow" },
+                // ... continue filling in as desired
+            };
+
+            auto found{ codes.find(code) };
+            const char* name = ((found == codes.end())? "unknown" : found->second);
+            auto msg{ stringize("test threw ", std::hex, code, " (", name, ")") };
+
+            // Instead of bombing the whole test run, report this as a test
+            // failure. Arguably, catching structured exceptions should be
+            // hacked into TUT itself.
+            mycallback->test_completed(tut::test_result(
+                mycallback->mGroup,
+                mycallback->mGroupTests+1, // test within group
+                "unknown",                 // test name
+                tut::test_result::ex,      // result: exception
+                // we don't have to throw this exception subclass to use it to
+                // populate the test_result struct
+                Windows_SEH_exception(msg)));
+            // we've left the TUT framework -- finish up by hand
+            mycallback->group_completed(mycallback->mGroup);
+            mycallback->run_completed();
+        });
 
     bool success = (mycallback->getFailedTests() == 0);
author	Nat Goodspeed <nat@lindenlab.com>	2024-05-16 13:30:14 -0400
committer	Nat Goodspeed <nat@lindenlab.com>	2024-05-16 13:30:14 -0400
commit	1b6e2ef62cec9608d160ea25d99080f0e2964ee5 (patch)
tree	e8ad6e077301e6d9d36fc74f74303d18278eb790 /indra/test
parent	da469588244d490673ec08d186526d8832dcbd62 (diff)