summaryrefslogtreecommitdiff
path: root/indra/llcommon
diff options
context:
space:
mode:
author Adam Moss <moss@lindenlab.com> 2008-04-16 09:29:07 +0000
committer Adam Moss <moss@lindenlab.com> 2008-04-16 09:29:07 +0000
commit fb42741b620cea3d3b6380f1f099c92fcf4f3b04 (patch)
tree 73c9e027e31810a582dfaeeaae6c51ef7208d6d1 /indra/llcommon
parent 68988bddebfaf63314ded09e7ec6bade8b7bb951 (diff)
QAR-455 Launcher and Heartbeat fixes MergeMe
svn merge svn+ssh://svn.lindenlab.com/svn/linden/release@84853 svn+ssh://svn.lindenlab.com/svn/linden/branches/moss/signalcrap-merge1
Diffstat (limited to 'indra/llcommon')
-rw-r--r-- indra/llcommon/llapp.cpp 163
-rw-r--r-- indra/llcommon/llapp.h 3
-rw-r--r-- indra/llcommon/llheartbeat.cpp 165
-rw-r--r-- indra/llcommon/llheartbeat.h 73
-rw-r--r-- indra/llcommon/lltimer.cpp 79
-rw-r--r-- indra/llcommon/lltimer.h 6
6 files changed, 378 insertions, 111 deletions
diff --git a/indra/llcommon/llapp.cpp b/indra/llcommon/llapp.cpp
index 6591bbc070..067dc4fc43 100644
--- a/indra/llcommon/llapp.cpp
+++ b/indra/llcommon/llapp.cpp
@@ -49,11 +49,24 @@
LONG WINAPI default_windows_exception_handler(struct _EXCEPTION_POINTERS *exception_infop);
BOOL ConsoleCtrlHandler(DWORD fdwCtrlType);
#else
-#include <unistd.h> // for fork()
+# include <signal.h>
+# include <unistd.h> // for fork()
void setup_signals();
void default_unix_signal_handler(int signum, siginfo_t *info, void *);
-const S32 LL_SMACKDOWN_SIGNAL = SIGUSR1;
-#endif
+# if LL_DARWIN
+/* OSX doesn't support SIGRT* */
+S32 LL_SMACKDOWN_SIGNAL = SIGUSR1;
+S32 LL_HEARTBEAT_SIGNAL = SIGUSR2;
+# else
+/* We want reliable delivery of our signals - SIGRT* is it. */
+/* Old LinuxThreads versions eat SIGRTMIN+0 to SIGRTMIN+2, avoid those. */
+/* Note that SIGRTMIN/SIGRTMAX may expand to a glibc function call with a
+ nonconstant result so these are not consts and cannot be used in constant-
+ expressions. SIGRTMAX may return -1 on rare broken setups. */
+S32 LL_SMACKDOWN_SIGNAL = (SIGRTMAX >= 0) ? (SIGRTMAX-1) : SIGUSR1;
+S32 LL_HEARTBEAT_SIGNAL = (SIGRTMAX >= 0) ? (SIGRTMAX-0) : SIGUSR2;
+# endif // LL_DARWIN
+#endif // LL_WINDOWS
// the static application instance
LLApp* LLApp::sApplication = NULL;
@@ -501,6 +514,9 @@ void setup_signals()
sigaction(SIGSEGV, &act, NULL);
sigaction(SIGSYS, &act, NULL);
+ sigaction(LL_HEARTBEAT_SIGNAL, &act, NULL);
+ sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL);
+
// Asynchronous signals that are normally ignored
sigaction(SIGCHLD, &act, NULL);
sigaction(SIGUSR2, &act, NULL);
@@ -511,7 +527,6 @@ void setup_signals()
sigaction(SIGINT, &act, NULL);
// Asynchronous signals that result in core
- sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL);
sigaction(SIGQUIT, &act, NULL);
}
@@ -533,6 +548,9 @@ void clear_signals()
sigaction(SIGSEGV, &act, NULL);
sigaction(SIGSYS, &act, NULL);
+ sigaction(LL_HEARTBEAT_SIGNAL, &act, NULL);
+ sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL);
+
// Asynchronous signals that are normally ignored
sigaction(SIGCHLD, &act, NULL);
@@ -543,7 +561,6 @@ void clear_signals()
// Asynchronous signals that result in core
sigaction(SIGUSR2, &act, NULL);
- sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL);
sigaction(SIGQUIT, &act, NULL);
}
@@ -564,16 +581,7 @@ void default_unix_signal_handler(int signum, siginfo_t *info, void *)
switch (signum)
{
- case SIGALRM:
- case SIGPIPE:
- case SIGUSR2:
- // We don't care about these signals, ignore them
- if (LLApp::sLogInSignal)
- {
- llinfos << "Signal handler - Ignoring this signal" << llendl;
- }
- return;
- case SIGCHLD:
+ case SIGCHLD:
if (LLApp::sLogInSignal)
{
llinfos << "Signal handler - Got SIGCHLD from " << info->si_pid << llendl;
@@ -602,59 +610,6 @@ void default_unix_signal_handler(int signum, siginfo_t *info, void *)
clear_signals();
raise(signum);
return;
- case LL_SMACKDOWN_SIGNAL: // Smackdown treated just like any other app termination, for now
- if (LLApp::sLogInSignal)
- {
- llwarns << "Signal handler - Handling smackdown signal!" << llendl;
- }
- else
- {
- // Don't log anything, even errors - this is because this signal could happen anywhere.
- LLError::setDefaultLevel(LLError::LEVEL_NONE);
- }
-
- // Change the signal that we reraise to SIGABRT, so we generate a core dump.
- signum = SIGABRT;
- case SIGBUS:
- case SIGSEGV:
- case SIGQUIT:
- if (LLApp::sLogInSignal)
- {
- llwarns << "Signal handler - Handling fatal signal!" << llendl;
- }
- if (LLApp::isError())
- {
- // Received second fatal signal while handling first, just die right now
- // Set the signal handlers back to default before handling the signal - this makes the next signal wipe out the app.
- clear_signals();
-
- if (LLApp::sLogInSignal)
- {
- llwarns << "Signal handler - Got another fatal signal while in the error handler, die now!" << llendl;
- }
- raise(signum);
- return;
- }
-
- if (LLApp::sLogInSignal)
- {
- llwarns << "Signal handler - Flagging error status and waiting for shutdown" << llendl;
- }
- // Flag status to ERROR, so thread_error does its work.
- LLApp::setError();
- // Block in the signal handler until somebody says that we're done.
- while (LLApp::sErrorThreadRunning && !LLApp::isStopped())
- {
- ms_sleep(10);
- }
-
- if (LLApp::sLogInSignal)
- {
- llwarns << "Signal handler - App is stopped, reraising signal" << llendl;
- }
- clear_signals();
- raise(signum);
- return;
case SIGINT:
case SIGHUP:
case SIGTERM:
@@ -675,10 +630,76 @@ void default_unix_signal_handler(int signum, siginfo_t *info, void *)
}
LLApp::setQuitting();
return;
+ case SIGALRM:
+ case SIGPIPE:
+ case SIGUSR2:
default:
- if (LLApp::sLogInSignal)
- {
- llwarns << "Signal handler - Unhandled signal, ignoring!" << llendl;
+ if (signum == LL_SMACKDOWN_SIGNAL ||
+ signum == SIGBUS ||
+ signum == SIGILL ||
+ signum == SIGFPE ||
+ signum == SIGSEGV ||
+ signum == SIGQUIT)
+ {
+ if (signum == LL_SMACKDOWN_SIGNAL)
+ {
+ // Smackdown treated just like any other app termination, for now
+ if (LLApp::sLogInSignal)
+ {
+ llwarns << "Signal handler - Handling smackdown signal!" << llendl;
+ }
+ else
+ {
+ // Don't log anything, even errors - this is because this signal could happen anywhere.
+ LLError::setDefaultLevel(LLError::LEVEL_NONE);
+ }
+
+ // Change the signal that we reraise to SIGABRT, so we generate a core dump.
+ signum = SIGABRT;
+ }
+
+ if (LLApp::sLogInSignal)
+ {
+ llwarns << "Signal handler - Handling fatal signal!" << llendl;
+ }
+ if (LLApp::isError())
+ {
+ // Received second fatal signal while handling first, just die right now
+ // Set the signal handlers back to default before handling the signal - this makes the next signal wipe out the app.
+ clear_signals();
+
+ if (LLApp::sLogInSignal)
+ {
+ llwarns << "Signal handler - Got another fatal signal while in the error handler, die now!" << llendl;
+ }
+ raise(signum);
+ return;
+ }
+
+ if (LLApp::sLogInSignal)
+ {
+ llwarns << "Signal handler - Flagging error status and waiting for shutdown" << llendl;
+ }
+ // Flag status to ERROR, so thread_error does its work.
+ LLApp::setError();
+ // Block in the signal handler until somebody says that we're done.
+ while (LLApp::sErrorThreadRunning && !LLApp::isStopped())
+ {
+ ms_sleep(10);
+ }
+
+ if (LLApp::sLogInSignal)
+ {
+ llwarns << "Signal handler - App is stopped, reraising signal" << llendl;
+ }
+ clear_signals();
+ raise(signum);
+ return;
+ } else {
+ if (LLApp::sLogInSignal)
+ {
+ llinfos << "Signal handler - Unhandled signal " << signum << ", ignoring!" << llendl;
+ }
}
}
}
diff --git a/indra/llcommon/llapp.h b/indra/llcommon/llapp.h
index d64af62538..c199601c20 100644
--- a/indra/llcommon/llapp.h
+++ b/indra/llcommon/llapp.h
@@ -46,7 +46,8 @@ typedef void (*LLAppErrorHandler)();
typedef void (*LLAppChildCallback)(int pid, bool exited, int status);
#if !LL_WINDOWS
-extern const S32 LL_SMACKDOWN_SIGNAL;
+extern S32 LL_SMACKDOWN_SIGNAL;
+extern S32 LL_HEARTBEAT_SIGNAL;
// Clear all of the signal handlers (which we want to do for the child process when we fork
void clear_signals();
diff --git a/indra/llcommon/llheartbeat.cpp b/indra/llcommon/llheartbeat.cpp
new file mode 100644
index 0000000000..782a4f7ff6
--- /dev/null
+++ b/indra/llcommon/llheartbeat.cpp
@@ -0,0 +1,165 @@
+/**
+ * @file llheartbeat.cpp
+ * @brief Class encapsulating logic for telling a watchdog that we live.
+ *
+ * $LicenseInfo:firstyear=2008&license=viewergpl$
+ *
+ * Copyright (c) 2008, Linden Research, Inc.
+ *
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab. Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlife.com/developers/opensource/gplv2
+ *
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at http://secondlife.com/developers/opensource/flossexception
+ *
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ *
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#include <errno.h>
+#include <signal.h>
+
+#include "linden_common.h"
+#include "llapp.h"
+
+#include "llheartbeat.h"
+
+LLHeartbeat::LLHeartbeat(F32 secs_between_heartbeat,
+ F32 aggressive_heartbeat_panic_secs,
+ F32 aggressive_heartbeat_max_blocking_secs)
+ : mSecsBetweenHeartbeat(secs_between_heartbeat),
+ mAggressiveHeartbeatPanicSecs(aggressive_heartbeat_panic_secs),
+ mAggressiveHeartbeatMaxBlockingSecs(aggressive_heartbeat_max_blocking_secs),
+ mSuppressed(false)
+{
+ mBeatTimer.reset();
+ mBeatTimer.setTimerExpirySec(mSecsBetweenHeartbeat);
+ mPanicTimer.reset();
+ mPanicTimer.setTimerExpirySec(mAggressiveHeartbeatPanicSecs);
+}
+
+LLHeartbeat::~LLHeartbeat()
+{
+ // do nothing.
+}
+
+void
+LLHeartbeat::setSuppressed(bool is_suppressed)
+{
+ mSuppressed = is_suppressed;
+}
+
+// returns 0 on success, -1 on permanent failure, 1 on temporary failure
+int
+LLHeartbeat::rawSend()
+{
+#if LL_WINDOWS
+ return 0; // Pretend we succeeded.
+#else
+ if (mSuppressed)
+ return 0; // Pretend we succeeded.
+
+ union sigval dummy;
+ int result = sigqueue(getppid(), LL_HEARTBEAT_SIGNAL, dummy);
+ if (result == 0)
+ return 0; // success
+
+ int err = errno;
+ if (err == EAGAIN)
+ return 1; // failed to queue, try again
+
+ return -1; // other failure.
+#endif
+}
+
+int
+LLHeartbeat::rawSendWithTimeout(F32 timeout_sec)
+{
+ int result = 0;
+
+ // Spin tightly until our heartbeat is digested by the watchdog
+ // or we time-out. We don't really want to sleep because our
+ // wake-up time might be undesirably synchronised to a hidden
+ // clock by the system's scheduler.
+ mTimeoutTimer.reset();
+ mTimeoutTimer.setTimerExpirySec(timeout_sec);
+ do {
+ result = rawSend();
+ //llinfos << " HEARTSENDc=" << result << llendl;
+ } while (result==1 && !mTimeoutTimer.hasExpired());
+
+ return result;
+}
+
+bool
+LLHeartbeat::send(F32 timeout_sec)
+{
+ bool total_success = false;
+ int result = 1;
+
+ if (timeout_sec > 0.f) {
+ // force a spin until success or timeout
+ result = rawSendWithTimeout(timeout_sec);
+ } else {
+ if (mBeatTimer.hasExpired()) {
+ // zero-timeout; we don't care too much whether our
+ // heartbeat was digested.
+ result = rawSend();
+ //llinfos << " HEARTSENDb=" << result << llendl;
+ }
+ }
+
+ if (result == -1) {
+ // big failure.
+ } else if (result == 0) {
+ total_success = true;
+ } else {
+ // need to retry at some point
+ }
+
+ if (total_success) {
+ mBeatTimer.reset();
+ mBeatTimer.setTimerExpirySec(mSecsBetweenHeartbeat);
+ // reset the time until we start panicking about lost
+ // heartbeats again.
+ mPanicTimer.reset();
+ mPanicTimer.setTimerExpirySec(mAggressiveHeartbeatPanicSecs);
+ } else {
+ // leave mBeatTimer as expired so we'll lazily poke the
+ // watchdog again next time through.
+ }
+
+ if (mPanicTimer.hasExpired()) {
+ // It's been ages since we successfully had a heartbeat
+ // digested by the watchdog. Sit here and spin a while
+ // in the hope that we can force it through.
+ llwarns << "Unable to deliver heartbeat to launcher for " << mPanicTimer.getElapsedTimeF32() << " seconds. Going to try very hard for up to " << mAggressiveHeartbeatMaxBlockingSecs << " seconds." << llendl;
+ result = rawSendWithTimeout(mAggressiveHeartbeatMaxBlockingSecs);
+ if (result == 0) {
+ total_success = true;
+ } else {
+ // we couldn't even force it through. That's bad,
+ // but we'll try again in a while.
+ llwarns << "Could not deliver heartbeat to launcher even after trying very hard for " << mAggressiveHeartbeatMaxBlockingSecs << " seconds." << llendl;
+ }
+
+ // in any case, reset the panic timer.
+ mPanicTimer.reset();
+ mPanicTimer.setTimerExpirySec(mAggressiveHeartbeatPanicSecs);
+ }
+
+ return total_success;
+}
diff --git a/indra/llcommon/llheartbeat.h b/indra/llcommon/llheartbeat.h
new file mode 100644
index 0000000000..0761642e68
--- /dev/null
+++ b/indra/llcommon/llheartbeat.h
@@ -0,0 +1,73 @@
+/**
+ * @file llheartbeat.h
+ * @brief Class encapsulating logic for telling a watchdog that we live.
+ *
+ * $LicenseInfo:firstyear=2008&license=viewergpl$
+ *
+ * Copyright (c) 2008, Linden Research, Inc.
+ *
+ * Second Life Viewer Source Code
+ * The source code in this file ("Source Code") is provided by Linden Lab
+ * to you under the terms of the GNU General Public License, version 2.0
+ * ("GPL"), unless you have obtained a separate licensing agreement
+ * ("Other License"), formally executed by you and Linden Lab. Terms of
+ * the GPL can be found in doc/GPL-license.txt in this distribution, or
+ * online at http://secondlife.com/developers/opensource/gplv2
+ *
+ * There are special exceptions to the terms and conditions of the GPL as
+ * it is applied to this Source Code. View the full text of the exception
+ * in the file doc/FLOSS-exception.txt in this software distribution, or
+ * online at http://secondlife.com/developers/opensource/flossexception
+ *
+ * By copying, modifying or distributing this software, you acknowledge
+ * that you have read and understood your obligations described above,
+ * and agree to abide by those obligations.
+ *
+ * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
+ * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
+ * COMPLETENESS OR PERFORMANCE.
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_LLHEARTBEAT_H
+#define LL_LLHEARTBEAT_H
+
+#include "linden_common.h"
+
+#include "lltimer.h"
+
+// Note: Win32 does not support the heartbeat/smackdown system;
+// heartbeat-delivery turns into a no-op there.
+
+class LLHeartbeat
+{
+public:
+ // secs_between_heartbeat: after a heartbeat is successfully delivered,
+ // we suppress sending more for this length of time.
+ // aggressive_heartbeat_panic_secs: if we've been failing to
+ // successfully deliver heartbeats for this length of time then
+ // we block for a while until we're really sure we got one delivered.
+ // aggressive_heartbeat_max_blocking_secs: this is the length of
+ // time we block for when we're aggressively ensuring that a 'panic'
+ // heartbeat was delivered.
+ LLHeartbeat(F32 secs_between_heartbeat = 5.0f,
+ F32 aggressive_heartbeat_panic_secs = 10.0f,
+ F32 aggressive_heartbeat_max_blocking_secs = 4.0f);
+ ~LLHeartbeat();
+
+ bool send(F32 timeout_sec = 0.0f);
+ void setSuppressed(bool is_suppressed);
+
+private:
+ int rawSend();
+ int rawSendWithTimeout(F32 timeout_sec);
+ F32 mSecsBetweenHeartbeat;
+ F32 mAggressiveHeartbeatPanicSecs;
+ F32 mAggressiveHeartbeatMaxBlockingSecs;
+ bool mSuppressed;
+ LLTimer mBeatTimer;
+ LLTimer mPanicTimer;
+ LLTimer mTimeoutTimer;
+};
+
+#endif // LL_LLHEARTBEAT_H
diff --git a/indra/llcommon/lltimer.cpp b/indra/llcommon/lltimer.cpp
index 9786d44899..cf984e4fe2 100644
--- a/indra/llcommon/lltimer.cpp
+++ b/indra/llcommon/lltimer.cpp
@@ -39,10 +39,8 @@
# define WIN32_LEAN_AND_MEAN
# include <winsock2.h>
# include <windows.h>
-#elif LL_LINUX || LL_SOLARIS
-# include <sys/time.h>
-# include <sched.h>
-#elif LL_DARWIN
+#elif LL_LINUX || LL_SOLARIS || LL_DARWIN
+# include <errno.h>
# include <sys/time.h>
#else
# error "architecture not supported"
@@ -81,42 +79,55 @@ U64 gLastTotalTimeClockCount = 0;
//---------------------------------------------------------------------------
#if LL_WINDOWS
-void ms_sleep(long ms)
+void ms_sleep(U32 ms)
{
- Sleep((U32)ms);
+ Sleep(ms);
}
-
-void llyield()
+#elif LL_LINUX || LL_SOLARIS || LL_DARWIN
+void ms_sleep(U32 ms)
{
- SleepEx(0, TRUE); // Relinquishes time slice to any thread of equal priority, can be woken up by extended IO functions
-}
-#elif LL_LINUX || LL_SOLARIS
-void ms_sleep(long ms)
-{
- struct timespec t;
- t.tv_sec = ms / 1000;
- t.tv_nsec = (ms % 1000) * 1000000l;
- nanosleep(&t, NULL);
-}
+ long mslong = ms; // tv_nsec is a long
+ struct timespec thiswait, nextwait;
+ bool sleep_more = false;
-void llyield()
-{
- sched_yield();
-}
-#elif LL_DARWIN
-void ms_sleep(long ms)
-{
- struct timespec t;
- t.tv_sec = ms / 1000;
- t.tv_nsec = (ms % 1000) * 1000000l;
- nanosleep(&t, NULL);
-}
+ thiswait.tv_sec = ms / 1000;
+ thiswait.tv_nsec = (mslong % 1000) * 1000000l;
+ do {
+ int result = nanosleep(&thiswait, &nextwait);
-void llyield()
-{
-// sched_yield();
+ // check if sleep was interrupted by a signal; the unslept
+ // remainder was written back into 'nextwait' and we just
+ // nanosleep again with it.
+ sleep_more = (result == -1 && EINTR == errno);
+
+ if (sleep_more)
+ {
+ if ( nextwait.tv_sec > thiswait.tv_sec ||
+ (nextwait.tv_sec == thiswait.tv_sec &&
+ nextwait.tv_nsec >= thiswait.tv_nsec) )
+ {
+ // if the remaining time isn't actually going
+ // down then we're being shafted by low clock
+ // resolution - manually massage the sleep time
+ // downward.
+ if (nextwait.tv_nsec > 1000000) {
+ // lose 1ms
+ nextwait.tv_nsec -= 1000000;
+ } else {
+ if (nextwait.tv_sec == 0) {
+ // already so close to finished
+ sleep_more = false;
+ } else {
+ // lose up to 1ms
+ nextwait.tv_nsec = 0;
+ }
+ }
+ }
+ thiswait = nextwait;
+ }
+ } while (sleep_more);
}
-#else
+#else
# error "architecture not supported"
#endif
diff --git a/indra/llcommon/lltimer.h b/indra/llcommon/lltimer.h
index 647f042828..113eb1e9e3 100644
--- a/indra/llcommon/lltimer.h
+++ b/indra/llcommon/lltimer.h
@@ -112,12 +112,8 @@ U64 get_clock_count();
F64 calc_clock_frequency(U32 msecs);
void update_clock_frequencies();
-
// Sleep for milliseconds
-void ms_sleep(long ms);
-
-// Yield
-//void llyield(); // Yield your timeslice - not implemented yet for Mac, so commented out.
+void ms_sleep(U32 ms);
// Returns the correct UTC time in seconds, like time(NULL).
// Useful on the viewer, which may have its local clock set wrong.