diff options
author | Adam Moss <moss@lindenlab.com> | 2008-04-16 09:29:07 +0000 |
---|---|---|
committer | Adam Moss <moss@lindenlab.com> | 2008-04-16 09:29:07 +0000 |
commit | fb42741b620cea3d3b6380f1f099c92fcf4f3b04 (patch) | |
tree | 73c9e027e31810a582dfaeeaae6c51ef7208d6d1 /indra/llcommon | |
parent | 68988bddebfaf63314ded09e7ec6bade8b7bb951 (diff) |
QAR-455 Launcher and Heartbeat fixes MergeMe
svn merge svn+ssh://svn.lindenlab.com/svn/linden/release@84853
svn+ssh://svn.lindenlab.com/svn/linden/branches/moss/signalcrap-merge1
Diffstat (limited to 'indra/llcommon')
-rw-r--r-- | indra/llcommon/llapp.cpp | 163 | ||||
-rw-r--r-- | indra/llcommon/llapp.h | 3 | ||||
-rw-r--r-- | indra/llcommon/llheartbeat.cpp | 165 | ||||
-rw-r--r-- | indra/llcommon/llheartbeat.h | 73 | ||||
-rw-r--r-- | indra/llcommon/lltimer.cpp | 79 | ||||
-rw-r--r-- | indra/llcommon/lltimer.h | 6 |
6 files changed, 378 insertions, 111 deletions
diff --git a/indra/llcommon/llapp.cpp b/indra/llcommon/llapp.cpp index 6591bbc070..067dc4fc43 100644 --- a/indra/llcommon/llapp.cpp +++ b/indra/llcommon/llapp.cpp @@ -49,11 +49,24 @@ LONG WINAPI default_windows_exception_handler(struct _EXCEPTION_POINTERS *exception_infop); BOOL ConsoleCtrlHandler(DWORD fdwCtrlType); #else -#include <unistd.h> // for fork() +# include <signal.h> +# include <unistd.h> // for fork() void setup_signals(); void default_unix_signal_handler(int signum, siginfo_t *info, void *); -const S32 LL_SMACKDOWN_SIGNAL = SIGUSR1; -#endif +# if LL_DARWIN +/* OSX doesn't support SIGRT* */ +S32 LL_SMACKDOWN_SIGNAL = SIGUSR1; +S32 LL_HEARTBEAT_SIGNAL = SIGUSR2; +# else +/* We want reliable delivery of our signals - SIGRT* is it. */ +/* Old LinuxThreads versions eat SIGRTMIN+0 to SIGRTMIN+2, avoid those. */ +/* Note that SIGRTMIN/SIGRTMAX may expand to a glibc function call with a + nonconstant result so these are not consts and cannot be used in constant- + expressions. SIGRTMAX may return -1 on rare broken setups. */ +S32 LL_SMACKDOWN_SIGNAL = (SIGRTMAX >= 0) ? (SIGRTMAX-1) : SIGUSR1; +S32 LL_HEARTBEAT_SIGNAL = (SIGRTMAX >= 0) ? (SIGRTMAX-0) : SIGUSR2; +# endif // LL_DARWIN +#endif // LL_WINDOWS // the static application instance LLApp* LLApp::sApplication = NULL; @@ -501,6 +514,9 @@ void setup_signals() sigaction(SIGSEGV, &act, NULL); sigaction(SIGSYS, &act, NULL); + sigaction(LL_HEARTBEAT_SIGNAL, &act, NULL); + sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL); + // Asynchronous signals that are normally ignored sigaction(SIGCHLD, &act, NULL); sigaction(SIGUSR2, &act, NULL); @@ -511,7 +527,6 @@ void setup_signals() sigaction(SIGINT, &act, NULL); // Asynchronous signals that result in core - sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL); sigaction(SIGQUIT, &act, NULL); } @@ -533,6 +548,9 @@ void clear_signals() sigaction(SIGSEGV, &act, NULL); sigaction(SIGSYS, &act, NULL); + sigaction(LL_HEARTBEAT_SIGNAL, &act, NULL); + sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL); + // Asynchronous signals that are normally ignored sigaction(SIGCHLD, &act, NULL); @@ -543,7 +561,6 @@ void clear_signals() // Asynchronous signals that result in core sigaction(SIGUSR2, &act, NULL); - sigaction(LL_SMACKDOWN_SIGNAL, &act, NULL); sigaction(SIGQUIT, &act, NULL); } @@ -564,16 +581,7 @@ void default_unix_signal_handler(int signum, siginfo_t *info, void *) switch (signum) { - case SIGALRM: - case SIGPIPE: - case SIGUSR2: - // We don't care about these signals, ignore them - if (LLApp::sLogInSignal) - { - llinfos << "Signal handler - Ignoring this signal" << llendl; - } - return; - case SIGCHLD: + case SIGCHLD: if (LLApp::sLogInSignal) { llinfos << "Signal handler - Got SIGCHLD from " << info->si_pid << llendl; @@ -602,59 +610,6 @@ void default_unix_signal_handler(int signum, siginfo_t *info, void *) clear_signals(); raise(signum); return; - case LL_SMACKDOWN_SIGNAL: // Smackdown treated just like any other app termination, for now - if (LLApp::sLogInSignal) - { - llwarns << "Signal handler - Handling smackdown signal!" << llendl; - } - else - { - // Don't log anything, even errors - this is because this signal could happen anywhere. - LLError::setDefaultLevel(LLError::LEVEL_NONE); - } - - // Change the signal that we reraise to SIGABRT, so we generate a core dump. - signum = SIGABRT; - case SIGBUS: - case SIGSEGV: - case SIGQUIT: - if (LLApp::sLogInSignal) - { - llwarns << "Signal handler - Handling fatal signal!" << llendl; - } - if (LLApp::isError()) - { - // Received second fatal signal while handling first, just die right now - // Set the signal handlers back to default before handling the signal - this makes the next signal wipe out the app. - clear_signals(); - - if (LLApp::sLogInSignal) - { - llwarns << "Signal handler - Got another fatal signal while in the error handler, die now!" << llendl; - } - raise(signum); - return; - } - - if (LLApp::sLogInSignal) - { - llwarns << "Signal handler - Flagging error status and waiting for shutdown" << llendl; - } - // Flag status to ERROR, so thread_error does its work. - LLApp::setError(); - // Block in the signal handler until somebody says that we're done. - while (LLApp::sErrorThreadRunning && !LLApp::isStopped()) - { - ms_sleep(10); - } - - if (LLApp::sLogInSignal) - { - llwarns << "Signal handler - App is stopped, reraising signal" << llendl; - } - clear_signals(); - raise(signum); - return; case SIGINT: case SIGHUP: case SIGTERM: @@ -675,10 +630,76 @@ void default_unix_signal_handler(int signum, siginfo_t *info, void *) } LLApp::setQuitting(); return; + case SIGALRM: + case SIGPIPE: + case SIGUSR2: default: - if (LLApp::sLogInSignal) - { - llwarns << "Signal handler - Unhandled signal, ignoring!" << llendl; + if (signum == LL_SMACKDOWN_SIGNAL || + signum == SIGBUS || + signum == SIGILL || + signum == SIGFPE || + signum == SIGSEGV || + signum == SIGQUIT) + { + if (signum == LL_SMACKDOWN_SIGNAL) + { + // Smackdown treated just like any other app termination, for now + if (LLApp::sLogInSignal) + { + llwarns << "Signal handler - Handling smackdown signal!" << llendl; + } + else + { + // Don't log anything, even errors - this is because this signal could happen anywhere. + LLError::setDefaultLevel(LLError::LEVEL_NONE); + } + + // Change the signal that we reraise to SIGABRT, so we generate a core dump. + signum = SIGABRT; + } + + if (LLApp::sLogInSignal) + { + llwarns << "Signal handler - Handling fatal signal!" << llendl; + } + if (LLApp::isError()) + { + // Received second fatal signal while handling first, just die right now + // Set the signal handlers back to default before handling the signal - this makes the next signal wipe out the app. + clear_signals(); + + if (LLApp::sLogInSignal) + { + llwarns << "Signal handler - Got another fatal signal while in the error handler, die now!" << llendl; + } + raise(signum); + return; + } + + if (LLApp::sLogInSignal) + { + llwarns << "Signal handler - Flagging error status and waiting for shutdown" << llendl; + } + // Flag status to ERROR, so thread_error does its work. + LLApp::setError(); + // Block in the signal handler until somebody says that we're done. + while (LLApp::sErrorThreadRunning && !LLApp::isStopped()) + { + ms_sleep(10); + } + + if (LLApp::sLogInSignal) + { + llwarns << "Signal handler - App is stopped, reraising signal" << llendl; + } + clear_signals(); + raise(signum); + return; + } else { + if (LLApp::sLogInSignal) + { + llinfos << "Signal handler - Unhandled signal " << signum << ", ignoring!" << llendl; + } } } } diff --git a/indra/llcommon/llapp.h b/indra/llcommon/llapp.h index d64af62538..c199601c20 100644 --- a/indra/llcommon/llapp.h +++ b/indra/llcommon/llapp.h @@ -46,7 +46,8 @@ typedef void (*LLAppErrorHandler)(); typedef void (*LLAppChildCallback)(int pid, bool exited, int status); #if !LL_WINDOWS -extern const S32 LL_SMACKDOWN_SIGNAL; +extern S32 LL_SMACKDOWN_SIGNAL; +extern S32 LL_HEARTBEAT_SIGNAL; // Clear all of the signal handlers (which we want to do for the child process when we fork void clear_signals(); diff --git a/indra/llcommon/llheartbeat.cpp b/indra/llcommon/llheartbeat.cpp new file mode 100644 index 0000000000..782a4f7ff6 --- /dev/null +++ b/indra/llcommon/llheartbeat.cpp @@ -0,0 +1,165 @@ +/** + * @file llheartbeat.cpp + * @brief Class encapsulating logic for telling a watchdog that we live. + * + * $LicenseInfo:firstyear=2008&license=viewergpl$ + * + * Copyright (c) 2008, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlife.com/developers/opensource/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at http://secondlife.com/developers/opensource/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#include <errno.h> +#include <signal.h> + +#include "linden_common.h" +#include "llapp.h" + +#include "llheartbeat.h" + +LLHeartbeat::LLHeartbeat(F32 secs_between_heartbeat, + F32 aggressive_heartbeat_panic_secs, + F32 aggressive_heartbeat_max_blocking_secs) + : mSecsBetweenHeartbeat(secs_between_heartbeat), + mAggressiveHeartbeatPanicSecs(aggressive_heartbeat_panic_secs), + mAggressiveHeartbeatMaxBlockingSecs(aggressive_heartbeat_max_blocking_secs), + mSuppressed(false) +{ + mBeatTimer.reset(); + mBeatTimer.setTimerExpirySec(mSecsBetweenHeartbeat); + mPanicTimer.reset(); + mPanicTimer.setTimerExpirySec(mAggressiveHeartbeatPanicSecs); +} + +LLHeartbeat::~LLHeartbeat() +{ + // do nothing. +} + +void +LLHeartbeat::setSuppressed(bool is_suppressed) +{ + mSuppressed = is_suppressed; +} + +// returns 0 on success, -1 on permanent failure, 1 on temporary failure +int +LLHeartbeat::rawSend() +{ +#if LL_WINDOWS + return 0; // Pretend we succeeded. +#else + if (mSuppressed) + return 0; // Pretend we succeeded. + + union sigval dummy; + int result = sigqueue(getppid(), LL_HEARTBEAT_SIGNAL, dummy); + if (result == 0) + return 0; // success + + int err = errno; + if (err == EAGAIN) + return 1; // failed to queue, try again + + return -1; // other failure. +#endif +} + +int +LLHeartbeat::rawSendWithTimeout(F32 timeout_sec) +{ + int result = 0; + + // Spin tightly until our heartbeat is digested by the watchdog + // or we time-out. We don't really want to sleep because our + // wake-up time might be undesirably synchronised to a hidden + // clock by the system's scheduler. + mTimeoutTimer.reset(); + mTimeoutTimer.setTimerExpirySec(timeout_sec); + do { + result = rawSend(); + //llinfos << " HEARTSENDc=" << result << llendl; + } while (result==1 && !mTimeoutTimer.hasExpired()); + + return result; +} + +bool +LLHeartbeat::send(F32 timeout_sec) +{ + bool total_success = false; + int result = 1; + + if (timeout_sec > 0.f) { + // force a spin until success or timeout + result = rawSendWithTimeout(timeout_sec); + } else { + if (mBeatTimer.hasExpired()) { + // zero-timeout; we don't care too much whether our + // heartbeat was digested. + result = rawSend(); + //llinfos << " HEARTSENDb=" << result << llendl; + } + } + + if (result == -1) { + // big failure. + } else if (result == 0) { + total_success = true; + } else { + // need to retry at some point + } + + if (total_success) { + mBeatTimer.reset(); + mBeatTimer.setTimerExpirySec(mSecsBetweenHeartbeat); + // reset the time until we start panicking about lost + // heartbeats again. + mPanicTimer.reset(); + mPanicTimer.setTimerExpirySec(mAggressiveHeartbeatPanicSecs); + } else { + // leave mBeatTimer as expired so we'll lazily poke the + // watchdog again next time through. + } + + if (mPanicTimer.hasExpired()) { + // It's been ages since we successfully had a heartbeat + // digested by the watchdog. Sit here and spin a while + // in the hope that we can force it through. + llwarns << "Unable to deliver heartbeat to launcher for " << mPanicTimer.getElapsedTimeF32() << " seconds. Going to try very hard for up to " << mAggressiveHeartbeatMaxBlockingSecs << " seconds." << llendl; + result = rawSendWithTimeout(mAggressiveHeartbeatMaxBlockingSecs); + if (result == 0) { + total_success = true; + } else { + // we couldn't even force it through. That's bad, + // but we'll try again in a while. + llwarns << "Could not deliver heartbeat to launcher even after trying very hard for " << mAggressiveHeartbeatMaxBlockingSecs << " seconds." << llendl; + } + + // in any case, reset the panic timer. + mPanicTimer.reset(); + mPanicTimer.setTimerExpirySec(mAggressiveHeartbeatPanicSecs); + } + + return total_success; +} diff --git a/indra/llcommon/llheartbeat.h b/indra/llcommon/llheartbeat.h new file mode 100644 index 0000000000..0761642e68 --- /dev/null +++ b/indra/llcommon/llheartbeat.h @@ -0,0 +1,73 @@ +/** + * @file llheartbeat.h + * @brief Class encapsulating logic for telling a watchdog that we live. + * + * $LicenseInfo:firstyear=2008&license=viewergpl$ + * + * Copyright (c) 2008, Linden Research, Inc. + * + * Second Life Viewer Source Code + * The source code in this file ("Source Code") is provided by Linden Lab + * to you under the terms of the GNU General Public License, version 2.0 + * ("GPL"), unless you have obtained a separate licensing agreement + * ("Other License"), formally executed by you and Linden Lab. Terms of + * the GPL can be found in doc/GPL-license.txt in this distribution, or + * online at http://secondlife.com/developers/opensource/gplv2 + * + * There are special exceptions to the terms and conditions of the GPL as + * it is applied to this Source Code. View the full text of the exception + * in the file doc/FLOSS-exception.txt in this software distribution, or + * online at http://secondlife.com/developers/opensource/flossexception + * + * By copying, modifying or distributing this software, you acknowledge + * that you have read and understood your obligations described above, + * and agree to abide by those obligations. + * + * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO + * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY, + * COMPLETENESS OR PERFORMANCE. + * $/LicenseInfo$ + */ + +#ifndef LL_LLHEARTBEAT_H +#define LL_LLHEARTBEAT_H + +#include "linden_common.h" + +#include "lltimer.h" + +// Note: Win32 does not support the heartbeat/smackdown system; +// heartbeat-delivery turns into a no-op there. + +class LLHeartbeat +{ +public: + // secs_between_heartbeat: after a heartbeat is successfully delivered, + // we suppress sending more for this length of time. + // aggressive_heartbeat_panic_secs: if we've been failing to + // successfully deliver heartbeats for this length of time then + // we block for a while until we're really sure we got one delivered. + // aggressive_heartbeat_max_blocking_secs: this is the length of + // time we block for when we're aggressively ensuring that a 'panic' + // heartbeat was delivered. + LLHeartbeat(F32 secs_between_heartbeat = 5.0f, + F32 aggressive_heartbeat_panic_secs = 10.0f, + F32 aggressive_heartbeat_max_blocking_secs = 4.0f); + ~LLHeartbeat(); + + bool send(F32 timeout_sec = 0.0f); + void setSuppressed(bool is_suppressed); + +private: + int rawSend(); + int rawSendWithTimeout(F32 timeout_sec); + F32 mSecsBetweenHeartbeat; + F32 mAggressiveHeartbeatPanicSecs; + F32 mAggressiveHeartbeatMaxBlockingSecs; + bool mSuppressed; + LLTimer mBeatTimer; + LLTimer mPanicTimer; + LLTimer mTimeoutTimer; +}; + +#endif // LL_HEARTBEAT_H diff --git a/indra/llcommon/lltimer.cpp b/indra/llcommon/lltimer.cpp index 9786d44899..cf984e4fe2 100644 --- a/indra/llcommon/lltimer.cpp +++ b/indra/llcommon/lltimer.cpp @@ -39,10 +39,8 @@ # define WIN32_LEAN_AND_MEAN # include <winsock2.h> # include <windows.h> -#elif LL_LINUX || LL_SOLARIS -# include <sys/time.h> -# include <sched.h> -#elif LL_DARWIN +#elif LL_LINUX || LL_SOLARIS || LL_DARWIN +# include <errno.h> # include <sys/time.h> #else # error "architecture not supported" @@ -81,42 +79,55 @@ U64 gLastTotalTimeClockCount = 0; //--------------------------------------------------------------------------- #if LL_WINDOWS -void ms_sleep(long ms) +void ms_sleep(U32 ms) { - Sleep((U32)ms); + Sleep(ms); } - -void llyield() +#elif LL_LINUX || LL_SOLARIS || LL_DARWIN +void ms_sleep(U32 ms) { - SleepEx(0, TRUE); // Relinquishes time slice to any thread of equal priority, can be woken up by extended IO functions -} -#elif LL_LINUX || LL_SOLARIS -void ms_sleep(long ms) -{ - struct timespec t; - t.tv_sec = ms / 1000; - t.tv_nsec = (ms % 1000) * 1000000l; - nanosleep(&t, NULL); -} + long mslong = ms; // tv_nsec is a long + struct timespec thiswait, nextwait; + bool sleep_more = false; -void llyield() -{ - sched_yield(); -} -#elif LL_DARWIN -void ms_sleep(long ms) -{ - struct timespec t; - t.tv_sec = ms / 1000; - t.tv_nsec = (ms % 1000) * 1000000l; - nanosleep(&t, NULL); -} + thiswait.tv_sec = ms / 1000; + thiswait.tv_nsec = (mslong % 1000) * 1000000l; + do { + int result = nanosleep(&thiswait, &nextwait); -void llyield() -{ -// sched_yield(); + // check if sleep was interrupted by a signal; unslept + // remainder was written back into 't' and we just nanosleep + // again. + sleep_more = (result == -1 && EINTR == errno); + + if (sleep_more) + { + if ( nextwait.tv_sec > thiswait.tv_sec || + (nextwait.tv_sec == thiswait.tv_sec && + nextwait.tv_nsec >= thiswait.tv_nsec) ) + { + // if the remaining time isn't actually going + // down then we're being shafted by low clock + // resolution - manually massage the sleep time + // downward. + if (nextwait.tv_nsec > 1000000) { + // lose 1ms + nextwait.tv_nsec -= 1000000; + } else { + if (nextwait.tv_sec == 0) { + // already so close to finished + sleep_more = false; + } else { + // lose up to 1ms + nextwait.tv_nsec = 0; + } + } + } + thiswait = nextwait; + } + } while (sleep_more); } -#else +#else # error "architecture not supported" #endif diff --git a/indra/llcommon/lltimer.h b/indra/llcommon/lltimer.h index 647f042828..113eb1e9e3 100644 --- a/indra/llcommon/lltimer.h +++ b/indra/llcommon/lltimer.h @@ -112,12 +112,8 @@ U64 get_clock_count(); F64 calc_clock_frequency(U32 msecs); void update_clock_frequencies(); - // Sleep for milliseconds -void ms_sleep(long ms); - -// Yield -//void llyield(); // Yield your timeslice - not implemented yet for Mac, so commented out. +void ms_sleep(U32 ms); // Returns the correct UTC time in seconds, like time(NULL). // Useful on the viewer, which may have its local clock set wrong. |