Commit 425fe06d authored by Adam Wujek

userspace: save which process caused restart triggered by monit

When a process is restarted more than 5 times within 10*10 seconds (10 cycles,
each 10 seconds), monit triggers a system restart. Save the name of the process
that caused the restart. Store it in flash so that it survives the restart, then
move it to /tmp at the next boot.
Signed-off-by: Adam Wujek <adam.wujek@cern.ch>
parent 6ee69278
......@@ -1841,6 +1841,10 @@ switch. Check is done every 10 seconds. As for now supervised processes are:
In case any of the supervised processes does not run anymore (because of a crash,
exit etc.), @t{monit} restarts missing process. If 5 restarts of a process
occur during 10 cycles (10*10 seconds), the entire switch is restarted.
The name of the process that caused the restart is saved in the file
@t{/update/monit_restart_reason} on the flash partition. After the next boot this
file is moved to @t{/tmp/monit_restart_reason}, where it can be read. Since
@t{/tmp} is not persistent, the file with the restart reason is lost after the
following boot.
Since @t{monit} is started from the inittab, even if @t{monit} crashes for some
reason it will be re-spawned by the @t{init}.
......
......@@ -509,6 +509,7 @@ between devices connected to the ports.\\
\item [] \underline{SNMP objects}:\\
\texttt{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\
\texttt{WR-SWITCH-MIB::wrsRestartReason}\\
\texttt{WR-SWITCH-MIB::wrsRestartReasonMonit}\\
\texttt{WR-SWITCH-MIB::wrsConfigSource}\\
\texttt{WR-SWITCH-MIB::wrsConfigSourceHost}\\
\texttt{WR-SWITCH-MIB::wrsConfigSourceFilename}\\
......@@ -545,7 +546,7 @@ between devices connected to the ports.\\
\subsubsection{\bf Any userspace daemon has crashed/restarted}
\label{fail:other:daemon_crash}
\begin{packed_enum}
\item [] \underline{Status}: QUESTION, TODO \emph{(depends on monit)}
\item [] \underline{Status}: TODO \emph{(depends on monit)}
\item [] \underline{Severity}: ERROR / WARNING (depending on the process)
\item [] \underline{Description}:\\
Running processes are monitored by \texttt{Monit}. When any of them crash,
......
......@@ -202,6 +202,8 @@ following values:
\item \texttt{wrsGwWatchdogTimeouts} -- Number of times the watchdog
has restarted the HDL module responsible for the Ethernet
switching process (issue \ref{fail:other:hdl_freeze}).
\item \texttt{wrsRestartReasonMonit} -- Name of the process that caused
\texttt{monit} to trigger a restart.
\end{itemize}
\item \texttt{wrsTemperatureGroup}
\begin{itemize}
......
......@@ -2,4 +2,5 @@
# simple wrapper for monit to announce reboot to console
echo "Monit triggered reboot due to $1" > /dev/console
echo "$1" > /update/monit_restart_reason
/sbin/reboot
......@@ -2,6 +2,9 @@
export WR_HOME="/wr"
LOAD_FPGA_STATUS_FILE="/tmp/load_fpga_status"
LOAD_LM32_STATUS_FILE="/tmp/load_lm32_status"
#files for monit's restart reason
MONIT_RR_FLASH="/update/monit_restart_reason"
MONIT_RR_TMP="/tmp/monit_restart_reason"
# Get parameter from kernel commandline
for arg in $(cat /proc/cmdline); do
......@@ -11,6 +14,13 @@ for arg in $(cat /proc/cmdline); do
fi;
done
# handle monit's restart reason
# no need to remove $MONIT_RR_TMP, since tmp is not persistent
if [ -f "$MONIT_RR_FLASH" ]; then
# move restart reason to tmp so there is no need to remove it later
mv -f "$MONIT_RR_FLASH" "$MONIT_RR_TMP"
fi
# Obtain the type of FPGA (LX130XT or LX240XT)
tfpga=$($WR_HOME/bin/wrs_version -F)
if [ "$tfpga" = "UNKNOWN" ]; then
......
......@@ -535,7 +535,8 @@ wrsRestartReason OBJECT-TYPE
wakeUpReset(3),
watchdogReset(4),
softwareReset(5),
userReset(6)
userReset(6),
restartByMonit(7)
}
MAX-ACCESS read-only
STATUS current
......@@ -546,7 +547,9 @@ wrsRestartReason OBJECT-TYPE
wakeUpReset(3) - VDDCORE rising
watchdogReset(4) - Watchdog fault occurred
softwareReset(5) - Processor reset required by the software (system reboot)
userReset(6) - NRST pin detected low (reset button)"
userReset(6) - NRST pin detected low (reset button)
restartByMonit(7) - Restart caused by Monit, please check wrsRestartReasonMonit
for program that caused problems"
::= { wrsBootStatusGroup 3 }
wrsFaultIP OBJECT-TYPE
......@@ -713,6 +716,14 @@ wrsGwWatchdogTimeouts OBJECT-TYPE
for the Ethernet switching process."
::= { wrsBootStatusGroup 15 }
wrsRestartReasonMonit OBJECT-TYPE
SYNTAX DisplayString (SIZE (0..32))
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"Program's name that monit failed to start several times"
::= { wrsBootStatusGroup 16 }
-- wrsTemperatureGroup (.7.1.3)
wrsTemperatureGroup OBJECT IDENTIFIER ::= { wrsOperationStatus 3 }
......
......@@ -3,6 +3,7 @@
#include "wrsBootStatusGroup.h"
#define BOOTCOUNT_FILE "/proc/wrs-bootcount"
#define MONIT_REASON_FILE "/tmp/monit_restart_reason"
#define DOTCONFIGDIR "/tmp"
#define DOTCONFIG_PROTO "dot-config_proto"
......@@ -46,6 +47,7 @@ static struct pickinfo wrsBootStatus_pickinfo[] = {
FIELD(wrsBootStatus_s, ASN_INTEGER, wrsBootKernelModulesMissing),
FIELD(wrsBootStatus_s, ASN_INTEGER, wrsBootUserspaceDaemonsMissing),
FIELD(wrsBootStatus_s, ASN_COUNTER, wrsGwWatchdogTimeouts),
FIELD(wrsBootStatus_s, ASN_OCTET_STR, wrsRestartReasonMonit),
};
struct wrsBootStatus_s wrsBootStatus_s;
......@@ -164,6 +166,18 @@ static void get_boot_info(void){
snprintf(wrsBootStatus_s.wrsFaultLR,
sizeof(wrsBootStatus_s.wrsFaultLR), "0x%.8x",
boot_info[3].value);
/* try to find whether monit caused restart */
f = fopen(MONIT_REASON_FILE, "r");
if (f) {
/* when MONIT_REASON_FILE exists means that last restart was
* triggered by monit */
wrsBootStatus_s.wrsRestartReason = WRS_RESTART_REASON_MONIT;
/* try to get program that caused restart */
fscanf(f, LINE_READ_LEN(31),
wrsBootStatus_s.wrsRestartReasonMonit);
fclose(f);
}
}
static void get_dotconfig_source(void)
......
......@@ -5,6 +5,7 @@
#define WRSBOOTSTATUS_OID WRS_OID, 7, 1, 2
#define WRS_RESTART_REASON_ERROR 1 /* error */
#define WRS_RESTART_REASON_MONIT 7 /* ok */
#define WRS_CONFIG_SOURCE_HOST_LEN 64
#define WRS_CONFIG_SOURCE_FILENAME_LEN 128
......@@ -54,6 +55,7 @@ struct wrsBootStatus_s {
int32_t wrsBootKernelModulesMissing;
int32_t wrsBootUserspaceDaemonsMissing;
int32_t wrsGwWatchdogTimeouts;
char wrsRestartReasonMonit[32];
};
extern struct wrsBootStatus_s wrsBootStatus_s;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment