Commit 425fe06d authored by Adam Wujek's avatar Adam Wujek 💬

userspace: save which process caused restart triggered by monit

When a process is restarted more than 5 times within 10*10 seconds (10 cycles,
each 10 seconds), monit triggers a system restart. Save the name of the process
that caused the restart. Store it in flash so that it survives the restart, then
move it to /tmp during the next boot.
Signed-off-by: Adam Wujek <adam.wujek@cern.ch>
parent 6ee69278
...@@ -1841,6 +1841,10 @@ switch. Check is done every 10 seconds. As for now supervised processes are: ...@@ -1841,6 +1841,10 @@ switch. Check is done every 10 seconds. As for now supervised processes are:
In case any of the supervised processes does not run anymore (because of a crash, In case any of the supervised processes does not run anymore (because of a crash,
exit etc.), @t{monit} restarts missing process. If 5 restarts of a process exit etc.), @t{monit} restarts missing process. If 5 restarts of a process
occur during 10 cycles (10*10 seconds), the entire switch is restarted. occur during 10 cycles (10*10 seconds), the entire switch is restarted.
The name of the process that caused the restart is saved in the file
@t{/update/monit_restart_reason} on the flash partition. During the next boot
this file is moved to @t{/tmp/monit_restart_reason}, where it can be read. Since
@t{/tmp} is not persistent, the file with the restart reason is lost on the
following boot.
Since @t{monit} is started from the inittab, even if @t{monit} crashes for some Since @t{monit} is started from the inittab, even if @t{monit} crashes for some
reason it will be re-spawned by the @t{init}. reason it will be re-spawned by the @t{init}.
......
...@@ -509,6 +509,7 @@ between devices connected to the ports.\\ ...@@ -509,6 +509,7 @@ between devices connected to the ports.\\
\item [] \underline{SNMP objects}:\\ \item [] \underline{SNMP objects}:\\
\texttt{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\ \texttt{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\
\texttt{WR-SWITCH-MIB::wrsRestartReason}\\ \texttt{WR-SWITCH-MIB::wrsRestartReason}\\
\texttt{WR-SWITCH-MIB::wrsRestartReasonMonit}\\
\texttt{WR-SWITCH-MIB::wrsConfigSource}\\ \texttt{WR-SWITCH-MIB::wrsConfigSource}\\
\texttt{WR-SWITCH-MIB::wrsConfigSourceHost}\\ \texttt{WR-SWITCH-MIB::wrsConfigSourceHost}\\
\texttt{WR-SWITCH-MIB::wrsConfigSourceFilename}\\ \texttt{WR-SWITCH-MIB::wrsConfigSourceFilename}\\
...@@ -545,7 +546,7 @@ between devices connected to the ports.\\ ...@@ -545,7 +546,7 @@ between devices connected to the ports.\\
\subsubsection{\bf Any userspace daemon has crashed/restarted} \subsubsection{\bf Any userspace daemon has crashed/restarted}
\label{fail:other:daemon_crash} \label{fail:other:daemon_crash}
\begin{packed_enum} \begin{packed_enum}
\item [] \underline{Status}: QUESTION, TODO \emph{(depends on monit)} \item [] \underline{Status}: TODO \emph{(depends on monit)}
\item [] \underline{Severity}: ERROR / WARNING (depending on the process) \item [] \underline{Severity}: ERROR / WARNING (depending on the process)
\item [] \underline{Description}:\\ \item [] \underline{Description}:\\
Running processes are monitored by \texttt{Monit}. When any of them crash, Running processes are monitored by \texttt{Monit}. When any of them crash,
......
...@@ -202,6 +202,8 @@ following values: ...@@ -202,6 +202,8 @@ following values:
\item \texttt{wrsGwWatchdogTimeouts} -- Number of times the watchdog \item \texttt{wrsGwWatchdogTimeouts} -- Number of times the watchdog
has restarted the HDL module responsible for the Ethernet has restarted the HDL module responsible for the Ethernet
switching process (issue \ref{fail:other:hdl_freeze}). switching process (issue \ref{fail:other:hdl_freeze}).
\item \texttt{wrsRestartReasonMonit} -- Process that caused
\texttt{monit} to trigger restart.
\end{itemize} \end{itemize}
\item \texttt{wrsTemperatureGroup} \item \texttt{wrsTemperatureGroup}
\begin{itemize} \begin{itemize}
......
...@@ -2,4 +2,5 @@ ...@@ -2,4 +2,5 @@
# simple wrapper for monit to announce reboot to console # simple wrapper for monit to announce reboot to console
echo "Monit triggered reboot due to $1" > /dev/console echo "Monit triggered reboot due to $1" > /dev/console
echo "$1" > /update/monit_restart_reason
/sbin/reboot /sbin/reboot
...@@ -2,6 +2,9 @@ ...@@ -2,6 +2,9 @@
export WR_HOME="/wr" export WR_HOME="/wr"
LOAD_FPGA_STATUS_FILE="/tmp/load_fpga_status" LOAD_FPGA_STATUS_FILE="/tmp/load_fpga_status"
LOAD_LM32_STATUS_FILE="/tmp/load_lm32_status" LOAD_LM32_STATUS_FILE="/tmp/load_lm32_status"
#files for monit's restart reason
MONIT_RR_FLASH="/update/monit_restart_reason"
MONIT_RR_TMP="/tmp/monit_restart_reason"
# Get parameter from kernel commandline # Get parameter from kernel commandline
for arg in $(cat /proc/cmdline); do for arg in $(cat /proc/cmdline); do
...@@ -11,6 +14,13 @@ for arg in $(cat /proc/cmdline); do ...@@ -11,6 +14,13 @@ for arg in $(cat /proc/cmdline); do
fi; fi;
done done
# handle monit's restart reason
# no need to remove $MONIT_RR_TMP, since tmp is not persistent
if [ -f "$MONIT_RR_FLASH" ]; then
# move restart reason to tmp so there is no need to remove it later
mv -f "$MONIT_RR_FLASH" "$MONIT_RR_TMP"
fi
# Obtain the type of FPGA (LX130XT or LX240XT) # Obtain the type of FPGA (LX130XT or LX240XT)
tfpga=$($WR_HOME/bin/wrs_version -F) tfpga=$($WR_HOME/bin/wrs_version -F)
if [ "$tfpga" = "UNKNOWN" ]; then if [ "$tfpga" = "UNKNOWN" ]; then
......
...@@ -535,7 +535,8 @@ wrsRestartReason OBJECT-TYPE ...@@ -535,7 +535,8 @@ wrsRestartReason OBJECT-TYPE
wakeUpReset(3), wakeUpReset(3),
watchdogReset(4), watchdogReset(4),
softwareReset(5), softwareReset(5),
userReset(6) userReset(6),
restartByMonit(7)
} }
MAX-ACCESS read-only MAX-ACCESS read-only
STATUS current STATUS current
...@@ -546,7 +547,9 @@ wrsRestartReason OBJECT-TYPE ...@@ -546,7 +547,9 @@ wrsRestartReason OBJECT-TYPE
wakeUpReset(3) - VDDCORE rising wakeUpReset(3) - VDDCORE rising
watchdogReset(4) - Watchdog fault occurred watchdogReset(4) - Watchdog fault occurred
softwareReset(5) - Processor reset required by the software (system reboot) softwareReset(5) - Processor reset required by the software (system reboot)
userReset(6) - NRST pin detected low (reset button)" userReset(6) - NRST pin detected low (reset button)
restartByMonit(7) - Restart caused by Monit, please check wrsRestartReasonMonit
for program that caused problems"
::= { wrsBootStatusGroup 3 } ::= { wrsBootStatusGroup 3 }
wrsFaultIP OBJECT-TYPE wrsFaultIP OBJECT-TYPE
...@@ -713,6 +716,14 @@ wrsGwWatchdogTimeouts OBJECT-TYPE ...@@ -713,6 +716,14 @@ wrsGwWatchdogTimeouts OBJECT-TYPE
for the Ethernet switching process." for the Ethernet switching process."
::= { wrsBootStatusGroup 15 } ::= { wrsBootStatusGroup 15 }
-- wrsRestartReasonMonit (wrsBootStatusGroup 16)
-- Read-only string object; see wrsRestartReason value restartByMonit(7).
wrsRestartReasonMonit OBJECT-TYPE
SYNTAX DisplayString (SIZE (0..32))
MAX-ACCESS read-only
STATUS current
DESCRIPTION
"Program's name that monit failed to start several times"
::= { wrsBootStatusGroup 16 }
-- wrsTemperatureGroup (.7.1.3) -- wrsTemperatureGroup (.7.1.3)
wrsTemperatureGroup OBJECT IDENTIFIER ::= { wrsOperationStatus 3 } wrsTemperatureGroup OBJECT IDENTIFIER ::= { wrsOperationStatus 3 }
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include "wrsBootStatusGroup.h" #include "wrsBootStatusGroup.h"
#define BOOTCOUNT_FILE "/proc/wrs-bootcount" #define BOOTCOUNT_FILE "/proc/wrs-bootcount"
#define MONIT_REASON_FILE "/tmp/monit_restart_reason"
#define DOTCONFIGDIR "/tmp" #define DOTCONFIGDIR "/tmp"
#define DOTCONFIG_PROTO "dot-config_proto" #define DOTCONFIG_PROTO "dot-config_proto"
...@@ -46,6 +47,7 @@ static struct pickinfo wrsBootStatus_pickinfo[] = { ...@@ -46,6 +47,7 @@ static struct pickinfo wrsBootStatus_pickinfo[] = {
FIELD(wrsBootStatus_s, ASN_INTEGER, wrsBootKernelModulesMissing), FIELD(wrsBootStatus_s, ASN_INTEGER, wrsBootKernelModulesMissing),
FIELD(wrsBootStatus_s, ASN_INTEGER, wrsBootUserspaceDaemonsMissing), FIELD(wrsBootStatus_s, ASN_INTEGER, wrsBootUserspaceDaemonsMissing),
FIELD(wrsBootStatus_s, ASN_COUNTER, wrsGwWatchdogTimeouts), FIELD(wrsBootStatus_s, ASN_COUNTER, wrsGwWatchdogTimeouts),
FIELD(wrsBootStatus_s, ASN_OCTET_STR, wrsRestartReasonMonit),
}; };
struct wrsBootStatus_s wrsBootStatus_s; struct wrsBootStatus_s wrsBootStatus_s;
...@@ -164,6 +166,18 @@ static void get_boot_info(void){ ...@@ -164,6 +166,18 @@ static void get_boot_info(void){
snprintf(wrsBootStatus_s.wrsFaultLR, snprintf(wrsBootStatus_s.wrsFaultLR,
sizeof(wrsBootStatus_s.wrsFaultLR), "0x%.8x", sizeof(wrsBootStatus_s.wrsFaultLR), "0x%.8x",
boot_info[3].value); boot_info[3].value);
/* try to find whether monit caused restart */
f = fopen(MONIT_REASON_FILE, "r");
if (f) {
/* if MONIT_REASON_FILE exists, the last restart was
* triggered by monit */
wrsBootStatus_s.wrsRestartReason = WRS_RESTART_REASON_MONIT;
/* try to get program that caused restart */
fscanf(f, LINE_READ_LEN(31),
wrsBootStatus_s.wrsRestartReasonMonit);
fclose(f);
}
} }
static void get_dotconfig_source(void) static void get_dotconfig_source(void)
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#define WRSBOOTSTATUS_OID WRS_OID, 7, 1, 2 #define WRSBOOTSTATUS_OID WRS_OID, 7, 1, 2
#define WRS_RESTART_REASON_ERROR 1 /* error */ #define WRS_RESTART_REASON_ERROR 1 /* error */
#define WRS_RESTART_REASON_MONIT 7 /* ok */
#define WRS_CONFIG_SOURCE_HOST_LEN 64 #define WRS_CONFIG_SOURCE_HOST_LEN 64
#define WRS_CONFIG_SOURCE_FILENAME_LEN 128 #define WRS_CONFIG_SOURCE_FILENAME_LEN 128
...@@ -54,6 +55,7 @@ struct wrsBootStatus_s { ...@@ -54,6 +55,7 @@ struct wrsBootStatus_s {
int32_t wrsBootKernelModulesMissing; int32_t wrsBootKernelModulesMissing;
int32_t wrsBootUserspaceDaemonsMissing; int32_t wrsBootUserspaceDaemonsMissing;
int32_t wrsGwWatchdogTimeouts; int32_t wrsGwWatchdogTimeouts;
char wrsRestartReasonMonit[32];
}; };
extern struct wrsBootStatus_s wrsBootStatus_s; extern struct wrsBootStatus_s wrsBootStatus_s;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment