From 9b4d19ebd3e5a91b5f22db61b975fd3c7d92cb7b Mon Sep 17 00:00:00 2001 From: Adam Wujek <adam.wujek@cern.ch> Date: Fri, 22 Jan 2016 17:39:11 +0100 Subject: [PATCH] doc/wrs_failures: add statuses' objects to problems Signed-off-by: Adam Wujek <adam.wujek@cern.ch> --- doc/wrs_failures/fail.tex | 132 +++++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 37 deletions(-) diff --git a/doc/wrs_failures/fail.tex b/doc/wrs_failures/fail.tex index aaf3c4bfc..61878a921 100644 --- a/doc/wrs_failures/fail.tex +++ b/doc/wrs_failures/fail.tex @@ -19,7 +19,10 @@ WR network. \snmpadd{WR-SWITCH-MIB::wrsPtpServoState.<n>} -- PTP servo state as string\\ \snmpadd{WR-SWITCH-MIB::wrsPtpServoStateN.<n>} -- PTP servo state as number\\ \snmpadd{WR-SWITCH-MIB::wrsPtpServoStateErrCnt.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsPTPStatus} + \snmpadd{WR-SWITCH-MIB::wrsPTPStatus} \\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} + \item [] \underline{Note}: PTP servo state is exported as a string and a number. \end{pck_descr} @@ -38,7 +41,9 @@ WR network. \snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetPsHR.<n>} -- 32-bit signed value of the offset in ps; with saturation on overflow and underflow\\ \snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetErrCnt.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsPTPStatus} + \snmpadd{WR-SWITCH-MIB::wrsPTPStatus} \\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Detected jump in the RTT value calculated by \emph{PTP/PPSi}} @@ -55,7 +60,9 @@ WR network. 
\item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsPtpRTT.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPtpRTTErrCnt.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsPTPStatus} + \snmpadd{WR-SWITCH-MIB::wrsPTPStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Wrong $\Delta_{TXM}$, $\Delta_{RXM}$, $\Delta_{TXS}$, @@ -75,7 +82,9 @@ WR network. \snmpadd{WR-SWITCH-MIB::wrsPtpDeltaRxM.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPtpDeltaTxS.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPtpDeltaRxS.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsPTPStatus} + \snmpadd{WR-SWITCH-MIB::wrsPTPStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf \emph{SoftPLL} became unlocked} @@ -101,7 +110,9 @@ WR network. \snmpadd{WR-SWITCH-MIB::wrsSpllHlock}\\ \snmpadd{WR-SWITCH-MIB::wrsSpllMlock}\\ \snmpadd{WR-SWITCH-MIB::wrsSpllDelCnt}\\ - \snmpadd{WR-SWITCH-MIB::wrsSoftPLLStatus} + \snmpadd{WR-SWITCH-MIB::wrsSoftPLLStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf \emph{SoftPLL} has crashed/restarted} @@ -140,7 +151,9 @@ WR network. \item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus} + \snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Link to WR Master is up for master} @@ -156,7 +169,9 @@ WR network. 
\item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus} + \snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf PTP frames don't reach ARM} @@ -181,7 +196,9 @@ WR network. \snmpadd{WR-SWITCH-MIB::wrsPortStatusPtpRxFrames.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsPTPFramesFlowing} + \snmpadd{WR-SWITCH-MIB::wrsPTPFramesFlowing}\\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \item [] \underline{Note}: If the kernel driver crashes, there is not much we can do. We end up with either our system frozen or a reboot. For wrong VLAN configuration and HDL problems we can monitor if PTP frames @@ -213,7 +230,9 @@ WR network. \snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpInDB.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpGbE.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpError.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsSFPsStatus} + \snmpadd{WR-SWITCH-MIB::wrsSFPsStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \item [] \underline{Note}: WRS configuration allow to disable this check on some ports. That is because ports may be used for regular (non-WR) PTP synchronization or for data transfer only (no timing). In that case any @@ -235,7 +254,10 @@ WR network. 
\item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsStartCntPTP}\\ \snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\ - \snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>} + \snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\ + \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf \emph{HAL} process has crashed/restarted} @@ -251,7 +273,10 @@ WR network. \item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsStartCntHAL}\\ \snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\ - \snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>} + \snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\ + \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Wrong configuration applied} @@ -336,7 +361,10 @@ between devices connected to the ports.\\ However, we are not able to distinguish between them inside the switch. 
\item [] \underline{SNMP objects}:\\ \snmpadd{IF-MIB::ifOperStatus.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>} + \snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\ + \snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Fault in the Endpoint's transmission/reception path} @@ -356,7 +384,9 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPfilterDropped.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPCSErrors.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPstatsRXCRCErrors.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsEndpointStatus} + \snmpadd{WR-SWITCH-MIB::wrsEndpointStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Problem with the SwCore or Endpoint HDL module} @@ -376,7 +406,9 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsGwWatchdogTimeouts}\\ \snmpadd{WR-SWITCH-MIB::wrsPstatsTXFrames.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPstatsForwarded.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus} + \snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \item [] \underline{Note}: For Endpoint monitoring we could compare per-port \emph{RTUfwd} counter with the \emph{Tx} Endpoint counter for each port. \emph{RTUfwd} counts all forwarding decisions from RTU to the @@ -396,7 +428,9 @@ between devices connected to the ports.\\ Rx path of the Endpoint. 
\item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsPstatsRXDropRTUFull.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsRTUStatus} + \snmpadd{WR-SWITCH-MIB::wrsRTUStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Too much HP traffic / Per-priority queue full} @@ -412,17 +446,19 @@ between devices connected to the ports.\\ unacceptable. \item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsPstatsFastMatchPriority.<n>} - HP frames on a port\\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXFrames.<n>} - Total number of Rx frames on + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXFrames.<n>} -- Total number of Rx frames on the port\\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio0.<n>} - Rx priorities 0-7\\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio1.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio2.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio3.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio4.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio5.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio6.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio7.<n>} \\ - \snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus} + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio0.<n>} -- Rx priority 0\\ + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio1.<n>} -- Rx priority 1\\ + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio2.<n>} -- Rx priority 2\\ + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio3.<n>} -- Rx priority 3\\ + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio4.<n>} -- Rx priority 4\\ + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio5.<n>} -- Rx priority 5\\ + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio6.<n>} -- Rx priority 6\\ + \snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio7.<n>} -- Rx priority 7\\ + \snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \item [] \underline{Note}: we need to get from SwCore the information about per-priority queue utilization, or at least an event when 
it's full. @@ -444,7 +480,10 @@ between devices connected to the ports.\\ \item [] \underline{SNMP objects}:\\ \snmpadd{WR-SWITCH-MIB::wrsStartCntRTUd}\\ \snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\ - \snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>} + \snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\ + \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Network loop - two or more identical MACs on two or more ports} @@ -515,7 +554,6 @@ between devices connected to the ports.\\ \item status of starting userspace daemons \end{itemize} \item [] \underline{SNMP objects}:\\ - \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\ \snmpadd{WR-SWITCH-MIB::wrsRestartReason}\\ \snmpadd{WR-SWITCH-MIB::wrsRestartReasonMonit}\\ \snmpadd{WR-SWITCH-MIB::wrsConfigSource}\\ @@ -524,7 +562,10 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsBootLoadFPGA}\\ \snmpadd{WR-SWITCH-MIB::wrsBootLoadLM32}\\ \snmpadd{WR-SWITCH-MIB::wrsBootKernelModulesMissing}\\ - \snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing} + \snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\ + \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \item [] \underline{Note}: The idea is to reboot the system if it was not able to boot correctly. Then we use the scratchpad registers of the processor to keep @@ -546,8 +587,6 @@ between devices connected to the ports.\\ it has to be verified before being applied. If downloading or verification has failed an alarm is raised. 
\item [] \underline{SNMP objects}:\\ - \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing - whether switch booted correctly\\ \snmpadd{WR-SWITCH-MIB::wrsConfigSource} -- source of a dot-config, local, remote or get URL to the dot-config via DHCP. When \texttt{wrsConfigSource} is set to the \texttt{tryDhcp}, then failure of @@ -555,7 +594,11 @@ between devices connected to the ports.\\ \texttt{wrsBootSuccessful}\\ \snmpadd{WR-SWITCH-MIB::wrsConfigSourceUrl} -- path to the dot-config on a server (if not local)\\ - \snmpadd{WR-SWITCH-MIB::wrsBootConfigStatus} -- result of the dot-config verification + \snmpadd{WR-SWITCH-MIB::wrsBootConfigStatus} -- result of the dot-config verification\\ + \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing + whether switch booted correctly\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Any userspace daemon has crashed/restarted} @@ -580,7 +623,9 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsStartCntWrsWatchdog}\\ \snmpadd{WR-SWITCH-MIB::wrsStartCntSPLL} \emph{(not implemented)}\\ \snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing} - number of missing processes\\ - \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} - status word informing whether switch booted correctly + \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \item [] \underline{Note}: We shall distinguish between crucial processes - error should be reported if one of them crashes; and less important processes (warning should be reported if they crash).
If any @@ -640,7 +685,10 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsRebootCnt}\\ \snmpadd{WR-SWITCH-MIB::wrsRestartReason}\\ \snmpadd{WR-SWITCH-MIB::wrsFaultIP} \emph{(not implemented)}\\ - \snmpadd{WR-SWITCH-MIB::wrsFaultLR} \emph{(not implemented)} + \snmpadd{WR-SWITCH-MIB::wrsFaultLR} \emph{(not implemented)}\\ + \snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \item [] \underline{Note}: Unfortunately, right now it is not possible to distinguish whether the reboot was caused by the kernel panic function or the \texttt{reboot} @@ -661,7 +709,9 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsMemoryUsed}\\ \snmpadd{WR-SWITCH-MIB::wrsMemoryUsedPerc} - percentage of used memory\\ \snmpadd{WR-SWITCH-MIB::wrsMemoryFree}\\ - \snmpadd{WR-SWITCH-MIB::wrsMemoryFreeLow} - warning or error on low memory + \snmpadd{WR-SWITCH-MIB::wrsMemoryFreeLow} -- warning or error on low memory\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Disk space low} \label{fail:other:no_disk} @@ -680,6 +730,8 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsDiskUseRate.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsDiskFilesystem.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsDiskSpaceLow} - warning or error on low disk space\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}\\ \snmpadd{HOST-RESOURCES-MIB::hrStorageDescr.<n>}\\ \snmpadd{HOST-RESOURCES-MIB::hrStorageSize.<n>}\\ \snmpadd{HOST-RESOURCES-MIB::hrStorageUsed.<n>} @@ -705,7 +757,9 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg1min}\\ \snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg5min}\\ \snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg15min}\\ - \snmpadd{WR-SWITCH-MIB::wrsCpuLoadHigh} - warning or error when CPU load too high + \snmpadd{WR-SWITCH-MIB::wrsCpuLoadHigh} -- warning or
error when CPU load too high\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Temperature inside the box too high} @@ -738,7 +792,9 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsTempThresholdPLL}\\ \snmpadd{WR-SWITCH-MIB::wrsTempThresholdPSL}\\ \snmpadd{WR-SWITCH-MIB::wrsTempThresholdPSR}\\ - \snmpadd{WR-SWITCH-MIB::wrsTemperatureWarning} + \snmpadd{WR-SWITCH-MIB::wrsTemperatureWarning}\\ + \snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf Not supported SFP plugged into the cage (especially non 1-Gb SFP)} @@ -758,7 +814,9 @@ between devices connected to the ports.\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpVS.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpGbE.<n>}\\ \snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpError.<n>}\\ - \snmpadd{WR-SWITCH-MIB::wrsSFPsStatus} - status word for SFPs' status + \snmpadd{WR-SWITCH-MIB::wrsSFPsStatus} -- status word for SFPs' status\\ + \snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\ + \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} \end{pck_descr} \subsubsection{\bf File system / Memory corruption} -- GitLab