Commit 9b4d19eb authored by Adam Wujek, committed by Grzegorz Daniluk

doc/wrs_failures: add statuses' objects to problems

Signed-off-by: Adam Wujek <adam.wujek@cern.ch>
parent 4bde9828
......@@ -19,7 +19,10 @@ WR network.
\snmpadd{WR-SWITCH-MIB::wrsPtpServoState.<n>} -- PTP servo state as string\\
\snmpadd{WR-SWITCH-MIB::wrsPtpServoStateN.<n>} -- PTP servo state as number\\
\snmpadd{WR-SWITCH-MIB::wrsPtpServoStateErrCnt.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus} \\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}: PTP servo state is exported as a string and a number.
\end{pck_descr}
......@@ -38,7 +41,9 @@ WR network.
\snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetPsHR.<n>} -- 32-bit signed value of the offset in ps; with
saturation on overflow and underflow\\
\snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetErrCnt.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus} \\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Detected jump in the RTT value calculated by \emph{PTP/PPSi}}
......@@ -55,7 +60,9 @@ WR network.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsPtpRTT.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPtpRTTErrCnt.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Wrong $\Delta_{TXM}$, $\Delta_{RXM}$, $\Delta_{TXS}$,
......@@ -75,7 +82,9 @@ WR network.
\snmpadd{WR-SWITCH-MIB::wrsPtpDeltaRxM.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPtpDeltaTxS.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPtpDeltaRxS.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}
\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf \emph{SoftPLL} became unlocked}
......@@ -101,7 +110,9 @@ WR network.
\snmpadd{WR-SWITCH-MIB::wrsSpllHlock}\\
\snmpadd{WR-SWITCH-MIB::wrsSpllMlock}\\
\snmpadd{WR-SWITCH-MIB::wrsSpllDelCnt}\\
\snmpadd{WR-SWITCH-MIB::wrsSoftPLLStatus}
\snmpadd{WR-SWITCH-MIB::wrsSoftPLLStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf \emph{SoftPLL} has crashed/restarted}
......@@ -140,7 +151,9 @@ WR network.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}
\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Link to WR Master is up for master}
......@@ -156,7 +169,9 @@ WR network.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}
\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf PTP frames don't reach ARM}
......@@ -181,7 +196,9 @@ WR network.
\snmpadd{WR-SWITCH-MIB::wrsPortStatusPtpRxFrames.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPTPFramesFlowing}
\snmpadd{WR-SWITCH-MIB::wrsPTPFramesFlowing}\\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}: If the kernel driver crashes, there is not much
we can do. We end up with either our system frozen or a reboot. For
wrong VLAN configuration and HDL problems we can monitor if PTP frames
......@@ -213,7 +230,9 @@ WR network.
\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpInDB.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpGbE.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpError.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsSFPsStatus}
\snmpadd{WR-SWITCH-MIB::wrsSFPsStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}: The WRS configuration allows this check to be disabled on some ports.
That is because ports may be used for regular (non-WR) PTP
synchronization or for data transfer only (no timing). In that case any
......@@ -235,7 +254,10 @@ WR network.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsStartCntPTP}\\
\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}
\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf \emph{HAL} process has crashed/restarted}
......@@ -251,7 +273,10 @@ WR network.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsStartCntHAL}\\
\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}
\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Wrong configuration applied}
......@@ -336,7 +361,10 @@ between devices connected to the ports.\\
However, we are not able to distinguish between them inside the switch.
\item [] \underline{SNMP objects}:\\
\snmpadd{IF-MIB::ifOperStatus.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}
\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Fault in the Endpoint's transmission/reception path}
......@@ -356,7 +384,9 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPfilterDropped.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPCSErrors.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXCRCErrors.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsEndpointStatus}
\snmpadd{WR-SWITCH-MIB::wrsEndpointStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Problem with the SwCore or Endpoint HDL module}
......@@ -376,7 +406,9 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsGwWatchdogTimeouts}\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsTXFrames.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsForwarded.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}
\snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}: For Endpoint monitoring we could compare
per-port \emph{RTUfwd} counter with the \emph{Tx} Endpoint counter for
each port. \emph{RTUfwd} counts all forwarding decisions from RTU to the
......@@ -396,7 +428,9 @@ between devices connected to the ports.\\
Rx path of the Endpoint.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXDropRTUFull.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsRTUStatus}
\snmpadd{WR-SWITCH-MIB::wrsRTUStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Too much HP traffic / Per-priority queue full}
......@@ -412,17 +446,19 @@ between devices connected to the ports.\\
unacceptable.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsFastMatchPriority.<n>} -- HP frames on a port\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXFrames.<n>} - Total number of Rx frames on
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXFrames.<n>} -- Total number of Rx frames on
the port\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio0.<n>} - Rx priorities 0-7\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio1.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio2.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio3.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio4.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio5.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio6.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio7.<n>} \\
\snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio0.<n>} -- Rx priority 0\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio1.<n>} -- Rx priority 1\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio2.<n>} -- Rx priority 2\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio3.<n>} -- Rx priority 3\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio4.<n>} -- Rx priority 4\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio5.<n>} -- Rx priority 5\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio6.<n>} -- Rx priority 6\\
\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio7.<n>} -- Rx priority 7\\
\snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}: We need to obtain from SwCore the information
about per-priority queue utilization, or at least an event when a queue is
full.
......@@ -444,7 +480,10 @@ between devices connected to the ports.\\
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsStartCntRTUd}\\
\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}
\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Network loop - two or more identical MACs on two or more ports}
......@@ -515,7 +554,6 @@ between devices connected to the ports.\\
\item status of starting userspace daemons
\end{itemize}
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\
\snmpadd{WR-SWITCH-MIB::wrsRestartReason}\\
\snmpadd{WR-SWITCH-MIB::wrsRestartReasonMonit}\\
\snmpadd{WR-SWITCH-MIB::wrsConfigSource}\\
......@@ -524,7 +562,10 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsBootLoadFPGA}\\
\snmpadd{WR-SWITCH-MIB::wrsBootLoadLM32}\\
\snmpadd{WR-SWITCH-MIB::wrsBootKernelModulesMissing}\\
\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}
\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}:
The idea is to reboot the system if it was not able to boot correctly.
Then we use the scratchpad registers of the processor to keep
......@@ -546,8 +587,6 @@ between devices connected to the ports.\\
it has to be verified before being applied. If downloading or verification has
failed an alarm is raised.
\item [] \underline{SNMP objects}:\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing
whether switch booted correctly\\
\snmpadd{WR-SWITCH-MIB::wrsConfigSource} -- source of the dot-config:
local, remote, or a URL to the dot-config obtained via DHCP. When
\texttt{wrsConfigSource} is set to \texttt{tryDhcp}, then failure of
......@@ -555,7 +594,11 @@ between devices connected to the ports.\\
\texttt{wrsBootSuccessful}\\
\snmpadd{WR-SWITCH-MIB::wrsConfigSourceUrl} -- path to the dot-config
on a server (if not local)\\
\snmpadd{WR-SWITCH-MIB::wrsBootConfigStatus} -- result of the dot-config verification
\snmpadd{WR-SWITCH-MIB::wrsBootConfigStatus} -- result of the dot-config verification\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing
whether switch booted correctly\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Any userspace daemon has crashed/restarted}
......@@ -580,7 +623,9 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsStartCntWrsWatchdog}\\
\snmpadd{WR-SWITCH-MIB::wrsStartCntSPLL} \emph{(not implemented)}\\
\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing} - number of missing processes\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} - status word informing whether switch booted correctly
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} - status word informing whether switch booted correctly\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}: We shall distinguish between crucial
processes - error should be reported if one of them crashes; and less
important processes (warning should be reported if they crash). If any
......@@ -640,7 +685,10 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsRebootCnt}\\
\snmpadd{WR-SWITCH-MIB::wrsRestartReason}\\
\snmpadd{WR-SWITCH-MIB::wrsFaultIP} \emph{(not implemented)}\\
\snmpadd{WR-SWITCH-MIB::wrsFaultLR} \emph{(not implemented)}
\snmpadd{WR-SWITCH-MIB::wrsFaultLR} \emph{(not implemented)}\\
\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\item [] \underline{Note}:
Unfortunately, right now it is not possible to distinguish whether the
reboot was caused by the kernel panic function or the \texttt{reboot}
......@@ -661,7 +709,9 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsMemoryUsed}\\
\snmpadd{WR-SWITCH-MIB::wrsMemoryUsedPerc} - percentage of used memory\\
\snmpadd{WR-SWITCH-MIB::wrsMemoryFree}\\
\snmpadd{WR-SWITCH-MIB::wrsMemoryFreeLow} - warning or error on low memory
\snmpadd{WR-SWITCH-MIB::wrsMemoryFreeLow} - warning or error on low memory\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Disk space low}
\label{fail:other:no_disk}
......@@ -680,6 +730,8 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsDiskUseRate.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsDiskFilesystem.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsDiskSpaceLow} - warning or error on low disk space\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}\\
\snmpadd{HOST-RESOURCES-MIB::hrStorageDescr.<n>}\\
\snmpadd{HOST-RESOURCES-MIB::hrStorageSize.<n>}\\
\snmpadd{HOST-RESOURCES-MIB::hrStorageUsed.<n>}
......@@ -705,7 +757,9 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg1min}\\
\snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg5min}\\
\snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg15min}\\
\snmpadd{WR-SWITCH-MIB::wrsCpuLoadHigh} - warning or error when CPU load too high
\snmpadd{WR-SWITCH-MIB::wrsCpuLoadHigh} - warning or error when CPU load too high\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Temperature inside the box too high}
......@@ -738,7 +792,9 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsTempThresholdPLL}\\
\snmpadd{WR-SWITCH-MIB::wrsTempThresholdPSL}\\
\snmpadd{WR-SWITCH-MIB::wrsTempThresholdPSR}\\
\snmpadd{WR-SWITCH-MIB::wrsTemperatureWarning}
\snmpadd{WR-SWITCH-MIB::wrsTemperatureWarning}\\
\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf Unsupported SFP plugged into the cage (especially non-1-Gb SFP)}
......@@ -758,7 +814,9 @@ between devices connected to the ports.\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpVS.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpGbE.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpError.<n>}\\
\snmpadd{WR-SWITCH-MIB::wrsSFPsStatus} - status word for SFPs' status
\snmpadd{WR-SWITCH-MIB::wrsSFPsStatus} - status word for SFPs' status\\
\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
\end{pck_descr}
\subsubsection{\bf File system / Memory corruption}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment