doc/wrs_failures: more cleanup and a/the fixes

6ff70c97 · Grzegorz Daniluk · c8593494 · 6ff70c97 · c8593494 · 6ff70c97
Commit 6ff70c97 authored Feb 01, 2016 by Grzegorz Daniluk
4 changed files
--- a/doc/wrs_failures/fail.tex
+++ b/doc/wrs_failures/fail.tex
 \subsection{Timing error}
-As a timing error we define WR Switch not being able to provide its slave
+\label{sec:timing_fail}
+As a timing error we define the WR Switch not being able to provide its slave
 nodes/switches with correct timing information consistent with the rest of the
 WR network.

@@ -13,15 +14,16 @@ WR network.
 			\item [] \underline{Mode}: \emph{Slave}
 			\item [] \underline{Description}:\\
 				If the \emph{PTP/PPSi} WR servo goes out of the \texttt{TRACK\_PHASE}
-				state, this means something bad has happened and switch lost the
+				state, this means something bad has happened and the switch lost the
 				synchronization to its Master.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPtpServoState.<n>} -- PTP servo state as string\\
 				\snmpadd{WR-SWITCH-MIB::wrsPtpServoStateN.<n>} -- PTP servo state as number\\
 				\snmpadd{WR-SWITCH-MIB::wrsPtpServoStateErrCnt.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPTPStatus} \\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }

 			\item [] \underline{Note}: PTP servo state is exported as a string and a number.
 		\end{pck_descr}
@@ -33,17 +35,17 @@ WR network.
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Mode}: \emph{Slave}
 			\item [] \underline{Description}:\\
-				This may happen if Master resets its WR time counters (e.g. because it
-				lost the link to its Master higher in the hierarchy or to external
-				clock), but Slave switch does not follow the jump.
+				This may happen if the Master resets its WR time counters (e.g. because
+        it lost the link to its Master higher in the hierarchy or to external
+				clock), but the Slave switch does not follow the jump.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetPs.<n>} -- value of the offset in ps\\
-				\snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetPsHR.<n>} -- 32-bit signed value of the offset in ps; with
-				saturation on overflow and underflow\\
+				\snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetPsHR.<n>} -- 32-bit signed value of the offset in ps; with saturation\\
 				\snmpadd{WR-SWITCH-MIB::wrsPtpClockOffsetErrCnt.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPTPStatus} \\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Detected jump in the RTT value calculated by \emph{PTP/PPSi}}
@@ -53,16 +55,18 @@ WR network.
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Mode}: \emph{Slave}
 			\item [] \underline{Description}:\\
-				Once WR link is established round-trip delay (RTT) can change smoothly
-				due to the temperature variations. If a sudden jump is detected, that
-				means erroneous timestamp was generated either on Master or Slave side.
+				Once a WR link is established the round-trip delay (RTT) can change
+        smoothly due to the temperature variations. However, if a sudden jump is
+        detected, that means that an erroneous timestamp was generated either on
+        the Master or the Slave side.
 				One cause of that could be the wrong value of the t24p transition point.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPtpRTT.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPtpRTTErrCnt.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Wrong $\Delta_{TXM}$, $\Delta_{RXM}$, $\Delta_{TXS}$,
@@ -75,35 +79,37 @@ WR network.
 			\item [] \underline{Description}:\\
 				If \emph{PTP/PPSi} doesn't get the correct values of fixed hardware delays,
 				it won't be able to calculate a proper Master-to-Slave delay. Although
-				the estimated offset in \emph{PTP/PPSi} is close to 0, WRS won't be
-				synchronized to Master with the sub-nanosecond accuracy.
+				the estimated offset in \emph{PTP/PPSi} is close to 0, the WRS won't be
+				synchronized to the Master with sub-nanosecond accuracy.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPtpDeltaTxM.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPtpDeltaRxM.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPtpDeltaTxS.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPtpDeltaRxS.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPTPStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf \emph{SoftPLL} became unlocked}
 		\label{fail:timing:spll_unlock}
 		\begin{pck_descr}
-			\item [] \underline{Status}: DONE
+      \item [] \underline{Status}: DONE (to be improved with holdover)
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Mode}: \emph{all}
 			\item [] \underline{Description}:\\
-				If \emph{SoftPLL} loses lock, for any reason, Slave or Grand Master
+				If the \emph{SoftPLL} loses lock, for any reason, a Slave or Grand Master
 				switch can no longer be syntonized and phase aligned with its time
 				source. WRS in Free-running mode without properly locked Helper PLL is
 				not able to perform reliable phase measurements for enhancing Rx
-				timestamps resolution. For Grand Master the reason of \emph{SoftPLL}
-				going out of lock might be disconnected 1-PPS/10MHz signals or external
-				clock down. In that case, the switch goes into Free-running mode and
-				resets WR time. Later we will have a holdover to keep the Grand Master
-				switch disciplined in case it loses external reference.
+				timestamps resolution. For a Grand Master, the reason for \emph{SoftPLL}
+				going out of lock might be disconnected 1-PPS/10MHz signals or that the
+        external clock is down. In that case, the switch goes into Free-running
+        mode and resets the WR time. Later we will have a holdover to keep the
+        Grand Master switch disciplined in case it loses external reference.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsSpllMode}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSpllSeqState}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSpllAlignState}\\
@@ -112,23 +118,24 @@ WR network.
 				\snmpadd{WR-SWITCH-MIB::wrsSpllDelCnt}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSoftPLLStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus} \\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf \emph{SoftPLL} has crashed/restarted}
 		\label{fail:timing:spll_crash}
 		\begin{pck_descr}
-			\item [] \underline{Status}: TODO \emph{(depends on SoftPLL mem read), (require changes in lm32 software)}
+			\item [] \underline{Status}: TODO \emph{(depends on SoftPLL mem read), (requires changes in lm32 software)}
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Mode}: \emph{all}
 			\item [] \underline{Description}:\\
-				If LM32 software crashes or restarts for some reason, its state may be
-				either reseted or random (if for some reason variables were overwritten
-				with junk values). In such case PLL becomes unlocked and switch is not
+				If the LM32 software crashes or restarts for some reason, its state may
+        be either reset or random (if for some reason variables were overwritten
+				with junk values). In such a case, the PLL becomes unlocked and the switch is not
 				able to provide synchronization to other devices.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsSpllIrqCnt}\\
-				\snmpadd{WR-SWITCH-MIB::wrsStartCntSPLL} \emph{(not yet implemented)}
+        \snmpadd{WR-SWITCH-MIB::wrsStartCntSPLL} \emph{(not yet implemented)} }
 			\item [] \underline{Note}: We have a similar mechanism as in the
 				\emph{wrpc-sw} to detect if the LM32 program has restarted because of
 				the CPU following a NULL pointer. However, LM32 program hangs on
@@ -145,15 +152,16 @@ WR network.
 				switch-over)
 			\item [] \underline{Mode}: \emph{Slave}
 			\item [] \underline{Description}:\\
-				In that case, WR Switch loses timing reference, resets counters
-				responsible for keeping the WR time, and starts operating in a
-				Free-Running Master mode.
+				If a Boundary Clock switch loses the link on its Slave port, the timing
+        reference is lost. The switch resets counters responsible for keeping
+        the WR time, and starts operating in a Free-Running Master mode.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Link to WR Master is up for master}
@@ -163,15 +171,16 @@ WR network.
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Mode}: \emph{Grand Master}, \emph{Free-Running Master}
 			\item [] \underline{Description}:\\
-				In that case there is probably wrong configuration. Neither the
+				In that case there is probably a wrong configuration. Neither the
 				\emph{Grand Master} nor the \emph{Free-Running Master} should be
 				connected to another WR Master.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf PTP frames don't reach ARM}
@@ -182,8 +191,8 @@ WR network.
 			\item [] \underline{Mode}: \emph{all}
 			\item [] \underline{Description}:\\
 				In this case, \emph{PTP/PPSi} will fail to stay synchronized and provide
-				synchronization. Even if WR servo is in the \texttt{TRACK\_PHASE} state,
-				it calculates new phase shift based on the Master-to-Slave delay
+				synchronization. Even if the WR servo is in the \texttt{TRACK\_PHASE}
+        state, it calculates a new phase shift based on the Master-to-Slave delay
 				variations. To calculate these variations, it still needs timestamped
 				PTP frames flowing. There could be several causes of such fault:
 				\begin{itemize}
@@ -192,13 +201,14 @@ WR network.
 					\item wrong VLANs configuration
 				\end{itemize}
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusPtpTxFrames.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusPtpRxFrames.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPTPFramesFlowing}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 			\item [] \underline{Note}: If the kernel driver crashes, there is not much
 				we can do. We end up with either our system frozen or a reboot. For
 				wrong VLAN configuration and HDL problems we can monitor if PTP frames
@@ -223,6 +233,7 @@ WR network.
 				Despite \emph{PTP/PPSi} offset being close to 0 \emph{ps}, the device won't
 				be properly synchronized.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusConfiguredMode.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpVN.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpPN.<n>}\\
@@ -232,13 +243,13 @@ WR network.
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpError.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSFPsStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 			\item [] \underline{Note}: The WRS configuration allows this check to be disabled on some ports.
 				That is because ports may be used for regular (non-WR) PTP
 				synchronization or for data transfer only (no timing). In that case any
 				Gigabit SFP can be used (also copper). Detecting if a non-Gigabit
-				Ethernet SFP is plugged into the cage is covered in a separate issue
-				\ref{fail:other:sfp}.
+				Ethernet SFP is plugged into the cage is covered in issue
+        \ref{fail:other:sfp}.
 		\end{pck_descr}

 \subsubsection{\bf \emph{PTP/PPSi} process has crashed/restarted}
@@ -252,12 +263,13 @@ WR network.
 				capabilities. Then \texttt{Monit} restarts the missing process.
 				The number of process starts is stored in a corresponding object.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsStartCntPTP}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
 				\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf \emph{HAL} process has crashed/restarted}
@@ -271,18 +283,19 @@ WR network.
 				the hardware i.e. read phase shift, get timestamps, phase shift the
 				clock etc. When \emph{HAL} crashes, \texttt{Monit} will restart it.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsStartCntHAL}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
 				\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Wrong configuration applied}
 		\label{fail:timing:wrong_config}
 		\begin{pck_descr}
-			\item [] \underline{Status}: TODO \emph{(to be done later)}
+			\item [] \underline{Status}: TODO
 			\item [] \underline{Severity}: WARNING
 			\item [] \underline{Mode}: \emph{all}
 			\item [] \underline{Description}:\\
@@ -296,9 +309,12 @@ WR network.
 				For misconfigured VLANs, we can monitor if PTP frames are flowing on
 				Slave port(s) of the switch.
 			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
-			\item [] \underline{Note}: monitor remote updates of key configuration
-				options (PTP/WR mode, fixed hardware delays)
-		\end{pck_descr}
+			\item [] \underline{Note}: When a new configuration file is fetched at
+        boot time, compare it with a previously used config (the whole file,
+        but especially timing-critical fields like PTP/WR mode, fixed hardware
+        delays). Report using the Syslog (\emph{info}/\emph{warning}) if the
+        configuration has changed.
+    \end{pck_descr}

 \subsubsection{\bf Switchover failed}
 		\begin{pck_descr}
@@ -336,10 +352,9 @@ WR network.

 \newpage
 \subsection{Data error}
-As a data error we define WR Switch not being able to forward Ethernet traffic
-between devices connected to the ports.\\
-
-\noindent This section contains the list of faults leading to a data error.
+When the WR switch is not able to forward Ethernet traffic between devices
+connected to the ports, we consider this a data error. This section contains the
+list of faults leading to a data error.

 \subsubsection{\bf Link down}
 		\label{fail:data:link_down}
@@ -350,21 +365,22 @@ between devices connected to the ports.\\
 			\item [] \underline{Description}:\\
 				This obviously stops the flow of frames on an Ethernet port and there is
 				not much we can do besides reporting an error. Topology redundancy is a
-				cure for that (if backup link is fine, and reconfiguration does not
+				cure for that (if a backup link is fine, and reconfiguration does not
 				fail). There might be several causes of a link down:
 				\begin{itemize}
 					\item unplugged fiber
 					\item broken fiber
 					\item broken SFP
-					\item wrong(non-complementary) pair of WDM SPFs used
+					\item wrong (non-complementary) pair of WDM SFPs used
 				\end{itemize}
 				However, we are not able to distinguish between them inside the switch.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{IF-MIB::ifOperStatus.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusLink.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSlaveLinksStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTimingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Fault in the Endpoint's transmission/reception path}
@@ -377,6 +393,7 @@ between devices connected to the ports.\\
 				underrun in the Tx PCS or FIFO overrun in the Rx PCS, receiving invalid
 				\emph{8b10b} code, CRC error etc.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsTXUnderrun.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXOverrun.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXInvalidCode.<n>}\\
@@ -386,7 +403,7 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXCRCErrors.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsEndpointStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Problem with the SwCore or Endpoint HDL module}
@@ -399,16 +416,17 @@ between devices connected to the ports.\\
 				If the SwCore is hanging, then the Ethernet forwarding is not
 				performed on one or multiple ports. We have a HDL watchdog module which
 				constantly monitors if the SwCore is not stuck. If such a situation is
-				detected the whole SwCore is reset, all the frames enqueued in the
-				Endpoints are acknowledged and lost. After this the switch can continue
+				detected the whole SwCore is reset, all the frames queued in the
+        Endpoints are acknowledged and lost. After this the switch can continue
 				its operation and the watchdog triggers counter is incremented.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsGwWatchdogTimeouts}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsTXFrames.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsForwarded.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 			\item [] \underline{Note}: For Endpoint monitoring we could compare
 				per-port \emph{RTUfwd} counter with the \emph{Tx} Endpoint counter for
 				each port. \emph{RTUfwd} counts all forwarding decisions from RTU to the
@@ -427,10 +445,11 @@ between devices connected to the ports.\\
 				and generate new responses. In such case frames are dropped in the
 				Rx path of the Endpoint.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXDropRTUFull.<n>} \\
 				\snmpadd{WR-SWITCH-MIB::wrsRTUStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Too much HP traffic / Per-priority queue full}
@@ -445,9 +464,9 @@ between devices connected to the ports.\\
 				queue may become full and we start losing HP frames, which is
 				unacceptable.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsFastMatchPriority.<n>} -- HP frames on a port\\
-				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXFrames.<n>} -- Total number of Rx frames on
-				the port\\
+				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXFrames.<n>} -- Total number of Rx frames on the port\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio0.<n>} -- Rx priority 0\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio1.<n>} -- Rx priority 1\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio2.<n>} -- Rx priority 2\\
@@ -458,7 +477,7 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsPstatsRXPrio7.<n>} -- Rx priority 7\\
 				\snmpadd{WR-SWITCH-MIB::wrsSwcoreStatus}\\
 				\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 			\item [] \underline{Note}: we need to get from SwCore the information
 				about per-priority queue utilization, or at least an event when it's
 				full.
@@ -478,18 +497,19 @@ between devices connected to the ports.\\
 				broadcast to all ports (within a VLAN). When \emph{RTUd} crashes,
 				\texttt{Monit} will restart it.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsStartCntRTUd}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
 				\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Network loop - two or more identical MACs on two or more ports}
 		\label{fail:data:net_loop}
 		\begin{pck_descr}
-			\item [] \underline{Status}: TODO \emph{(to be done later)}
+			\item [] \underline{Status}: TODO
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Description}:\\
 				In such case we have a ping-pong situation. If two ports receive frames
@@ -517,10 +537,10 @@ between devices connected to the ports.\\
 			\item [] \underline{Status}: for later
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Description}: \emph{(not yet implemented)}\\
-				Topology redundancy let's us prevent from losing data when the primary
+				Topology redundancy lets us avoid losing data when the primary
 				uplink is down for some reason. However, if a backup link is also down
-				or reconfiguration to backup link fails, we start losing data and an
-				alarm should be raised.
+				or if the reconfiguration to the backup link fails, we start losing data and
+        an alarm should be raised.
 			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
 			\item [] \underline{Note}: One thing we need to report is a backup link(s)
 				going down, but we should also think about how to determine if there is
@@ -554,6 +574,7 @@ between devices connected to the ports.\\
 					\item status of starting userspace daemons
 				\end{itemize}
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsRestartReason}\\
 				\snmpadd{WR-SWITCH-MIB::wrsRestartReasonMonit}\\
 				\snmpadd{WR-SWITCH-MIB::wrsConfigSource}\\
@@ -565,7 +586,7 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 			\item [] \underline{Note}: 
 				The idea is to reboot the system if it was not able to boot correctly.
 				Then we use the scratchpad registers of the processor to keep
@@ -581,12 +602,13 @@ between devices connected to the ports.\\
 			\item [] \underline{Status}: DONE
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Description}:\\
-				Dot-config file used to configure the switch can be stored locally or
-				retrieved from a central server. Additionally URL to the remote dot-config
-				can be retrieved via DHCP request. When dot-config is fetch from the server
-				it has to be verified before being applied. If downloading or verification has
-				failed an alarm is raised.
+				A dot-config file used to configure the switch can be stored locally or
+				retrieved from a central server. Additionally, a URL to the remote
+        dot-config can be retrieved via DHCP request. When the dot-config is
+        fetched from the server it has to be verified before being applied. If
+        downloading or verification has failed, an alarm is raised.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsConfigSource} -- source of a dot-config,
 					local, remote or get URL to the dot-config via DHCP. When
 					\texttt{wrsConfigSource} is set to the \texttt{tryDhcp}, then failure of
@@ -598,7 +620,7 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing
 					whether switch booted correctly\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Any userspace daemon has crashed/restarted}
@@ -612,6 +634,7 @@ between devices connected to the ports.\\
 				corresponding start counter. If a process is restarted 5 times within
 				100 seconds, then the entire switch is restarted.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{HOST-RESOURCES-MIB::hrSWRunName.<n>} -- list of processes in standard MIB\\
 				\snmpadd{WR-SWITCH-MIB::wrsStartCntHAL}\\
 				\snmpadd{WR-SWITCH-MIB::wrsStartCntPTP}\\
@@ -625,12 +648,15 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootUserspaceDaemonsMissing} -- number of missing processes\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful} -- status word informing whether switch booted correctly\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 			\item [] \underline{Note}: We shall distinguish between crucial
 				processes - error should be reported if one of them crashes; and less
 				important processes (warning should be reported if they crash). If any
 				of the processes has crashed, we need to restart it and increment a
-				per-process counter reported through the SNMP.
+				per-process counter reported through the SNMP. Dot-config should also
+        let us define which processes are not that important and the switch
+        should not restart even if such a process fails to start (e.g.
+        \emph{lighttpd}).

 				Crucial processes (Error report if any of them crashes):
 				\begin{itemize}
@@ -671,16 +697,17 @@ between devices connected to the ports.\\

 \subsubsection{\bf Kernel crash}
 		\begin{pck_descr}
-			\item [] \underline{Status}: DONE
+      \item [] \underline{Status}: TODO (preserving stats of IP/LR registers)
 			\item [] \underline{Severity}: ERROR
 			\item [] \underline{Description}:\\
-				If the Linux kernel has crashed, system reboots. Until the next boot we
-				have no synchronization, no SNMP to report the status, FPGA may be still
-				forwarding Ethernet traffic, but based on dynamic and static routing
-				rules from before the crash. Based on the SNMP objects below it is
-				possible to figure out that reboot took place and what was the reason of
-				the last reboot.
+				If the Linux kernel has crashed, the system reboots. Until the next boot
+        we have no synchronization, no SNMP to report the status, and the FPGA
+        may be still forwarding Ethernet traffic, but based on dynamic and
+        static routing rules from before the crash. Based on the SNMP objects
+        below it is possible to figure out that a reboot took place and what
+        the reason for the last reboot was.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsBootCnt}\\
 				\snmpadd{WR-SWITCH-MIB::wrsRebootCnt}\\
 				\snmpadd{WR-SWITCH-MIB::wrsRestartReason}\\
@@ -688,7 +715,7 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsFaultLR} \emph{(not implemented)}\\
 				\snmpadd{WR-SWITCH-MIB::wrsBootSuccessful}\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 			\item [] \underline{Note}:
 				Unfortunately, right now it is not possible to distinguish whether the
 				reboot was caused by the kernel panic function or the \texttt{reboot}
@@ -705,13 +732,14 @@ between devices connected to the ports.\\
 				raise an alarm if it's extremely low (but still enough to keep the
 				system running).
 			\item [] \underline{SNMP objects}:\\
-				\snmpadd{WR-SWITCH-MIB::wrsMemoryTotal}\\
+        {\footnotesize
+        \snmpadd{WR-SWITCH-MIB::wrsMemoryTotal}\\
 				\snmpadd{WR-SWITCH-MIB::wrsMemoryUsed}\\
 				\snmpadd{WR-SWITCH-MIB::wrsMemoryUsedPerc} -- percentage of used memory\\
 				\snmpadd{WR-SWITCH-MIB::wrsMemoryFree}\\
 				\snmpadd{WR-SWITCH-MIB::wrsMemoryFreeLow} -- warning or error on low memory\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}
 \subsubsection{\bf Disk space low}
 		\label{fail:other:no_disk}
@@ -723,6 +751,7 @@ between devices connected to the ports.\\
 				and raise an alarm if it's extremely low (but still enough to keep the
 				system running).
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsDiskMountPath.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsDiskSize.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsDiskUsed.<n>}\\
@@ -734,7 +763,7 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}\\
 				\snmpadd{HOST-RESOURCES-MIB::hrStorageDescr.<n>}\\
 				\snmpadd{HOST-RESOURCES-MIB::hrStorageSize.<n>}\\
-				\snmpadd{HOST-RESOURCES-MIB::hrStorageUsed.<n>}
+        \snmpadd{HOST-RESOURCES-MIB::hrStorageUsed.<n>} }
 			\item [] \underline{Note}:
 				Objects like \texttt{HOST-RESOURCES-MIB::hrStorage*.<n>} are available
 				via standard MIB. The same functionality is implemented in
@@ -748,18 +777,19 @@ between devices connected to the ports.\\
 			\item [] \underline{Status}: DONE
 			\item [] \underline{Severity}: WARNING
 			\item [] \underline{Description}:\\
-				On a healthy switch the average CPU load should be below \emph{0.1}.
+        On a healthy switch the average CPU load should be below \emph{0.1} (10\%).
 				Some actions like SNMP queries or web interface activity may increase
 				the average system load. The system load averages for the past 1, 5 and
 				15 minutes are exported via SNMP objects. Additionally
 				\texttt{wrsCpuLoadHigh} alerts when the load is too high.
 			\item [] \underline{SNMP objects}:\\
-				\snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg1min}\\
+        {\footnotesize
+        \snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg1min}\\
 				\snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg5min}\\
 				\snmpadd{WR-SWITCH-MIB::wrsCPULoadAvg15min}\\
 				\snmpadd{WR-SWITCH-MIB::wrsCpuLoadHigh} -- warning or error when CPU load too high\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Temperature inside the box too high}
@@ -781,9 +811,11 @@ between devices connected to the ports.\\
 				\end{itemize}
 				\texttt{wrsTemperatureWarning} is raised when the temperature read from
 				any of these sensors exceeds a threshold configured in the
-				\emph{dot-config}. When at least one threshold temperature is not set
-				\texttt{wrsTemperatureWarning} is set to \emph{Threshold-not-set}.
+        \emph{dot-config} (80 degrees by default). When at least one threshold
+        temperature is not set \texttt{wrsTemperatureWarning} is set to
+        \emph{Threshold-not-set}.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsTempFPGA}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTempPLL}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTempPSL}\\
@@ -794,7 +826,7 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsTempThresholdPSR}\\
 				\snmpadd{WR-SWITCH-MIB::wrsTemperatureWarning}\\
 				\snmpadd{WR-SWITCH-MIB::wrsOSStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
 		\end{pck_descr}

 \subsubsection{\bf Not supported SFP plugged into the cage (especially non 1-Gb SFP)}
@@ -803,12 +835,15 @@ between devices connected to the ports.\\
 			\item [] \underline{Status}: DONE
 			\item [] \underline{Severity}: WARNING
 			\item [] \underline{Description}:\\
-				If a not supported Gigabit optical SFP is plugged into the cage, then
-				it's a timing issue \ref{fail:timing:wrong_sfp}. However, if a non 1-Gb
+        If an unsupported Gigabit optical SFP (or an SFP that could not be
+        matched with the \texttt{CONFIG\_SFP<XX>\_PARAMS} entries in the
+        configuration file) is plugged into the cage, then it's a timing issue
+        \ref{fail:timing:wrong_sfp}. However, if a non 1-Gb
 				SFP is used, then no Ethernet traffic would be flowing on that port.
 				It's due to the fact that we don't have 10/100Mbit Ethernet implemented
 				inside the WRS.
 			\item [] \underline{SNMP objects}:\\
+        {\footnotesize
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpVN.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpPN.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpVS.<n>}\\
@@ -816,26 +851,86 @@ between devices connected to the ports.\\
 				\snmpadd{WR-SWITCH-MIB::wrsPortStatusSfpError.<n>}\\
 				\snmpadd{WR-SWITCH-MIB::wrsSFPsStatus} -- status word for SFPs' status\\
 				\snmpadd{WR-SWITCH-MIB::wrsNetworkingStatus}\\
-				\snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus}
+        \snmpadd{WR-SWITCH-MIB::wrsMainSystemStatus} }
+		\end{pck_descr}
+
+\subsubsection{\bf IP address on the management port has changed}
+		\begin{pck_descr}
+			\item [] \underline{Status}: TODO
+			\item [] \underline{Severity}: WARNING
+			\item [] \underline{Description}:\\
+        The change of an IP address on the management port might be a normal
+        situation or a result of an accidental modification of a DHCP server or
+        the WR Switch configuration. Notifying about such a situation is not
+        done through SNMP, since the IP address of a switch has to be known to
+        the SNMP manager prior to querying the switch. Therefore, the switch only
+        generates a Syslog warning message if setting a new IP address is
+        detected.
+      \item [] \underline{SNMP objects}: \emph{(none)}, Syslog message is
+        generated
+		\end{pck_descr}
+
+\subsubsection{\bf Multiple unauthorized access attempts}
+		\begin{pck_descr}
+			\item [] \underline{Status}: TODO
+			\item [] \underline{Severity}: WARNING
+			\item [] \underline{Description}:\\
+				Many attempts to gain root access through ssh (or the web
+        interface) might mean that somebody is trying to do something nasty. Every
+        unsuccessful attempt to log in is reported as a Syslog warning message.
+			\item [] \underline{SNMP objects}: \emph{(none)}, Syslog message is
+        generated
+		\end{pck_descr}
+
+\subsubsection{\bf Network reconfiguration (RSTP)}
+		\label{fail:other:rstp}
+		\begin{pck_descr}
+			\item [] \underline{Status}: for later
+			\item [] \underline{Severity}: WARNING
+			\item [] \underline{Description}: \emph{(not yet implemented)}\\
+				If topology reconfiguration occurs because of the primary link failure,
+				this fact should be reported through SNMP as a warning. It's not a
+				critical situation; the WR network still works. However, further
+				investigation should be performed to repair the broken link.
+			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
 		\end{pck_descr}

+\subsubsection{\bf Backup link down}
+		\begin{pck_descr}
+			\item [] \underline{Status}: for later
+			\item [] \underline{Severity}: WARNING
+			\item [] \underline{Description}: \emph{(not yet implemented)}\\
+				This is related to the issue \ref{fail:other:rstp}. If the WRS uses the
+				primary uplink, but the backup one fails, it's not a critical fault. WR
+				Network still works, but the link should be diagnosed and repaired to
+				have the backup link operational in case the primary one fails.
+			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
+		\end{pck_descr}
+
+\newpage
+\subsection{Undetectable errors}
+
+Besides the various errors already listed in previous sections, there are some
+situations when reporting a problem to the SNMP manager or Syslog server is not
+possible. This section lists some of them and proposes alternative ways of
+diagnostics.
+
 \subsubsection{\bf File system / Memory corruption}
 		\label{fail:other:memory}
 		\begin{pck_descr}
 			\item [] \underline{Description}:\\
+        Memory or file system corruption can produce unpredictable results. It
+        may cause a failure of any of the processes running on the switch.
 			\item [] \underline{SNMP objects}: \emph{(none)}
-			\item [] \underline{Note}: how shall we detect this? Based on the
-				\emph{dmesg} errors reported by UBI and system in general?  This is bad,
-				crazy things may happen, we can't do much about it.
 		\end{pck_descr}

 \subsubsection{\bf Kernel freeze}
 		\begin{pck_descr}
 			\item [] \underline{Description}:\\
-				If kernel freezes we can do nothing. It can freeze e.g. due to some
-				infinite loop in the irq handler. It's like with the power failure,
-				somebody has to go to the place where WRS is installed and
-				investigate/restart the device.
+				If the Linux kernel freezes there is nothing that can be done. It can
+        freeze e.g. due to some infinite loop in the irq handler. It is similar
+        to a power failure: somebody has to go to the place where the WRS is
+        installed and investigate/restart the device.
 			\item [] \underline{SNMP objects}: \emph{(none)}
 			\item [] \underline{Note}:
 				If we have a watchdog in our CPU it should be used.
@@ -845,9 +940,8 @@ between devices connected to the ports.\\
 		\begin{pck_descr}
 			\item [] \underline{Description}:\\
 				Power failure may be either a WRS problem (i.e. broken power supply
-				inside the switch) or an external problem (i.e. providing voltage to the
-				device). There is not much reporting we can do in such case. It's up to
-				the Network Management Station to raise an alarm if the SNMP Agent does
+				inside the switch) or an external voltage problem. It's up to the
+        Network Management Station to raise an alarm if the SNMP Agent does
 				not respond to the SNMP requests.
 			\item [] \underline{SNMP objects}: \emph{(none)}
 		\end{pck_descr}
@@ -855,11 +949,14 @@ between devices connected to the ports.\\
 \subsubsection{\bf Hardware problem}
 		\begin{pck_descr}
 			\item [] \underline{Description}:\\
-				If any crucial hardware part breaks we'll most probably notice it as one
-				(or multiple) timing / data errors described previously. Besides that,
-				we don't have any self-diagnostics on-board. Few examples:
+				If any crucial hardware part breaks, it will most probably be noticed
+        as one (or multiple) timing / data errors described in the previous
+        sections. Besides that, there are no self-diagnostics built into the
+        switch hardware boards. A few examples of hardware failures and the
+        problems they may cause:
 				\begin{itemize}
-					\item DAC / VCO -- problems with synchronization
+          \item DAC / VCO -- problems with synchronization (failures in
+            \ref{sec:timing_fail})
 					\item cooling fans -- rise of the temperature inside the WRS box
 						(failure \ref{fail:other:temp})
 					\item power supply, ARM, FPGA -- booting problem (failure
@@ -882,67 +979,15 @@ between devices connected to the ports.\\
 \subsubsection{\bf No static IP on the management port \& failed to DHCP}
 		\begin{pck_descr}
 			\item [] \underline{Description}:\\
-				From operator's point of view it is similar to the issue
+				From the operator's point of view it is similar to the issue
 				\ref{fail:other:management_link}. WRS is not accessible through the
 				management port, so its status cannot be reported. This should be
 				detected and reported by the NMS if it does not receive SNMP and ICMP
-				responses from the WRS. In such case WR expert should make a physical
-				connection to the management USB port of the WRS to diagnose the
-				problem.
+				responses from the WRS. In such a case the configuration of the switch and
+        management network should be verified.
 			\item [] \underline{SNMP objects}: \emph{(none)}
 		\end{pck_descr}

-\subsubsection{\bf IP address on the management port has changed}
-		\begin{pck_descr}
-			\item [] \underline{Status}: TODO
-			\item [] \underline{Severity}: WARNING
-			\item [] \underline{Description}:\\
-				I'm not yet sure how we should report this. Probably SNMP is not the
-				best choice because if the IP changes we're no longer able to poll SNMP
-				objects (until IP is updated also in the Network Management Station). We
-				should either generate SNMP trap to NMS or send Syslog message to a
-				central server.
-			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
-		\end{pck_descr}
-
-\subsubsection{\bf Multiple unauthorized access attempts}
-		\begin{pck_descr}
-			\item [] \underline{Status}: for later
-			\item [] \underline{Severity}: WARNING
-			\item [] \underline{Description}:\\
-				If we observe many attempts to gain a root access through the ssh (or
-				the web interface) this might be somebody trying to do something nasty.
-				We should report such situation as a Warning.
-			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
-			\item [] \underline{Note}: Bad password event is reported by Syslog as a
-				warning. We should probably use this information to add an SNMP object.
-		\end{pck_descr}
-
-\subsubsection{\bf Network reconfiguration (RSTP)}
-		\label{fail:other:rstp}
-		\begin{pck_descr}
-			\item [] \underline{Status}: for later
-			\item [] \underline{Severity}: WARNING
-			\item [] \underline{Description}: \emph{(not yet implemented)}\\
-				If topology reconfiguration occurs because of the primary link failure,
-				this fact should be reported through SNMP as a warning. It's not
-				critical situation, WR network still works. However, further
-				investigation should be performed to repair the broken link.
-			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
-		\end{pck_descr}
-
-\subsubsection{\bf Backup link down}
-		\begin{pck_descr}
-			\item [] \underline{Status}: for later
-			\item [] \underline{Severity}: WARNING
-			\item [] \underline{Description}: \emph{(not yet implemented)}\\
-				This is related to the issue \ref{fail:other:rstp}. If the WRS uses
-				primary uplink, but the backup one fails, it's not a critical fault. WR
-				Network still works, but the link should be diagnosed and repaired to
-				have the backup link operational in case the primary one fails.
-			\item [] \underline{SNMP objects}: \emph{(not yet implemented)}
-		\end{pck_descr}
-
 %\subsection{Switch out of sync to Master}
 %
 %\subsection{Switch made a big offset jump to follow Master}

--- a/doc/wrs_failures/procedures.tex
+++ b/doc/wrs_failures/procedures.tex
-\section{Repair procedures}
-
-General rules:
-\begin{itemize}
-  \item Linux inside the WR Switch enumerates WR interfaces starting from 0.
-    This means we have to use internally port indexes 0..17. However, the
-    port numbers printed on the front panel are 1..18. Syslog messages
-    generated from the switch use the Linux port numbering. The consequence is
-    that every time Syslog says there is a problem on port X, this refers to
-    port index X+1 on the front panel of the switch.
-  \item If a procedure given for a specific SNMP object does not solve the
-    problem. Please contact WR experts to perform more in-depth analysis of your
-    network. For this, you should provide a complete dump of the WRS status
-    generated in the first step of each procedure.
-  \item If a solving procedure requires restarting or replacing a broken WR
-    Switch, please make sure that all other WR devices connected to the affected
-    switch are synchronized and do not report any problems.
-  \item If procedure requires replacing switch with a new unit, the broken one
-    should be handled to WR experts to investigate the problem.
-\end{itemize}
-
-\begin{itemize}
-  \item \texttt{wrsBootSuccessful}
-    \begin{enumerate}
-      \item Dump state
-      \item Check \texttt{WR-SWITCH-MIB::wrsBootConfigStatus}, if it reports an
-        error, please verify your WRS configuration.
-      \item Restart the switch
-      \item Please consult WR experts if the problem persists.
-    \end{enumerate}
-
-  \item \texttt{wrsTemperatureWarning}
-    \begin{enumerate}
-      \item Dump state
-      \item Verify if cooling of the rack where WR Switch is installed works
-        properly.
-      \item Verify if both cooling fans in the back of the WR Switch case are
-        working.
-      \item Replace the switch with a new unit and consult the WR Switch
-        manufacturer for a repair.
-    \end{enumerate}
-
-  \item \texttt{wrsMemoryFreeLow}
-    \begin{enumerate}
-      \item Dump state
-      \item Restart the switch
-      \item Send the dumped state of the switch to WR experts for analysis as
-        this might mean there is some internal problem in the WRS firmware.
-    \end{enumerate}
-
-  \item \texttt{wrsCpuLoadHigh}
-    \begin{enumerate}
-      \item Dump state
-      \item Restart the switch
-      \item Send the dumped state of the switch to WR experts for analysis as
-        this might mean there is some internal problem in the WRS firmware.
-    \end{enumerate}
-
-  \item \texttt{wrsDiskSpaceLow}
-    \begin{enumerate}
-      \item Dump state
-      \item Check the values of \emph{CONFIG\_WRS\_LOG\_*} configuration options
-        on the switch. These are the parameters describing where log messages
-        should be sent from various processes in the switch. Normally users
-        don't need to modify them, but if any of them is set to a file in the
-        WRS filesystem (e.g. /tmp/snmp.log) this may reduce the free space after
-        some time of operation.
-      \item Restart the switch
-      \item Send the dumped state of the switch to WR experts for analysis as
-        this might mean there is some internal problem in the WRS firmware.
-    \end{enumerate}
-
-\end{itemize}
-
-\begin{itemize}
-  \item \texttt{wrsPTPStatus}
-    \begin{enumerate}
-      \item Dump state
-      \item Check \texttt{wrsSoftPLLStatus} on the Master (WR device one step
-        higher in a timing hierarchy). Eventually proceed to investigate the
-        problem on the Master switch. Otherwise, continue with the primary WRS.
-      \item Verify if the link to WR Master was not lost by checking the object
-        \texttt{wrsSlaveLinksStatus}.
-      \item If this is not the case, restart the switch.
-      \item If the problem persists replace the switch with a new unit (see
-        \ref{cern:wrs_replacement}).
-    \end{enumerate}
-
-  \item \texttt{wrsSoftPLLStatus}\\
-    For GrandMaster WRS:
-    \begin{enumerate}
-      \item Dump state
-      \item Check 1-PPS and 10 MHz signals coming from an external source.
-        Verify if they are properly connected and, in case of GPS receiver,
-        check if it is synchronized and locked.
-      \item Restart the GrandMaster switch.
-      \item If the problem persists, replace the switch with a new unit (see
-        \ref{cern:wrs_replacement}).
-    \end{enumerate}
-
-    For Boundary Clock WRS:
-    \begin{enumerate}
-      \item Dump state
-      \item Check \texttt{wrsSoftPLLStatus} on the Master. Eventually proceed to
-        investigate the problem on the Master switch.
-      \item Verify if the link to WR Master was not lost by checking the object
-        \texttt{wrsSlaveLinksStatus}.
-      \item Restart the switch.
-      \item If the problem persists, replace the switch with a new unit (see
-        \ref{cern:wrs_replacement}).
-    \end{enumerate}
-
-  \item \texttt{wrsSlaveLinksStatus}\\
-    For Master/GrandMaster WRS:
-    \begin{enumerate}
-      \item Check the configuration of the switch. Especially if the
-        \emph{Timing Mode} is correctly set (i.e. if it was not accidentally set
-        to \emph{Boundary Clock}).
-      \item Check the role of each port timing configuration. They should be all
-        set to \emph{master}. If any of them is set to \emph{slave} you should
-        verify if there is no WR Master connected to it.
-    \end{enumerate}
-
-    For Boundary Clock WRS:
-    \begin{enumerate}
-      \item Check the fiber connection on the slave port of the WRS.
-      \item Check the configuration of the switch. Especially if the
-        \emph{Timing Mode} is correctly set (i.e. if it was not accidentally set
-        to \emph{Grand-Master} or \emph{Free-Running Master}).
-      \item Check the status of the WR Master connected to the slave port of the
-        WRS.
-      \item Replace the faulty switch with a new unit, if this does not solve
-        the problem, make sure your fiber link is not broken.
-    \end{enumerate}
-
-  \item \texttt{wrsPTPFramesFlowing}
-    % non-WR device connected, but port not set to non-WR mode
-    % device on the other side has some problem
-    % HDL / kernel crash or another problem on WRS
-    \begin{enumerate}
-      \item Check Syslog message to determine the WR port on which the
-        problem is reported. You should see a message similar to this one:\\
-        \texttt{SNMP: wrsPTPFramesFlowing failed for port 1}
-      \item Check your network layout and the WR Switch configuration. If you
-        have some non-WR devices connected to ports of the WR Switch (e.g.
-        computer sending/receiving only data, without the need of
-        synchronization), these ports should have their role in the timing
-        configuration set to \emph{non-wr}.
-      \item Check the status of a WR device connected to the reported port.
-      \item Restart the switch.
-      \item If the problem persists, please contact WR experts for in-depth
-        investigation.
-    \end{enumerate}
-\end{itemize}
-
-\begin{itemize}
-  \item \texttt{wrsSFPsStatus}
-    \begin{enumerate}
-      \item Check Syslog messages to determine the WR port on which the problem
-        is reported. You should see a message similar to this one:\\
-        \texttt{Unknown SFP vn="AVAGO" pn="ABCU-5710RZ" vs="AN1151PD8A" on port
-        wr1}
-      \item If the reported port is intended to be used to connect a device that
-        does not require WR synchronization (e.g. using a copper SFP module),
-        then you should verify whether the role in the timing configuration for
-        this port is set to \emph{non-wr}.
-      \item Otherwise, you should use a WR-supported SFP module and make sure it
-        is declared together with calibration values in the WRS configuration.
-    \end{enumerate}
-
-  \item \texttt{wrsEndpointStatus}
-    % link problem (e.g. broken SFP, fiber)
-    % gateware problem
-    \begin{enumerate}
-      \item Make several state dumps.
-      \item Restart the switch.
-      \item Check Syslog messages to determine the WR port on which the problem
-        is reported. You should see a message similar to this one:\\
-        \texttt{SNMP: wrsEndpointStatus failed for port 1}
-      \item Check the fiber link on a reported port, i.e. try replacing SFP
-        transceivers on both sides of the link, try using another fiber.
-      \item If the problem persists, please contact WR experts for in-depth
-        investigation.
-    \end{enumerate}
-
-  \item \texttt{wrsSwcoreStatus}
-    \begin{enumerate}
-      \item Dump state.
-      \item Restart the switch.
-      \item Please contact WR experts since this might mean that either there is
-        too much high priority traffic in your network, or there is some
-        internal problem in the WRS firmware.
-    \end{enumerate}
-
-  \item \texttt{wrsRTUStatus}
-    \begin{enumerate}
-      \item Dump state
-      \item Restart the switch.
-      \item If possible, try reducing the load of small Ethernet frames flowing
-        through your switch. If possible in your application, try using larger
-        Ethernet frames with lower load to transfer information.
-    \end{enumerate}
-\end{itemize}
-
-\subsection{Replacing WR Switch with a new unit}
-\label{cern:wrs_replacement}
-This just a reference holder to point to the CERN wikis with the description of
-updating MAC in network database so that the same configuration is used.
--- a/doc/wrs_failures/snmp_exports.tex
+++ b/doc/wrs_failures/snmp_exports.tex
@@ -30,21 +30,21 @@ are some common remarks that apply to all situations:
    that every time Syslog says there is a problem on port X, this refers to
    port index X+1 on the front panel of the switch.
  \item If a procedure given for a specific SNMP object does not solve the
-    problem. Please contact WR experts to perform more in-depth analysis of your
-    network. For this, you should provide a complete dump of the WRS status
+    problem, please contact WR experts to perform a more in-depth analysis of
+    the network. For this, you should provide a complete dump of the WRS status
    generated in the first step of each procedure.
-  \item First action in most of the procedures below named \emph{Dump state}
+  \item The first action in most of the procedures below named \emph{Dump state}
    requires simply calling a tool provided by WR developers that reads all the
    detailed information from the switch and writes it to a single file that can
    be later analyzed by the experts.\\
    {\bf TODO: point to the tool once it's done}
-  \item If solving procedure requires restarting or replacing a broken WR
-    Switch, please make sure that after the repair, all other WR devices
+  \item If a problem solving procedure requires restarting or replacing a broken
+    WR Switch, please make sure that after the repair, all other WR devices
    connected to the affected switch are synchronized and do not report any
    problems.
-  \item If a procedure requires replacing switch with a new unit, the broken one
-    should be handled to WR experts or the switch manufacturer to investigate
-    the problem.
+  \item If a procedure requires replacing a switch with a new unit, the broken
+    one should be handed over to WR experts or the switch manufacturer to
+    investigate the problem.
 \end{itemize}

 \subsection{General status objects for operators}
@@ -52,7 +52,7 @@ are some common remarks that apply to all situations:
 This section describes the general status MIB objects that represent the overall
 status of a device and its subsystems. They are organized in a tree structure
 (fig.\ref{fig:snmp_oper}) where each object reports a problem based on the
-status of its child objects. SNMP object in the third layer of this tree are
+status of its child objects. SNMP objects in the third layer of this tree are
 calculated based on the SNMP expert objects. Most of the status objects
 described in this section can have one of the following values:
 \begin{figure}[ht]
@@ -69,12 +69,12 @@ described in this section can have one of the following values:
  \item \texttt{Warning} -- objects used to calculate this value are outside the
    proper values, but the problem is not critical enough to report \texttt{Error}.
  \item \texttt{WarningNA} -- at least one of the objects used to calculate the
-    status has a value \texttt{NA} or \texttt{WarningNA}.
+    status has a value \texttt{NA} (or \texttt{WarningNA}).
  \item \texttt{Error} -- error in values used to calculate the particular
    object.
  \item \texttt{FirstRead} -- the value of the object cannot be calculated
    because at least one condition uses deltas between the current and previous
-    value. This value should appear only at first SNMP read. Threated as a
+    value. This value should appear only at first SNMP read. To be treated as a
    correct value.
  \item \texttt{Bug} -- Something wrong has happened while calculating the
    object. If you see this please report to WR developers.

--- a/doc/wrs_failures/snmp_objects.tex
+++ b/doc/wrs_failures/snmp_objects.tex
@@ -8,24 +8,32 @@
  subsystems.}

  \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsMainSystemStatus}{
+    \underline{Description:}
    WRS general status of a switch can be \texttt{OK}, \texttt{Warning} or
    \texttt{Error}. In case of an error or warning, please check the values of
    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsOSStatus}},
    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsTimingStatus}} and
    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsNetworkingStatus}} to find out which
-    subsystem causes the problem.}
+    subsystem causes the problem.
+    \glspar \underline{Related problems:}}
  \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsOSStatus}{
+    \underline{Description:}
    Collective status of the operating system running on WR switch. In case of
    an error or warning, please check status objects in the
-    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsOSStatusGroup}}.}
+    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsOSStatusGroup}}.
+    \glspar \underline{Related problems:}}
  \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsTimingStatus}{
+    \underline{Description:}
    Collective status of the synchronization subsystem. In case of an
    error or warning, please check status objects in the
-    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsTimingStatusGroup}}.}
+    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsTimingStatusGroup}}.
+    \glspar \underline{Related problems:}}
  \snmpentrys{WR-SWITCH-MIB}{wrsGeneralStatusGroup}{wrsNetworkingStatus}{
+    \underline{Description:}
    Collective status of the Ethernet switching subsystem. In case of an error
    or warning, please check status objects in the
-    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsNetworkingStatusGroup}}.}
+    \texttt{\glshyperlink{WR-SWITCH-MIB::wrsNetworkingStatusGroup}}.
+    \glspar \underline{Related problems:}}

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \snmpentrys{WR-SWITCH-MIB}{}{wrsDetailedStatusesGroup}{
@@ -174,7 +182,7 @@
    \begin{pck_proc}
      \item Dump state
      \item Check 1-PPS and 10 MHz signals coming from an external source.
-        Verify if they are properly connected and, in case of GPS receiver,
+        Verify if they are properly connected and, in case of a GPS receiver,
        check if it is synchronized and locked.
      \item Restart the GrandMaster switch.
      \item If the problem persists, replace the switch with a new unit.
@@ -388,7 +396,7 @@

  \snmpentrys{WR-SWITCH-MIB}{wrsVersionGroup}{wrsVersionLastUpdateDate}{
    \underline{Description:}
-    Date and time of the last firmware update, this information may not be
+    Date and time of the last firmware update. This information may not be
    accurate, due to hard restarts or lack of the proper time during the
    upgrade.}